# Project

In [None]:
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.formula.api import ols
from statsmodels.graphics.regressionplots import plot_regress_exog

In [None]:
# Location of the raw review file (read below as one JSON record per line).
path = 'D:/OneDrive - University of Warwick/Dissertation/review-Alaska_10.json'

# Load every review into a single DataFrame.
data_raw = pd.read_json(
    path,
    encoding='utf-8',
    lines=True,
)

The output of `data_raw.info()` below indicates the presence of NA values in the data. However, as missing values are one of the aspects we intend to investigate, we will perform data cleaning selectively at a later stage.

In [None]:
# Column names, non-null counts and dtypes of the raw data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
data_raw.info()

In [None]:
# Preview the first five reviews.
print(data_raw.head(n=5))

## Data Dictionary

- index: The index of the data.
- user_id: The ID of the reviewer.
- name: The name of the reviewer.
- time: The time of the review in Unix time format.
- rating: The rating given by the reviewer for the business.
- text: The text of the review.
- pics: Pictures associated with the review.
- resp: The business response to the review, including Unix time and the text of the response.
- gmap_id: The ID of the business.

Our research topic is the influence of time of day on online ratings across different devices, so we select only the variables relevant to it: "time", "rating", and "pics". We include "pics" because the dataset contains no field that directly records the type of device used to write a review. We therefore need to make a crucial assumption:
**we assume that comments with pictures are uploaded using mobile devices, while comments without pictures are uploaded using non-mobile devices.**

## Data Preprocessing



In this section, we will perform data preprocessing, which includes data cleaning and data transformation. Data cleaning involves handling missing values, outliers, and inconsistencies in the dataset. Data transformation may involve converting the "pics" data into device type data, etc. These steps allow us to make use of the available information and derive meaningful insights from the dataset. 

In [None]:
# Derive the three analysis variables (time, rating, device) from the raw data.
# NOTE(review): the hour is taken from the epoch timestamp with no time-zone
# conversion, so it is presumably UTC rather than local Alaska time — confirm
# that this is the intended "time of day".
data_modified = (
    data_raw
    .assign(
        # time: hour of day bucketed into the half-open intervals
        # [0, 8), [8, 16), [16, 24), coded 1, 2, 3 for
        # ['Morning', 'Noon', 'Evening'].
        time=lambda df: pd.cut(
            pd.to_datetime(df['time'], unit='ms').dt.hour,
            bins=[0, 8, 16, 24],
            labels=[1, 2, 3],
            right=False,
        ),
        # device: 1 ('Mobile devices') when the review has pictures,
        # 0 ('Non-mobile device') when 'pics' is missing.
        device=lambda df: df['pics'].notna().astype(int),
    )
    .filter(items=['time', 'rating', 'device'])
)

print(data_modified.head(10))


In [None]:
# Bar charts showing how the observations are distributed over each variable.
# The three charts differ only in the column plotted and its display label, so
# they are drawn in one loop instead of three copy-pasted sections.
for column, label in (('time', 'Time'), ('rating', 'Rating'), ('device', 'Device')):
    counts = data_modified[column].value_counts().sort_index()
    total = counts.sum()
    # Small vertical gap so the percentage text sits just above each bar.
    text_offset = 0.01 * counts.max()

    counts.plot(kind='bar', alpha=0.7)
    plt.xlabel(label)
    plt.ylabel('Count')
    plt.title(f'Distribution of Data by {label}')
    plt.xticks(rotation=0)  # keep the category labels horizontal

    # Annotate every bar with its share of all observations.
    for i, v in enumerate(counts):
        plt.text(i, v + text_offset, f'{v / total * 100:.1f}%', ha='center')

    plt.show()

In [None]:
# Number of missing values in each of the derived columns.
print(data_modified.isna().sum())

### Chi-square

In [None]:
# Chi-square test of independence between 'time' and 'device',
# computed on their contingency table.
contingency_time_device = pd.crosstab(data_modified['time'], data_modified['device'])
(
    chi2_time_device,
    p_time_device,
    dof_time_device,
    expected_time_device,
) = stats.chi2_contingency(contingency_time_device)

print("Chi-square test result for 'time' and 'device':")
print("Chi-square statistic:", chi2_time_device)
print("P-value:", p_time_device)

In [None]:
# Chi-square test of independence between 'rating' and 'device',
# computed on their contingency table.
contingency_rating_device = pd.crosstab(data_modified['rating'], data_modified['device'])
(
    chi2_rating_device,
    p_rating_device,
    dof_rating_device,
    expected_rating_device,
) = stats.chi2_contingency(contingency_rating_device)

print("\nChi-square test result for 'rating' and 'device':")
print("Chi-square statistic:", chi2_rating_device)
print("P-value:", p_rating_device)

In [None]:
# Chi-square test of independence between 'rating' and 'time',
# computed on their contingency table.
contingency_rating_time = pd.crosstab(data_modified['rating'], data_modified['time'])
(
    chi2_rating_time,
    p_rating_time,
    dof_rating_time,
    expected_rating_time,
) = stats.chi2_contingency(contingency_rating_time)

print("\nChi-square test result for 'rating' and 'time':")
print("Chi-square statistic:", chi2_rating_time)
print("P-value:", p_rating_time)

The results of the chi-square tests indicate a statistically significant association between each pair of the three variables.

### Correlation Matrix

In [None]:
# 'time' is a categorical column produced by pd.cut; convert it to plain
# integers so that it can take part in the correlation computation.
data_modified = data_modified.astype({'time': int})

# Pairwise correlation coefficients between time, rating and device.
corr_matrix = data_modified.corr()
print(corr_matrix)


In [None]:
# Annotated heatmap of the correlation matrix.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=heatmap_ax)
plt.show()


### Multicollinearity test

In [None]:
# Variance inflation factor of every column with respect to the other two.
# NOTE(review): no intercept column is added to the design matrix and the
# target 'rating' is included alongside the predictors; VIFs computed this way
# may be inflated compared with the usual definition — confirm this is intended.
X = data_modified[['time', 'rating', 'device']]
design_matrix = X.values
vif_data = pd.DataFrame({
    'Features': X.columns,
    'VIF': [
        variance_inflation_factor(design_matrix, column_index)
        for column_index in range(len(X.columns))
    ],
})
print("\nMulticollinearity test:")
print(vif_data)

The Variance Inflation Factors (VIF) for the 'time', 'rating', and 'device' variables are 4.27, 4.32, and 1.05 respectively. Typically, a VIF value greater than 5 or 10 indicates high multicollinearity. Here, none of the VIF values exceed these thresholds, suggesting that multicollinearity is not a significant concern in this dataset.

Summary: In the data processing phase of this project, we first cleaned and transformed the data appropriately. We then performed exploratory data analysis to visualize the distribution of the data across times of day, ratings, and devices. Next, we conducted chi-square tests, which revealed significant associations between 'time', 'device', and 'rating'. The correlation matrix then suggested that these associations are not linear. A Variance Inflation Factor (VIF) test indicated that multicollinearity was not a significant concern. This prepared the data for the subsequent modeling phase.

## Modeling

In [None]:
# as.factor()?

### Train and Test Sets

In [None]:
# Features are every column except the target; the target is 'rating'.
# (The data has not been balanced yet at this point.)
y = data_modified['rating']
X = data_modified.drop(columns='rating')

# Hold out 20% of the observations for testing; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2221877,
)


### Data Balancing

According to the rating distribution, there is a significant imbalance between the different ratings, and stratified sampling alone may not be able to address this issue. Therefore, we balance the training set with SMOTEENN, which combines over-sampling of the minority classes (SMOTE) with under-sampling by Edited Nearest Neighbours (ENN).

In [None]:
# Balance the rating classes with SMOTEENN.
# Only the training split is resampled; X_test / y_test are left untouched.
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Class counts after resampling.
print(pd.Series(y_resampled).value_counts())


In [None]:
# from imblearn.combine import SMOTETomek

# # Separate features (X) and target variable (y)
# X = data_modified.drop('rating', axis=1)
# y = data_modified['rating']

# # Apply SMOTETomek
# smote_tomek = SMOTETomek(random_state=42)
# X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# # Create a new balanced dataframe
# balanced_data = pd.DataFrame(X_resampled, columns=X.columns)
# balanced_data['rating'] = y_resampled

# # Check the balanced distribution of ratings
# print(balanced_data['rating'].value_counts())


In [None]:
# Report the feature and target shapes of the resampled training set and of
# the untouched test set.
for title, features, target in (
    ("Training set shape:", X_resampled, y_resampled),
    ("Testing set shape:", X_test, y_test),
):
    print(title, features.shape, target.shape)

### Multiple Linear Regression

In [None]:
# Ordinary least squares on the resampled training data, scored on the test set.
model = LinearRegression()
result = model.fit(X_resampled, y_resampled)
predictions = model.predict(X_test)

# Error metrics of the predicted ratings against the true test ratings.
for metric_label, metric in (
    ('mean_squared_error : ', mean_squared_error),
    ('mean_absolute_error : ', mean_absolute_error),
):
    print(metric_label, metric(y_test, predictions))

In [None]:
# Merge predictions and y_test into a single data frame
plot_regression = pd.DataFrame({'predictions': predictions, 'actual_rating': y_test})

# Create a scatter plot with linear regression line
sns.lmplot(x="predictions", y="actual_rating", data=plot_regression, scatter_kws={"alpha": 0.5})
plt.xlabel('Predicted Rating')
plt.ylabel('Actual Rating')
plt.title('Predicted vs Actual Ratings')
plt.show()

The plot shows no clear linear relationship between the predicted and actual values, which is consistent with the earlier correlation coefficients: there is no significant linear relationship between the variables.

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Create the model
model = LogisticRegression()

# Train the model on the resampled training data
model.fit(X_resampled, y_resampled)

# Predict on the test set
predictions = model.predict(X_test)

# Evaluate the model
print('Accuracy score: ', accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

# Visualise the confusion matrix.
# The classes are passed explicitly and reused as tick labels so that the axes
# show the actual rating values instead of the array positions 0..n-1.
class_labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=class_labels)
sns.heatmap(cm, annot=True, fmt=".0f",
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Multinomial logistic regression, requested explicitly through
# multi_class='multinomial' together with the lbfgs solver.
# NOTE(review): depending on the installed scikit-learn version this may be
# what LogisticRegression() already does by default for a multi-class target,
# and `multi_class` may be deprecated — confirm against the library docs.
model = LogisticRegression(solver='lbfgs', multi_class='multinomial')

# Train on the resampled training data, then predict on the test set
model.fit(X_resampled, y_resampled)
predictions = model.predict(X_test)

# Evaluate the model
print('Accuracy score: ', accuracy_score(y_test, predictions))


### Polynomial Regression

In [None]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.pipeline import make_pipeline

# # Fit the polynomial regression model
# degree=2
# polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())
# polyreg.fit(X_train, y_train)
# predictions = polyreg.predict(X_test)

# # Model evaluation
# print('mean_squared_error : ', mean_squared_error(y_test, predictions))
# print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))


### Decision Tree

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Create the model; the seed makes the fitted tree reproducible, in line with
# the seeded split and resampling steps.
model = DecisionTreeClassifier(random_state=42)

# Train the model on the resampled training data
model.fit(X_resampled, y_resampled)

# Predict on the test set
predictions = model.predict(X_test)

# Evaluate the model
print('Accuracy score: ', accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

# Visualise the confusion matrix.
# The classes are passed explicitly and reused as tick labels so that the axes
# show the actual rating values instead of the array positions 0..n-1.
class_labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=class_labels)
sns.heatmap(cm, annot=True, fmt=".0f",
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create the model; the seed makes the fitted forest reproducible, in line
# with the seeded split and resampling steps.
model = RandomForestClassifier(random_state=42)

# Train the model on the resampled training data
model.fit(X_resampled, y_resampled)

# Predict on the test set
predictions = model.predict(X_test)

# Evaluate the model
print('Accuracy score: ', accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

# Visualise the confusion matrix.
# The classes are passed explicitly and reused as tick labels so that the axes
# show the actual rating values instead of the array positions 0..n-1.
class_labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=class_labels)
sns.heatmap(cm, annot=True, fmt=".0f",
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


### Two-way ANOVA

In [None]:
# # Separate features (X) and target variable (y) in the balanced dataset
# X = balanced_data.drop('rating', axis=1)
# y = balanced_data['rating']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2221877)

# Combine the features and target variable into a single DataFrame
data_train = X_resampled.copy()
data_train['rating'] = y_resampled

# Define the formula
formula = 'rating ~ C(time) + C(pics)'

# Fit the model on the training set
model = ols(formula, data=data_train).fit()

# Perform ANOVA analysis on the training set
anova_results = sm.stats.anova_lm(model)
print(anova_results)

In [None]:
# Create the residual plot
fig = plt.figure(figsize=(12, 8))
plot_regress_exog(model, 'C(pics)[T.1]', fig=fig)

# Show the plot
plt.show()

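Although the p-values for both factors appear very favourable (very small), the residual degrees of freedom are very large. With a sample this large, even negligible effects can be statistically significant, so the p-values alone should be interpreted with caution.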

### Linear Probability Model (OLS)

In [None]:
# lpm_mod = sm.OLS(y_train, X_train)
# lpm_res = lpm_mod.fit()
# print("Parameters: ", lpm_res.params[:-1])

### Logit Model

In [None]:
# # Mapping score values from 1-5 to a range between 0 and 1
# y_train_mapped = np.interp(y_train, (1, 5), (0, 1))

# # Model fitting using the mapped y_train_mapped
# logit_mod = sm.Logit(y_train_mapped, X_train)
# logit_res = logit_mod.fit(disp=0)
# print(logit_res.summary())

In [None]:
# logit_res.pred_table()

#### Marginal Effects

In [None]:
# margeff = logit_res.get_margeff()
# print(margeff.summary())


In [None]:
# print(logit_res.summary())


### Multinomial Logit

In [None]:
# mlogit_mod = sm.MNLogit(y_train, X_train)
# mlogit_res = mlogit_mod.fit()
# print(mlogit_res.params)


### Negative Binomial

In [None]:
# mod_nbin = sm.NegativeBinomial(y_train, X_train)
# res_nbin = mod_nbin.fit(disp=False)
# print(res_nbin.summary())


### Alternative solvers

In [None]:
# mlogit_res = mlogit_mod.fit(method="bfgs", maxiter=250)
# print(mlogit_res.summary())


### Model Comparisons
