# Regression Model

In this notebook we will build a simple multiple regression model, which will be used as a baseline for comparison with the XGBoost model.

In [46]:
# Importing Packages 
import os
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [58]:
# Importing Data with Features
# All feature CSVs are read from a single directory. It defaults to the path
# used when this notebook was written; on another machine, set the
# ONLINE_REVIEW_FEATURES_DIR environment variable before starting the kernel
# instead of editing every read_csv call.
FEATURES_DIR = Path(os.environ.get(
    'ONLINE_REVIEW_FEATURES_DIR',
    '/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features',
))

# One frame for the hedonic products and one per utilitarian product
data_hedonic = pd.read_csv(FEATURES_DIR / 'Features_hedonic_total.csv')
data_utilitarian_mouse = pd.read_csv(FEATURES_DIR / 'Features_utilitarian_mouse.csv')
data_utilitarian_filter = pd.read_csv(FEATURES_DIR / 'Features_utilitarian_filter.csv')
data_utilitarian_razor = pd.read_csv(FEATURES_DIR / 'Features_utilitarian_razor.csv')

In [59]:
# Distinct parent ASINs found in the hedonic data, in order of first appearance
unique_parent_asin = pd.unique(data_hedonic['parent_asin'])
print(unique_parent_asin)

['B09GWLJPTH' 'B08JHZHWZ3' 'B07D13QGXM' 'B00WXP607C' 'B00SNM5US4'
 'B00LMIVLXY' 'B0C5WPD2RN' 'B0BLGN9N39']


In [60]:
# Distinct values of the product_type column in the hedonic data
# (the variable is called unique_title, but it holds product types)
unique_title = pd.unique(data_hedonic['product_type'])
print(unique_title)

['Face Oil' 'Mario Game' 'Minecraft Game' 'Men Cologne' 'Hair Treatment'
 'Heat Protector' 'Facial Spray' 'Cologne Unisex']


In [62]:
# Keep only the reviews that received at least one helpful vote (helpful_vote > 0).
# The mouse subset is stored under a new name (suffix _reg); the razor, filter and
# hedonic frames are overwritten with their filtered versions.
data_utilitarian_mouse_reg = data_utilitarian_mouse.loc[data_utilitarian_mouse['helpful_vote'].gt(0)]
data_utilitarian_razor = data_utilitarian_razor.loc[data_utilitarian_razor['helpful_vote'].gt(0)]
data_utilitarian_filter = data_utilitarian_filter.loc[data_utilitarian_filter['helpful_vote'].gt(0)]


# Hedonic data: restrict to a single product type first ...
# NOTE(review): the value selected here is 'Hair Treatment', while the t-test cell
# labels this subset 'Hedonic Face Spray' - confirm which product is intended.
data_hedonic = data_hedonic.loc[data_hedonic['product_type'].eq('Hair Treatment')]

# ... then apply the same helpful-vote filter as above
data_hedonic = data_hedonic.loc[data_hedonic['helpful_vote'].gt(0)]


In [63]:
## Predictor columns for the regression cells (order determines coefficient order)
# NOTE(review): in the recorded regression outputs, 'rating' and 'review_extremity'
# receive identical coefficients in every model, which suggests the two columns
# hold the same values in this data - verify before interpreting either one.
features = [
    # rating / product level
    'rating',
    'rating_number',
    'sentiment',
    'price',
    # text statistics of the review
    'noun_count',
    'adj_count',
    'adv_count',
    'word_count',
    'sentence_count',
    'avg_words_per_sentence',
    'title_length',
    # review metadata
    'review_extremity',
    'elapsed_time_days',
    'image',
    # timestamp components
    'year',
    'month',
    'day',
    'hour',
]

# Column the models are trained to predict
target = 'helpful_ratio'

In [64]:
# Baseline regression for the utilitarian mouse subset
# (fits on data_utilitarian_mouse_reg, the mouse reviews with helpful votes)

# Predictors and target, followed by an 80/20 hold-out split with a fixed seed
X = data_utilitarian_mouse_reg.loc[:, features]
y = data_utilitarian_mouse_reg.loc[:, target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training portion (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Error on the held-out portion
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# One row per predictor with its fitted coefficient
coefficients = pd.DataFrame({'Feature': features}).assign(Coefficient=model.coef_)
print(coefficients)

Mean Squared Error: 0.00021081970431262445
                   Feature   Coefficient
0                   rating  1.195187e-03
1            rating_number  4.140939e-15
2                sentiment -9.238922e-04
3                    price  6.967896e-16
4               noun_count  2.957091e-04
5                adj_count -2.342702e-04
6                adv_count  4.173713e-04
7               word_count -5.575308e-05
8           sentence_count -2.846632e-04
9   avg_words_per_sentence  4.213562e-05
10            title_length -1.052677e-04
11        review_extremity  1.195187e-03
12       elapsed_time_days -1.253961e-04
13                   image -1.698220e-03
14                    year -4.851871e-02
15                   month -3.619094e-03
16                     day -3.328178e-04
17                    hour -1.587325e-04


In [65]:
# Baseline regression for the utilitarian filter subset

# Predictors and target, followed by an 80/20 hold-out split with a fixed seed
X = data_utilitarian_filter.loc[:, features]
y = data_utilitarian_filter.loc[:, target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training portion (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Error on the held-out portion
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# One row per predictor with its fitted coefficient
coefficients = pd.DataFrame({'Feature': features}).assign(Coefficient=model.coef_)
print(coefficients)

Mean Squared Error: 8.750155867271995e-06
                   Feature   Coefficient
0                   rating  2.696997e-05
1            rating_number -3.750709e-14
2                sentiment -4.755345e-05
3                    price  7.311494e-15
4               noun_count -5.033102e-05
5                adj_count -2.859932e-04
6                adv_count  1.181081e-04
7               word_count  3.610864e-05
8           sentence_count  2.803110e-04
9   avg_words_per_sentence -7.908242e-06
10            title_length -8.029188e-05
11        review_extremity  2.696997e-05
12       elapsed_time_days -8.110801e-04
13                   image  9.354406e-03
14                    year -2.966822e-01
15                   month -2.501303e-02
16                     day -8.521153e-04
17                    hour -5.727190e-05


In [66]:
# Baseline regression for the utilitarian razor subset

# Predictors and target, followed by an 80/20 hold-out split with a fixed seed
X = data_utilitarian_razor.loc[:, features]
y = data_utilitarian_razor.loc[:, target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training portion (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Error on the held-out portion
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# One row per predictor with its fitted coefficient
coefficients = pd.DataFrame({'Feature': features}).assign(Coefficient=model.coef_)
print(coefficients)

Mean Squared Error: 2.2055041344609002e-05
                   Feature   Coefficient
0                   rating  2.962152e-04
1            rating_number  9.651582e-16
2                sentiment -5.905886e-04
3                    price -1.887975e-15
4               noun_count -2.471319e-05
5                adj_count  1.379121e-04
6                adv_count -2.706500e-04
7               word_count  1.248499e-04
8           sentence_count -1.154017e-03
9   avg_words_per_sentence -3.069972e-04
10            title_length  1.057619e-04
11        review_extremity  2.962152e-04
12       elapsed_time_days -8.326489e-05
13                   image  1.226834e-03
14                    year -3.050268e-02
15                   month -2.475384e-03
16                     day -1.277459e-04
17                    hour  3.461597e-06


In [69]:
# Baseline regression for the hedonic subset (data_hedonic)
# NOTE(review): this subset is referred to as "face spray", but the filter cell
# selects product_type == 'Hair Treatment' - confirm which product is meant.

# Predictors and target, followed by an 80/20 hold-out split with a fixed seed
X = data_hedonic.loc[:, features]
y = data_hedonic.loc[:, target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training portion (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Error on the held-out portion
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# One row per predictor with its fitted coefficient
coefficients = pd.DataFrame({'Feature': features}).assign(Coefficient=model.coef_)
print(coefficients)

Mean Squared Error: 1.311383933471001e-06
                   Feature   Coefficient
0                   rating -8.970135e-05
1            rating_number -6.277103e-16
2                sentiment  2.346536e-04
3                    price -3.737313e-16
4               noun_count -4.792102e-05
5                adj_count -1.247098e-04
6                adv_count -1.382085e-04
7               word_count  4.116803e-05
8           sentence_count -1.201284e-04
9   avg_words_per_sentence  1.596825e-05
10            title_length -7.397399e-06
11        review_extremity -8.970135e-05
12       elapsed_time_days -2.842775e-05
13                   image  2.879392e-04
14                    year -1.046224e-02
15                   month -9.118812e-04
16                     day -2.340776e-05
17                    hour -1.696953e-05


In [71]:
import scipy.stats as stats

# Features whose distributions are compared between the product subsets.
# NOTE: this rebinds the notebook-level `features` list that the regression cells
# use; re-running those cells after this one will pick up this shorter list.
features = [
    'sentiment',
    'noun_count', 'adj_count', 'adv_count', 'word_count',
    'sentence_count', 'avg_words_per_sentence', 'title_length',
    'review_extremity', 'elapsed_time_days', 'image',
]

# Subsets under comparison and, position by position, the labels used in the report
datasets = [data_hedonic, data_utilitarian_razor, data_utilitarian_filter, data_utilitarian_mouse_reg]
product_labels = ['Hedonic Face Spray', 'Utilitarian Razor', 'Utilitarian Filter', 'Utilitarian Mouse']

# Pair every frame with its label once, so the loops below need no index lookups
labelled_frames = list(zip(product_labels, datasets))

# For each feature, run an independent two-sample t-test on every unordered pair
# of subsets and print only the pairs that are significant at alpha = 0.05.
# NOTE(review): ttest_ind is called with its defaults and no multiple-comparison
# correction is applied across the pairs/features - confirm this is intended.
for feature in features:
    print(f"Feature: {feature}")
    for position, (label_a, frame_a) in enumerate(labelled_frames):
        # Only pair with later subsets so each pair is tested once
        for label_b, frame_b in labelled_frames[position + 1:]:
            t_stat, p_value = stats.ttest_ind(frame_a[feature], frame_b[feature])
            # Written as "not (p < 0.05)" so a NaN p-value is skipped as well
            if not (p_value < 0.05):
                continue
            print(f"Comparison between {label_a} and {label_b}:")
            print(f"T-statistic: {t_stat}, p-value: {p_value}")
    print()


Feature: sentiment
Comparison between Hedonic Face Spray and Utilitarian Razor:
T-statistic: -5.477963428857548, p-value: 4.656947581362445e-08
Comparison between Hedonic Face Spray and Utilitarian Filter:
T-statistic: 10.043905487983672, p-value: 3.3233554984361667e-23
Comparison between Hedonic Face Spray and Utilitarian Mouse:
T-statistic: 8.4436888913292, p-value: 6.088281487491637e-17
Comparison between Utilitarian Razor and Utilitarian Filter:
T-statistic: 15.725342533072913, p-value: 4.469490547213463e-53
Comparison between Utilitarian Razor and Utilitarian Mouse:
T-statistic: 13.5492517376282, p-value: 3.2411468436272706e-40

Feature: noun_count
Comparison between Hedonic Face Spray and Utilitarian Mouse:
T-statistic: -9.787266491502047, p-value: 4.301189130830791e-22
Comparison between Utilitarian Razor and Utilitarian Mouse:
T-statistic: -9.486391328321925, p-value: 6.025868125505914e-21
Comparison between Utilitarian Filter and Utilitarian Mouse:
T-statistic: -8.909798350494