# Regression Model

In this notebook we will build a simple multiple regression model which will be used as a baseline model for the comparision with the xGBoost Model

In [1]:
# Importing Packages 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Importing Data with Features 
data_hedonic_parfum = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/ModelPreperation/Features_hedonic_parfum.csv')
data_utilitarian_razor = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/ModelPreperation/Features_utilitarian_razor.csv')
data_utilitarian_filter = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/ModelPreperation/Features_utilitarian_filter.csv')

In [3]:
# First we will simply sort all reviews that have at least one helpful Vote
#data_hedonic_parfum = data_hedonic_parfum[data_hedonic_parfum['helpful_vote'] > 0]
#data_utilitarian_razor = data_utilitarian_razor[data_utilitarian_razor['helpful_vote'] > 0]
#data_utilitarian_filter = data_utilitarian_filter[data_utilitarian_filter['helpful_vote'] > 0]

In [4]:
## Summarizing all Features in a List

features = ['rating','rating_number', 'sentiment', 'price', 'noun_count', 'adj_count', 'adv_count', 'word_count', 
                  'sentence_count', 'avg_words_per_sentence', 'title_length', 'review_extremity', 
                  'elapsed_time_days', 'image', 'year','month','day','hour']

target = 'helpful_ratio'

In [5]:
# We will start with the Utilitarian Razor Category

# Split the data into training and testing sets
X = data_utilitarian_razor[features]
y = data_utilitarian_razor[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Interpret the results
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
print(coefficients)

Mean Squared Error: 2.0865791081627675e-05
                   Feature   Coefficient
0                   rating  1.832309e-05
1                sentiment  2.504786e-05
2                    price  9.487870e-16
3               noun_count -3.941867e-06
4                adj_count -1.470029e-04
5                adv_count  2.266194e-05
6               word_count  2.592918e-05
7           sentence_count -4.604953e-06
8   avg_words_per_sentence -2.041186e-05
9             title_length  8.592254e-06
10        review_extremity  1.832309e-05
11       elapsed_time_days -1.066895e-04
12                   image  5.455795e-03
13                    year -3.902495e-02
14                   month -3.274770e-03
15                     day -1.123654e-04
16                    hour -1.267124e-05


In [6]:
# We will go on with the Utilitarian Filter Category

# Split the data into training and testing sets
X = data_utilitarian_filter[features]
y = data_utilitarian_filter[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Interpret the results
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
print(coefficients)

Mean Squared Error: 2.8650821016175295e-06
                   Feature   Coefficient
0                   rating  2.090168e-05
1                sentiment -6.824327e-05
2                    price -2.545927e-17
3               noun_count  5.744303e-05
4                adj_count  1.880875e-05
5                adv_count -1.307756e-04
6               word_count  6.551585e-05
7           sentence_count -6.405104e-04
8   avg_words_per_sentence -1.293302e-04
9             title_length  1.208062e-05
10        review_extremity  2.090168e-05
11       elapsed_time_days -2.707159e-05
12                   image  9.675419e-06
13                    year -9.898457e-03
14                   month -8.187834e-04
15                     day -3.636861e-05
16                    hour -1.694119e-06


In [9]:
# Moving on with hedonic parfum

# Split the data into training and testing sets
X = data_hedonic_parfum[features]
y = data_hedonic_parfum[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Interpret the results
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
print(coefficients)

Mean Squared Error: 2.7594663832951985e-30
                   Feature   Coefficient
0                   rating -5.402864e-17
1                sentiment  1.334924e-17
2                    price  1.277461e-16
3               noun_count -3.049265e-17
4                adj_count  8.461563e-17
5                adv_count -1.240006e-17
6               word_count  1.036355e-17
7           sentence_count -1.530411e-16
8   avg_words_per_sentence -3.114621e-17
9             title_length  5.382613e-17
10        review_extremity -1.344474e-17
11       elapsed_time_days  7.112989e-19
12                   image  7.950223e-15
13                    year  2.351139e-16
14                   month -1.393993e-17
15                     day -8.494438e-18
16                    hour  1.655107e-18
17           helpful_ratio  1.000000e+00


In [10]:
import scipy.stats as stats

# Define your features
features = ['rating', 'sentiment', 'price', 'noun_count', 'adj_count', 'adv_count', 'word_count', 
            'sentence_count', 'avg_words_per_sentence', 'title_length', 'review_extremity', 
            'elapsed_time_days', 'image', 'year', 'month', 'day', 'hour', 'helpful_ratio']

# Define your datasets for hedonic and utilitarian products
datasets = [data_hedonic_parfum, data_utilitarian_razor, data_utilitarian_filter]
product_labels = ['Hedonic Parfum', 'Utilitarian Razor', 'Utilitarian Filter']

# Perform statistical testing for each feature
for feature in features:
    print(f"Feature: {feature}")
    for i in range(len(datasets)):
        for j in range(i+1, len(datasets)):
            dataset_1 = datasets[i][feature]
            dataset_2 = datasets[j][feature]
            t_stat, p_value = stats.ttest_ind(dataset_1, dataset_2)
            print(f"Comparison between {product_labels[i]} and {product_labels[j]}:")
            print(f"T-statistic: {t_stat}, p-value: {p_value}")
    print()


Feature: rating
Comparison between Hedonic Parfum and Utilitarian Razor:
T-statistic: -8.398492142032017, p-value: 5.56318054302687e-17
Comparison between Hedonic Parfum and Utilitarian Filter:
T-statistic: -8.257683285312142, p-value: 1.6455468436268317e-16
Comparison between Utilitarian Razor and Utilitarian Filter:
T-statistic: 2.678140175667626, p-value: 0.0074102482858745995

Feature: sentiment
Comparison between Hedonic Parfum and Utilitarian Razor:
T-statistic: 2.667289309799916, p-value: 0.007666684617706823
Comparison between Hedonic Parfum and Utilitarian Filter:
T-statistic: 2.357923961611818, p-value: 0.0183939204878421
Comparison between Utilitarian Razor and Utilitarian Filter:
T-statistic: -1.8633908780500135, p-value: 0.06242430312108226

Feature: price
Comparison between Hedonic Parfum and Utilitarian Razor:
T-statistic: -inf, p-value: 0.0
Comparison between Hedonic Parfum and Utilitarian Filter:
T-statistic: -inf, p-value: 0.0
Comparison between Utilitarian Razor and 

  t_stat, p_value = stats.ttest_ind(dataset_1, dataset_2)
