# Regression Model

In this notebook we will build a simple multiple regression model which will be used as a baseline model for the comparision with the xGBoost Model

In [3]:
# Importing Packages 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [4]:
# Importing Data with Features 
data_hedonic = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Final Data/Hedonic_Final.csv')
data_utilitarian = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Final Data/Utilitarian_Final.csv')

In [6]:
# Mapping dictionary for sentiment transformation
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}

# Using map function to create a new column 'sentiment_c' with transformed values
data_hedonic['sentiment_c'] = data_hedonic['Sentiment_Classification'].map(sentiment_mapping)
data_utilitarian['sentiment_c'] = data_hedonic['Sentiment_Classification'].map(sentiment_mapping)


In [13]:
def calculate_total_helpful_votes(df):
    for product_id, group in df.groupby('product'):
        total_helpful_votes = group['helpful_vote'].sum()
        df.loc[group.index, 'total_helpful_votes'] = total_helpful_votes
        df.loc[group.index, 'helpful_ratio'] = group['helpful_vote'] / total_helpful_votes

    return df


In [14]:
# First we will simply sort all reviews that have at least one helpful Vote
data_utilitarian_helpful = data_utilitarian[data_utilitarian['helpful_vote'] > 0]
data_hedonic_helpful = data_hedonic[data_hedonic['helpful_vote'] > 0]


In [5]:
# Join dataframes
data_helpful = pd.concat([data_utilitarian_helpful, data_hedonic_helpful])

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

# Define features and target
features = ['rating', 'sentiment_c', 'subjective_score', 'elap_days', 'image', 'ver_purch', 'word_count',
            'sent_count', 'sent_length', 'title_length', '#adj', '#adv', '#nouns', 'FRE']
target = 'helpful_ratio'

In [8]:

# Split data into training and testing sets
X = data_utilitarian_helpful[features]
y = data_utilitarian_helpful[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define and fit the model using sklearn
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Interpret the results
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
print(coefficients)

# Add a constant to the features matrix for the intercept term
X_train_const = sm.add_constant(X_train_scaled)

# Fit the linear regression model using statsmodels
sm_model = sm.OLS(y_train, X_train_const)
results = sm_model.fit()

# Get the summary of the regression results
print(results.summary())

# Extract p-values and identify significant features
p_values = results.pvalues
significant_features = p_values[p_values < 0.05].index
significant_features = significant_features[significant_features != 'const']  # Exclude the intercept
print("\nSignificant features based on p-values:")
print(significant_features)

Mean Squared Error: 0.00010170414264582919
             Feature  Coefficient
0             rating    -0.000094
1        sentiment_c    -0.000465
2   subjective_score     0.001349
3          elap_days     0.002502
4              image     0.002357
5          ver_purch     0.001062
6         word_count     0.003780
7         sent_count    -0.000351
8        sent_length     0.001807
9       title_length    -0.000185
10              #adj     0.000280
11              #adv     0.000547
12            #nouns     0.001368
13               FRE     0.001603
                            OLS Regression Results                            
Dep. Variable:          helpful_ratio   R-squared:                       0.039
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     1.643
Date:                Fri, 24 May 2024   Prob (F-statistic):             0.0638
Time:                        16:02:43   Log-Likeliho

In [9]:
# Split data into training and testing sets
X = data_hedonic_helpful[features]
y = data_hedonic_helpful[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define and fit the model using sklearn
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Interpret the results
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
print(coefficients)

# Add a constant to the features matrix for the intercept term
X_train_const = sm.add_constant(X_train_scaled)

# Fit the linear regression model using statsmodels
sm_model = sm.OLS(y_train, X_train_const)
results = sm_model.fit()

# Get the summary of the regression results
print(results.summary())

# Extract p-values and identify significant features
p_values = results.pvalues
significant_features = p_values[p_values < 0.05].index
significant_features = significant_features[significant_features != 'const']  # Exclude the intercept
print("\nSignificant features based on p-values:")
print(significant_features)

Mean Squared Error: 0.00012498544500094545
             Feature  Coefficient
0             rating    -0.000871
1        sentiment_c     0.000582
2   subjective_score    -0.000310
3          elap_days     0.000417
4              image     0.001395
5          ver_purch     0.000803
6         word_count     0.001552
7         sent_count    -0.000294
8        sent_length     0.000692
9       title_length     0.000580
10              #adj    -0.000584
11              #adv    -0.000503
12            #nouns    -0.000629
13               FRE    -0.000266
                            OLS Regression Results                            
Dep. Variable:          helpful_ratio   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     2.224
Date:                Fri, 24 May 2024   Prob (F-statistic):            0.00602
Time:                        16:03:52   Log-Likeliho

## Regression Model on Product level 

In [12]:
# Filter data to only include reviews with at least one helpful vote
perfume_data_hedonic = data_hedonic_helpful[data_hedonic_helpful['product'] == 'Facial Spray']

# Split data into training and testing sets
X = perfume_data_hedonic[features]
y = perfume_data_hedonic[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define and fit the model using sklearn
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Interpret the results
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
print(coefficients)

# Add a constant to the features matrix for the intercept term
X_train_const = sm.add_constant(X_train_scaled)

# Fit the linear regression model using statsmodels
sm_model = sm.OLS(y_train, X_train_const)
results = sm_model.fit()

# Get the summary of the regression results
print(results.summary())

# Extract p-values and identify significant features
p_values = results.pvalues
significant_features = p_values[p_values < 0.05].index
significant_features = significant_features[significant_features != 'const']  # Exclude the intercept
print("\nSignificant features based on p-values:")
print(significant_features)

Mean Squared Error: 1.311634556441197e-05
             Feature  Coefficient
0             rating    -0.001144
1        sentiment_c     0.000976
2   subjective_score    -0.000212
3          elap_days     0.000419
4              image     0.000896
5          ver_purch     0.000194
6         word_count     0.000379
7         sent_count    -0.000108
8        sent_length     0.000271
9       title_length    -0.000067
10              #adj    -0.000046
11              #adv    -0.000016
12            #nouns     0.000069
13               FRE     0.000007
                            OLS Regression Results                            
Dep. Variable:          helpful_ratio   R-squared:                       0.063
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     4.720
Date:                Fri, 24 May 2024   Prob (F-statistic):           2.00e-08
Time:                        15:39:24   Log-Likelihoo

In [None]:
# Filter data to only include reviews with at least one helpful vote
utilitarian_battery = data_utilitarian_helpful[data_utilitarian_helpful['product'] == 'Battery']

# Split data into training and testing sets
X = utilitarian_battery[features]
y = utilitarian_battery[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define and fit the model using sklearn
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Interpret the results
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
print(coefficients)

# Add a constant to the features matrix for the intercept term
X_train_const = sm.add_constant(X_train_scaled)

# Fit the linear regression model using statsmodels
sm_model = sm.OLS(y_train, X_train_const)
results = sm_model.fit()

# Get the summary of the regression results
print(results.summary())

# Extract p-values and identify significant features
p_values = results.pvalues
significant_features = p_values[p_values < 0.05].index
significant_features = significant_features[significant_features != 'const']  # Exclude the intercept
print("\nSignificant features based on p-values:")
print(significant_features)

In [None]:
# Prepare a DataFrame to collect the results
results_df = pd.DataFrame(columns=['main_category', 'feature', 'coefficient', 'p_value', 'significant', 'MSE'])

# Perform regression separately for each main category
for category in data_utilitarian_helpful['main_category'].unique():
    category_data = data_utilitarian_helpful[data_utilitarian_helpful['main_category'] == category]

    # Split data into training and testing sets
    X = category_data[features]
    y = category_data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Add a constant to the features matrix for the intercept term
    X_train_const = sm.add_constant(X_train_scaled)

    # Fit the linear regression model using statsmodels
    sm_model = sm.OLS(y_train, X_train_const)
    results = sm_model.fit()

    # Calculate Mean Squared Error (MSE)
    y_pred = results.predict(sm.add_constant(X_test_scaled))
    mse = mean_squared_error(y_test, y_pred)

    # Collect the results
    results_df = pd.concat([
        results_df,
        pd.DataFrame({
            'main_category': category,
            'feature': ['const'] + features,
            'coefficient': results.params,
            'p_value': results.pvalues,
            'significant': results.pvalues < 0.05,
            'MSE': mse
        })
    ])

# Display the results
print(results_df)