# Regression Model

In this notebook we will build a simple multiple regression model which will be used as a baseline model for the comparision with the xGBoost Model

In [1]:
# Importing Packages 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Importing Data with Features 
data_hedonic = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Final Data/Hedonic_Final.csv')
data_utilitarian = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Final Data/Utilitarian_Final.csv')

In [3]:
# First we will simply sort all reviews that have at least one helpful Vote
data_utilitarian_helpful = data_utilitarian[data_utilitarian['helpful_vote'] > 0]
data_hedonic_helpful = data_hedonic[data_hedonic['helpful_vote'] > 0]


In [6]:
# Join dataframes
data_helpful = pd.concat([data_utilitarian_helpful, data_hedonic_helpful])

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Define features and target
features = ['helpful_ratio', 'rating', 'sentiment', 'subjective_score', 'word_count',
            'sent_count', 'sent_length', 'title_length', 'review_ext',
            '#adj', '#adv', '#nouns', 'FRE', 'elap_days', 'image', 'ver_purch']
target = 'helpful_vote'

# Split data into training and testing sets
X = data_helpful[features]
y = data_helpful[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Interpret the results
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
print(coefficients)


Mean Squared Error: 206.59567921871192
             Feature  Coefficient
0      helpful_ratio  5685.959525
1             rating   -19.383021
2          sentiment     0.168629
3   subjective_score    -1.673692
4         word_count     0.043597
5         sent_count    -0.866268
6        sent_length    -0.126617
7       title_length    -0.184792
8         review_ext    18.883241
9               #adj    -3.431640
10              #adv     1.446686
11            #nouns     1.902204
12               FRE    -0.064813
13         elap_days     0.001512
14             image     1.843337
15         ver_purch    -0.591346


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Assuming 'data_helpful' is your DataFrame and the features and target are defined

# Define the features and target
features = ['helpful_ratio', 'rating', 'sentiment', 'subjective_score', 'word_count',
            'sent_count', 'sent_length', 'title_length', 'review_ext',
            '#adj', '#adv', '#nouns', 'FRE', 'elap_days', 'image', 'ver_purch']
target = 'helpful_ratio'



# Split the data into training and testing sets
X = data_helpful[features]
y = data_helpful[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models to try
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Random Forest': RandomForestRegressor()
}

# Evaluate models using cross-validation
for name, model in models.items():
    cv = KFold(n_splits=5, random_state=42, shuffle=True)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=cv)
    print(f"{name} CV Mean Squared Error: {-scores.mean()}")

# Fit the best model (Random Forest in this case, you can choose based on CV results)
best_model = RandomForestRegressor()
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


Linear Regression CV Mean Squared Error: 3.141248060885838e-32


Ridge CV Mean Squared Error: 6.173503901019308e-12
Lasso CV Mean Squared Error: 0.00010423847380199239
Random Forest CV Mean Squared Error: 6.468920108047862e-06


In [None]:
# Evaluate the model
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Interpret the results
feature_importances = pd.DataFrame({'Feature': features, 'Importance': best_model.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)

