In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

In [2]:
random_state = 123
test_fraction = 0.05
cv = 20

feature_variables = ['total_reviews', 'elapsed_days_restaurant', 'ratio_positive_reviews', 'negative_reviews', 'immediacy_index']
target_variable = 'helpfulness_score_cbrt'

In [3]:
data = pd.read_csv('./../../data/data.csv')
train_data, test_data = train_test_split(data, test_size=test_fraction, random_state=random_state)

In [4]:
param_grid = {
    'polynomialfeatures__degree': [1, 2, 3, 4],  # Degrees of polynomial features
    'elasticnet__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],  # Regularization parameter
    'elasticnet__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]  # Mixing parameter for L1 and L2 regularization
}

In [5]:
elastic_net = ElasticNet()
poly_features = PolynomialFeatures(include_bias=False)
scaler = MinMaxScaler()

In [6]:
pipeline = Pipeline([
    ('scaler', scaler),
    ('polynomialfeatures', poly_features),
    ('elasticnet', elastic_net)
])

In [7]:
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='r2')

In [8]:
grid_search.fit(train_data[feature_variables], train_data[target_variable])

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

GridSearchCV(cv=20,
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('polynomialfeatures',
                                        PolynomialFeatures(include_bias=False)),
                                       ('elasticnet', ElasticNet())]),
             param_grid={'elasticnet__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0,
                                               10.0],
                         'elasticnet__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
                                                  0.7, 0.8, 0.9],
                         'polynomialfeatures__degree': [1, 2, 3, 4]},
             scoring='r2')

In [9]:
# Display the pipeline configured with the best hyperparameters found by the search.
grid_search.best_estimator_

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('polynomialfeatures',
                 PolynomialFeatures(degree=4, include_bias=False)),
                ('elasticnet', ElasticNet(alpha=0.0001, l1_ratio=0.1))])

In [10]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [11]:
# Show the hyperparameter combination selected by the grid search.
best_params

{'elasticnet__alpha': 0.0001,
 'elasticnet__l1_ratio': 0.1,
 'polynomialfeatures__degree': 4}

In [12]:
# Show the cross-validated score (scoring='r2') of the selected combination.
best_score

0.6868017076320707

In [13]:
grid_search.best_estimator_.score(train_data[feature_variables], train_data[target_variable])

0.6875062253346511

In [14]:
grid_search.best_estimator_.score(test_data[feature_variables], test_data[target_variable])

0.674694374816273

In [15]:
pipeline_new = Pipeline([
    ('scaler', MinMaxScaler()),
    ('polynomialfeatures', PolynomialFeatures(degree=4, include_bias=False)),
    ('elasticnet', ElasticNet(alpha=0.0001, l1_ratio=0.1))
])
pipeline_new.fit(train_data[feature_variables], train_data[target_variable])

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Pipeline(steps=[('scaler', MinMaxScaler()),
                ('polynomialfeatures',
                 PolynomialFeatures(degree=4, include_bias=False)),
                ('elasticnet', ElasticNet(alpha=0.0001, l1_ratio=0.1))])

In [16]:
r2_score(train_data[target_variable], pipeline_new.predict(train_data[feature_variables]))

0.6875062253346511

In [17]:
r2_score(test_data[target_variable], pipeline_new.predict(test_data[feature_variables]))

0.674694374816273