In [4]:
import time
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error

from utils.old_preproc import remove_html_tags_old, remove_stopwords

In [5]:
# Load the product descriptions and clean the text column in a single chained
# pass: first through remove_html_tags_old, then through remove_stopwords
# (both come from utils.old_preproc; their exact behaviour is not shown here).
descriptions = pd.read_csv('csv/product_descriptions.csv')
descriptions['product_description'] = (
    descriptions['product_description']
    .apply(remove_html_tags_old)
    .apply(remove_stopwords)
)

In [6]:
# read and preprocess train: attach each product's description (joined on
# product_uid) and keep only the columns used for modelling.
train = pd.read_csv('csv/train.csv', encoding='ISO-8859-1')
train = pd.merge(train, descriptions, on='product_uid')
# .copy() makes this an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
train = train[['search_term', 'product_description', 'relevance']].copy()

# read and preprocess test: same description join, then attach the relevance
# labels from solution.csv (joined on id).
test = pd.read_csv('csv/test.csv', encoding='ISO-8859-1')
test = pd.merge(test, descriptions, on='product_uid')
test_sol = pd.read_csv('csv/solution.csv')
test = pd.merge(test, test_sol, on='id')
test = test[['search_term', 'product_description', 'relevance']]
# Rows with relevance == -1 are excluded (presumably rows without a usable
# label -- TODO confirm against the data source). drop=True discards the old
# row labels instead of inserting them as a stray 'index' column.
test = test[test['relevance'] != -1].reset_index(drop=True)

In [6]:
# Build the model input text for both splits: the search term and the product
# description joined by a single space.
for frame in (train, test):
    frame['text_combined'] = frame['search_term'] + " " + frame['product_description']

In [7]:
# Character-level bag of counts: the vocabulary is learned from the full
# training text, then the same fitted vectorizer encodes the test text.
vectorizer = CountVectorizer(analyzer='char')
X = vectorizer.fit_transform(train['text_combined'])
X_test = vectorizer.transform(test['text_combined'])

# Regression targets for the train and test frames.
y, y_test = train['relevance'], test['relevance']

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# initialize models
# NOTE: linear_reg and svr_reg are instantiated but are not part of the
# `regressors` list below, so they are neither trained nor evaluated here.
linear_reg = LinearRegression()
ridge_reg = Ridge(alpha=1.0)
lasso_reg = Lasso(alpha=0.1)
elastic_net_reg = ElasticNet(alpha=0.1, l1_ratio=0.5)
# random_state is fixed like the other tree-based models so that repeated runs
# build the same tree (scikit-learn permutes features at each split).
decision_tree_reg = DecisionTreeRegressor(max_depth=5, random_state=42)
random_forest_reg = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
gradient_boosting_reg = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
svr_reg = SVR(kernel='rbf', C=1.0, epsilon=0.2)

regressors = [
    ('Ridge Regression', ridge_reg),
    ('Lasso Regression', lasso_reg),
    ('Elastic Net', elastic_net_reg),
    ('Decision Tree Regressor', decision_tree_reg),
    ('Random Forest Regressor', random_forest_reg),
    ('Gradient Boosting Regressor', gradient_boosting_reg),
]

# train and test models: fit each regressor on the training split, then report
# RMSE and MAE on the train, validation and test sets together with fit time.
for name, reg in regressors:
    # perf_counter is a monotonic clock intended for measuring elapsed time
    start_time = time.perf_counter()
    reg.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # predict each split once and reuse the result for both metrics
    train_pred = reg.predict(X_train)
    val_pred = reg.predict(X_val)
    test_pred = reg.predict(X_test)

    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))

    train_mae = mean_absolute_error(y_train, train_pred)
    val_mae = mean_absolute_error(y_val, val_pred)
    test_mae = mean_absolute_error(y_test, test_pred)

    print(
        f"{name} - Training time: {training_time:.4f}s, Train RMSE: {train_rmse}, Val RMSE: {val_rmse}, Test RMSE {test_rmse}, Train MAE: {train_mae}, Val MAE: {val_mae}, Test MAE: {test_mae}")


Ridge Regression - Training time: 0.3292s, Train RMSE: 0.5290273179410736, Val RMSE: 0.5275805392476611, Test RMSE 0.5322690853537078, Train MAE: 0.4338784810834374, Val MAE: 0.4314766636794571, Test MAE: 0.43594534963227977
Lasso Regression - Training time: 0.0461s, Train RMSE: 0.5341120696389539, Val RMSE: 0.5321181699165836, Test RMSE 0.5350642645305704, Train MAE: 0.43815167082911355, Val MAE: 0.4352876659118073, Test MAE: 0.4384917024890313
Elastic Net - Training time: 0.0445s, Train RMSE: 0.5340851824468785, Val RMSE: 0.5320775557235927, Test RMSE 0.5350517680707741, Train MAE: 0.4380770211454573, Val MAE: 0.4351918539437761, Test MAE: 0.43843630087183444
Decision Tree Regressor - Training time: 0.1929s, Train RMSE: 0.5300325363877064, Val RMSE: 0.5298187718701283, Test RMSE 0.5344932007888747, Train MAE: 0.4359445913869181, Val MAE: 0.43501193902097884, Test MAE: 0.4390034008881969
Random Forest Regressor - Training time: 13.8794s, Train RMSE: 0.5279730327231529, Val RMSE: 0.527