# Product recommender system

In [83]:
#import packages 
import sklearn
import pandas as pd
import numpy as np
import os

## Get the data

In [38]:
from sklearn.datasets import load_wine
data1 = load_wine(as_frame = True)

{'data':      alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
 0      14.23        1.71  2.43               15.6      127.0           2.80   
 1      13.20        1.78  2.14               11.2      100.0           2.65   
 2      13.16        2.36  2.67               18.6      101.0           2.80   
 3      14.37        1.95  2.50               16.8      113.0           3.85   
 4      13.24        2.59  2.87               21.0      118.0           2.80   
 ..       ...         ...   ...                ...        ...            ...   
 173    13.71        5.65  2.45               20.5       95.0           1.68   
 174    13.40        3.91  2.48               23.0      102.0           1.80   
 175    13.27        4.28  2.26               20.0      120.0           1.59   
 176    13.17        2.59  2.37               20.0      120.0           1.65   
 177    14.13        4.10  2.74               24.5       96.0           2.05   
 
      flavanoids  nonflavanoid

In [84]:
#Load dataset
DIRECTORY_WHERE_THIS_FILE_IS = os.path.dirname(os.path.abspath("Product_recommender.md"))
DATA_PATH = os.path.join(DIRECTORY_WHERE_THIS_FILE_IS, "data/kaggle_wine2.csv")
df1 = pd.read_csv(DATA_PATH)

In [85]:
df1['description'].head(2)

0    Aromas include tropical fruit, broom, brimston...
1    This is ripe and fruity, a wine that is smooth...
Name: description, dtype: object

In [86]:
#extract the wine year
import re
df1['year'] = 0
count = 0
for el in df1['title']: 
    res = [int(i) for i in el.split() if i.isdigit()]
    if len(res) == 1:
        df1.loc[count,'year'] = res[0]
    elif len(res) == 2 and res[0] > 1900:
        df1.loc[count,'year'] = res[0]
    elif len(res) == 2 and res[1] > 1900:
         df1.loc[count,'year'] = res[1]
    count += 1

In [87]:
#onehotencode the country qnd yeqr
categorical_columns = ['country', 'variety']
for column in categorical_columns:
    tempdf = pd.get_dummies(df1[column], prefix=column)
    df1 = pd.merge(
        left=df1,
        right=tempdf,
        left_index=True,
        right_index=True,
    )
    df1 = df1.drop(columns=column)

In [88]:
#drop the other non numericql collumns
df2 = df1.drop(['Unnamed: 0','designation','description','province','region_1','region_2','taster_name','taster_twitter_handle','winery'], axis = 1)

In [None]:
#Take the most represented collumns
for col in df2.columns:
    if col not in ('points', 'price','title'):
        if sum(df2[col]) < 5000 :
            df2 = df2.drop(columns= col)

In [None]:
df2.head(5)

In [None]:
#null
#drop for now
df2 = df2.dropna()

In [None]:
from sklearn.model_selection import train_test_split
y = df2["points"]
X = df2.loc[:,df2.columns != ["points",'title']]

In [66]:
#scale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X = pd.DataFrame(X)

#remove correlated features
def get_correlation(data, threshold):
    corr_col = set()
    corrmat = data.corr()
    for i in range(len(corrmat.columns)):
        for j in range(i):
            if abs(corrmat.iloc[i, j]) > threshold:
                colname = corrmat.columns[i]
                corr_col.add(colname)
    return corr_col

corr_features = get_correlation(X, 0.70)
print('correlated features: ', len(set(corr_features)) )
print(corr_features)

#X = X.drop(labels=corr_features, axis = 1)



correlated features:  0
set()


In [67]:
#train test
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.30, random_state=42)

### Baseline

In [75]:
average = sum(y_train) / len(y_train)

In [81]:
arr_avg = []
for el in y_test:
    arr_avg.append(average)

In [82]:
print('Mean absolute error: {}'.format(mean_absolute_error(y_test,arr_avg)))

Mean absolute error: 2.4913350809477026


### Random forest regressor 

In [69]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [70]:
grid = {
    "max_depth": [35,40], 
    "min_samples_leaf": [2,3],
    "min_samples_split": [4,5,8,10]
}

"Sklearn"
"-----------------------"
grid_search = GridSearchCV(RandomForestRegressor(), param_grid = grid)
grid_search.fit(X_train, y_train)
optimal_model = grid_search.best_estimator_
"-----------------------"

print("Fine Tuned Model: {0}".format(optimal_model))

Fine Tuned Model: RandomForestRegressor(max_depth=40, min_samples_leaf=3, min_samples_split=10)


In [71]:
model = RandomForestRegressor(max_depth = optimal_model.max_depth, min_samples_leaf = optimal_model.min_samples_leaf, min_samples_split = optimal_model.min_samples_split)
model.fit(X_train,y_train)
adjusted_pred = model.predict(X_test)

In [74]:
print('Mean absolute error: {}'.format(mean_absolute_error(y_test,adjusted_pred)))

Mean absolute error: 1.8147790713983896
