In [1]:
import math
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random

%matplotlib inline
# Global matplotlib defaults for every figure in this notebook.
# ('font.size' used to be listed twice; only the last value, 20, ever took effect.)
plt.rcParams.update({
        'font.size': 20,
        'axes.titlesize': 20,
        'axes.labelsize': 20,
        'xtick.labelsize': 20,
        'ytick.labelsize': 20,
        'figure.figsize': (10, 7),
        'axes.grid': True,
        'grid.linestyle': '-',
        'grid.alpha': 0.3,
        'lines.markersize': 5.0,
        'xtick.minor.visible': True,
        'xtick.direction': 'in',
        'xtick.major.size': 20.0,
        'xtick.minor.size': 10.0,
        'xtick.top': False,
        'xtick.bottom': True,
        'ytick.minor.visible': True,
        'ytick.direction': 'in',
        'ytick.major.size': 12.0,
        'ytick.minor.size': 6.0,
        'ytick.right': True,
        'errorbar.capsize': 0.0,
    })

# https://www.kaggle.com/competitions/practical-ml-chocolate/data

# 1. Get data

In [2]:
df_train = pd.read_csv('chocolate_train.csv')
df_train.head()

Unnamed: 0,Company,Specific Bean Origin,REF,Review,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,Willie's Cacao,Rio Caribe,457,2009,72%,U.K.,3.25,Trinitario,Venezuela
1,Beschle (Felchlin),"Ocumare, Premier Cru, Quizas No. 2",508,2010,72%,Switzerland,3.5,,Venezuela
2,Dark Forest,Tanzania,1554,2015,70%,U.S.A.,3.0,,Tanzania
3,Brasstown aka It's Chocolate,Cooproagro,1125,2013,72%,U.S.A.,3.0,Trinitario,Dominican Republic
4,Pralus,"Java, Indonesie",32,2006,75%,France,3.5,Criollo,Indonesia


In [3]:
print(f'Train data has {df_train.shape[1]} columns and {df_train.shape[0]} rows')

Train data has 9 columns and 1255 rows


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255 entries, 0 to 1254
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Company               1255 non-null   object 
 1   Specific Bean Origin  1255 non-null   object 
 2   REF                   1255 non-null   int64  
 3   Review                1255 non-null   int64  
 4   Cocoa Percent         1255 non-null   object 
 5   Company Location      1255 non-null   object 
 6   Rating                1255 non-null   float64
 7   Bean Type             1254 non-null   object 
 8   Broad Bean Origin     1254 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 88.4+ KB


In [5]:
print('Number of unique values in categorical columns')

for col in df_train.columns:
    if df_train[col].dtype == 'O':
        print(f'{col}: {df_train[col].nunique()}')

Number of unique values in categorical columns
Company: 376
Specific Bean Origin: 762
Cocoa Percent: 40
Company Location: 58
Bean Type: 38
Broad Bean Origin: 84


In [6]:
# Make percentage float ("72%" -> 72.0).
# Casting to str first keeps the cell idempotent: re-running it on the already
# converted float column works instead of failing on a missing `.strip`.
df_train['Cocoa Percent'] = df_train['Cocoa Percent'].astype(str).str.strip('%').astype(float)

In [7]:
df_train.describe()

Unnamed: 0,REF,Review,Cocoa Percent,Rating
count,1255.0,1255.0,1255.0,1255.0
mean,1045.152191,2012.38247,71.790438,3.176494
std,551.284249,2.922499,6.397448,0.478948
min,5.0,2006.0,46.0,1.0
25%,593.0,2010.0,70.0,2.75
50%,1077.0,2013.0,70.0,3.25
75%,1514.0,2015.0,75.0,3.5
max,1952.0,2017.0,100.0,5.0


## 1.1 Handling empty cells and nans

In [8]:
df_train.columns[df_train.dtypes == object]

Index(['Company', 'Specific Bean Origin', 'Company Location', 'Bean Type',
       'Broad Bean Origin'],
      dtype='object')

In [9]:
df_train.isna().sum()

Company                 0
Specific Bean Origin    0
REF                     0
Review                  0
Cocoa Percent           0
Company Location        0
Rating                  0
Bean Type               1
Broad Bean Origin       1
dtype: int64

In [10]:
df_train = df_train.fillna('Unknown')

In [12]:
empty_cell = df_train['Bean Type'].value_counts().index[0]
(df_train == empty_cell).sum()

Company                   0
Specific Bean Origin      0
REF                       0
Review                    0
Cocoa Percent             0
Company Location          0
Rating                    0
Bean Type               628
Broad Bean Origin        55
dtype: int64

In [None]:
# The most frequent 'Bean Type' value is what this notebook treats as the
# empty-cell placeholder. Once the replacement below has run, the most frequent
# value becomes "Unknown", so only (re)assign `empty_cell` while the raw
# placeholder is still present. This keeps the cell safe to re-run and keeps
# `empty_cell` usable for the test-set cleanup further down.
most_common_bean_type = df_train['Bean Type'].value_counts().index[0]
if most_common_bean_type != "Unknown":
    empty_cell = most_common_bean_type
df_train['Bean Type'] = df_train['Bean Type'].replace({empty_cell: "Unknown"})
df_train['Broad Bean Origin'] = df_train['Broad Bean Origin'].replace({empty_cell: "Unknown"})
# Fix a misspelled country name in the raw data.
df_train['Company Location'] = df_train['Company Location'].replace({'Niacragua': 'Nicaragua'})

## 1.2 Test data

In [None]:
import os

os.listdir()

In [None]:
df_test = pd.read_csv('chocolate_test_new.csv')
df_test.head()

In [None]:
df_test.isna().sum()

In [None]:
# Apply the same cleaning steps as for the train set so both frames share one
# vocabulary of category values.

# "72%" -> 72.0; the str cast makes the conversion safe to re-run.
df_test['Cocoa Percent'] = df_test['Cocoa Percent'].astype(str).str.strip('%').astype(float)

# Missing text values become "Unknown" (the train set got the same treatment via fillna).
test_text_cols = df_test.columns[df_test.dtypes == object]
df_test[test_text_cols] = df_test[test_text_cols].fillna('Unknown')

# `empty_cell` is the placeholder detected on the train set.
df_test['Bean Type'] = df_test['Bean Type'].replace({empty_cell: "Unknown"})
df_test['Broad Bean Origin'] = df_test['Broad Bean Origin'].replace({empty_cell: "Unknown"})
# Same spelling fix as applied to the train set.
df_test['Company Location'] = df_test['Company Location'].replace({'Niacragua': 'Nicaragua'})

df_test.head(10)

In [None]:
print('Empty cells')

for col in df_test.columns:
    if df_test[col].dtype == 'O':
        print(f'{col}: {(df_test[col]==empty_cell).sum()}')

In [None]:
object_cols = df_train.columns[df_train.dtypes == object]

In [None]:
print('Number of unique values in categorical columns')

for col in df_test.columns:
    if df_test[col].dtype == 'O':
        print(f'{col}: {df_test[col].nunique()}')

# 2. EDA

In [None]:
plt.scatter(df_train['Cocoa Percent'], df_train['Rating'], c='black')
plt.xlabel('Cocoa percent')
plt.ylabel('Rating')
plt.show()

In [None]:
df_train['Rating'].nunique()

In [None]:
df_train.groupby('Rating')['Cocoa Percent'].mean()

In [None]:
def distribution(data, title, x_label, y_label, figsize=(10,5)):
    """Draw a bar plot with one bar per entry of `data` and show it.

    Parameters
    ----------
    data : object exposing `.index` and `.values` (e.g. a pandas Series)
        `data.index` supplies the bar positions, `data.values` the bar heights.
    title : str
        Figure title (drawn with font size 20).
    x_label, y_label : str
        Axis labels.
    figsize : tuple, default (10, 5)
        Size passed to `plt.figure`.
    """
    plt.figure(figsize=figsize)
    # The 'seaborn-pastel' style was renamed to 'seaborn-v0_8-pastel' in
    # matplotlib 3.6 and the old name was removed later, so use whichever name
    # this installation provides (and skip styling if neither exists).
    for style_name in ('seaborn-v0_8-pastel', 'seaborn-pastel'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    sns.barplot(x=data.index, y=data.values)
    plt.title(title, fontdict={'fontsize': 20})
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # Show the figure as part of the call (previously this ran once, at definition time).
    plt.show()

In [None]:
rating_cocoa_percent_dist = df_train.groupby('Rating')['Cocoa Percent'].mean()

distribution(data=rating_cocoa_percent_dist,
             title='Mean cocoa percent in rating',
             x_label='Rating',
             y_label='Cocoa percent')

Chocolate with very high percentage of cocoa may have lower rating

In [None]:
df_train.groupby('Rating')['REF'].mean()

In [None]:
plt.hist2d(df_train['REF'], df_train['Rating'], bins=[20, 16])
plt.xlabel('REF')
plt.ylabel('Rating')
plt.show()

In [None]:
sns.histplot(data=df_train, x="REF", y="Rating", cbar=True, bins=[20, 16])

In [None]:
rating_cocoa_percent_dist = df_train.groupby('Rating')['REF'].mean()

distribution(data=rating_cocoa_percent_dist,
             title='Mean REF in rating',
             x_label='Rating',
             y_label='REF')

In [None]:
unique_rating = df_train['Rating'].nunique()
unique_rating

Chocolate with a low REF tends to have a low or a very high rating

In [None]:
plt.scatter(df_train['Review'], df_train['REF'])
plt.xlabel('Review [Year]')
plt.ylabel('REF')
plt.show()

In [None]:
df_train[['Review', 'REF']].corr()

REF and Review are highly correlated

In [None]:
df_train.groupby('Review')['Rating'].mean()

In [None]:
rating_cocoa_percent_dist = df_train.groupby('Review')['Rating'].mean()

distribution(data=rating_cocoa_percent_dist,
             title='Mean rating per year',
             x_label='Review [Year]',
             y_label='Rating',
             figsize=(15, 5))

In [None]:
df_train.loc[df_train['Bean Type'] == 'Unknown', ['Bean Type', 'Specific Bean Origin']]

In [None]:
df_train[df_train['Bean Type'] == 'Unknown']['Specific Bean Origin'].value_counts()

In [None]:
df_train[df_train['Bean Type'] != 'Unknown']['Specific Bean Origin'].value_counts()

In [None]:
rating_bins = int((df_train.Rating.max() - df_train.Rating.min()) / 0.25)

plt.hist(df_train.Rating, bins=rating_bins)
plt.xlabel('Rating')
plt.show()

Inference:
* Chocolate with very high percentage of cocoa may have lower rating
* Chocolate with low REF rated lower
* REF and Review (year) are highly correlated

## 2.1 Correlation matrix

In [None]:
plt.figure(figsize = (10,8))
plt.rcParams.update({
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
})
# Correlation is only defined for numeric columns. Selecting them explicitly
# avoids the error pandas >= 2.0 raises when the frame contains string columns.
sns.heatmap(df_train.select_dtypes(include='number').corr(), annot=True, cmap="RdYlGn")

plt.show()

In [None]:
import association_metrics as am

XC = df_train.apply(lambda x: x.astype("category") if x.dtype == "object" else x)
cramersv = am.CramersV(XC)
cramersv.fit()

In [None]:
XC = df_test.apply(lambda x: x.astype("category") if x.dtype == "object" else x)
cramersv = am.CramersV(XC)
cramersv.fit()

In [None]:
for col in ['Company', 'Company Location']:
    print(f'{col}: {df_train[col].nunique()}')

Correlated features (may drop one from each pair):
* REF and Review
* Company and Company Location
* Broad Bean Origin and Specific Bean Origin
* Specific Bean Origin and Bean Type

Maybe we should drop the "Bean Type" column

In [None]:
df_train_short = df_train.drop(['Company', 'Specific Bean Origin'], axis=1)
df_test_short = df_test.drop(['Company', 'Specific Bean Origin'], axis=1)

In [None]:
object_cols_short = df_train_short.columns[df_train_short.dtypes==object]
for col in object_cols_short:
    out_mask = ~np.isin(df_test_short[col].unique(), df_train_short[col].unique())
    print(out_mask.sum())
    print(f'{col} presented in test but not in train:')
    print('-'*70)
    print(*df_test_short[col].unique()[out_mask], sep='\n')
    print('-'*70)

In [None]:
df_test_short[df_test_short['Company Location'] == 'Martinique']

In [None]:
df_test_short[df_test_short['Company Location'].apply(lambda x: x in ['Martinique', 'Philippines'])]

In [None]:
df_test_short[df_test_short['Broad Bean Origin'].apply(lambda x: x in ['Martinique', 'Philippines'])]

# 3. Feature engineering

## 3.1 Broad company location

In [None]:
df_train.groupby('Company Location').agg({'Rating': 'mean', 'Company Location': 'count'})\
    .sort_values(by='Rating', ascending=False).head()

In [None]:
# Coarse region -> list of 'Company Location' values assigned to it.
# Spellings such as 'Domincan Republic', 'Niacragua' and 'Eucador' are kept
# verbatim — presumably they mirror raw values in the data (TODO confirm).
broad_locations = {
    'Europe': [
        'U.K.', 'Switzerland', 'France', 'Belgium', 'Spain', 'Italy',
        'Austria', 'Sweden', 'Ireland', 'Poland', 'Hungary', 'Germany',
        'Denmark', 'Lithuania', 'Scotland', 'Finland', 'Iceland',
        'Amsterdam', 'Wales', 'Netherlands', 'Portugal', 'Czech Republic',
    ],
    'North America': [
        'U.S.A.', 'Canada',
    ],
    'Africa': [
        'Madagascar', 'South Africa', 'Ghana', 'Sao Tome',
    ],
    'Caribbean': [
        'Colombia', 'Venezuela', 'Honduras', 'Guatemala',
        'Domincan Republic', 'Grenada', 'Puerto Rico', 'Costa Rica',
        'St. Lucia', 'Nicaragua', 'Martinique', 'Niacragua',
    ],
    'Asia & Oceania': [
        'Japan', 'Vietnam', 'Philippines', 'Fiji', 'Australia',
        'South Korea', 'Israel', 'Singapore', 'India', 'Russia',
        'New Zealand',
    ],
    'South America': [
        'Ecuador', 'Brazil', 'Peru', 'Eucador', 'Argentina',
        'Chile', 'Mexico', 'Bolivia', 'Suriname',
    ],
}

In [None]:
def location_in_list(location, locations_dict=None):
    """Return the name of the first region whose member list contains `location`.

    Parameters
    ----------
    location : str
        Value to look up.
    locations_dict : dict[str, list[str]], optional
        Mapping region name -> list of member locations. When omitted, the
        module-level `broad_locations` mapping is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    str
        The matching region name, or 'Other' when no region lists `location`.
    """
    if locations_dict is None:
        locations_dict = broad_locations
    for region, members in locations_dict.items():
        if location in members:
            return region
    return 'Other'

df_train['Broad Company Location'] = df_train['Company Location'].apply(lambda x: location_in_list(x))
df_test['Broad Company Location'] = df_test['Company Location'].apply(lambda x: location_in_list(x))

In [None]:
df_train.groupby('Broad Company Location')\
    .agg({'Rating': ['mean', 'std'], 'Broad Company Location': 'count'})

In [None]:
from scipy.stats import f_oneway

ratings = [df_train.loc[df_train['Broad Company Location'] == i, ['Rating']].values[:, 0] for i in df_train['Broad Company Location'].unique()]
f_test = f_oneway(*ratings)
print('One way anova test')
print(f'F statistics = {f_test.statistic}')
print(f'p-value = {f_test.pvalue}')

In [None]:
# Print every bean origin that never appears as a company location.
company_locations = set(df_train["Company Location"].unique())
for origin in df_train['Broad Bean Origin'].unique():
    if origin not in company_locations:
        print(origin)

In [None]:
# Coarse region -> list of 'Broad Bean Origin' values assigned to it.
# Lookup returns the first region whose list contains the value, so key order matters.
# Changes from the previous version: 'Indonesia' moved from 'Africa' to
# 'Asia & Oceania', 'Togo' moved from 'Asia & Oceania' to 'Africa', and the
# duplicated 'Philippines' entry was removed.
# NOTE(review): several multi-country blends (e.g. 'Venezuela, Ghana',
# 'Venezuela, Java', 'Dom. Rep., Madagascar') sit in a single region instead of
# 'Mixed'; left as authored — confirm whether that is intended.
broad_locations_bean = {'Europe': ['U.K.', 'Switzerland', 'France', 'Belgium', 'Spain', 'Italy',
                             'Austria', 'Sweden', 'Ireland', 'Poland', 'Hungary', 'Germany',
                             'Denmark', 'Lithuania', 'Scotland', 'Finland', 'Iceland',
                             'Amsterdam', 'Wales', 'Netherlands', 'Portugal', 'Czech Republic'],
                  'North America': ['U.S.A.', 'Canada'],
                  'Africa': ['Madagascar', 'South Africa', 'Ghana', 'Sao Tome', 'Tanzania',
                             'Congo', 'Liberia', 'Principe', 'Sao Tome & Principe',
                             'Gabon', 'Ivory Coast', 'Uganda', 'Nigeria', 'West Africa', 'Togo'],
                  'Caribbean': ['Colombia', 'Venezuela', 'Honduras', 'Guatemala',
                               'Domincan Republic', 'Grenada', 'Puerto Rico', 'Costa Rica',
                               'St. Lucia', 'Nicaragua', 'Martinique', 'Niacragua', 'Cost Rica, Ven',
                               'Trinidad', 'Panama', 'Jamaica', 'Haiti', 'Cuba', 'Venezuela, Ghana',
                               'Ven.,Ecu.,Peru,Nic.', 'Tobago', 'Carribean(DR/Jam/Tri)',
                               'Venezuela, Java', 'Ven, Bolivia, D.R.', 'Venezuela, Carribean',
                               'Dominican Republic', 'Carribean'],
                   'Asia & Oceania': ['Japan', 'Vietnam', 'Philippines', 'Fiji', 'Australia',
                                      'South Korea', 'Israel', 'Singapore', 'India', 'Russia',
                                      'New Zealand', 'Papua New Guinea', 'Hawaii', 'Solomon Islands',
                                      'Sri Lanka', 'Malaysia', 'Samoa', 'Indonesia',
                                      'Vanuatu'],
                   'South America': ['Ecuador', 'Brazil', 'Peru', 'Eucador', 'Argentina',
                                    'Chile', 'Mexico', 'Bolivia', 'Suriname', 'Peru, Dom. Rep',
                                    'Central and S. America', 'Colombia, Ecuador', 'Dom. Rep., Madagascar',
                                    'South America', 'Belize', 'El Salvador'],
                   'Mixed': ['Trinidad, Ecuador', 'South America, Africa', 'PNG, Vanuatu, Mad',
                            'Mad., Java, PNG', 'Peru, Mad., Dom. Rep.', 'Indonesia, Ghana', 'Madagascar & Ecuador',
                            'Venez,Africa,Brasil,Peru,Mex', 'DR, Ecuador, Peru', 'Dominican Rep., Bali',
                            'Peru, Madagascar', 'Venezuela, Dom. Rep.', 'Peru, Ecuador, Venezuela',
                            'Ven., Trinidad, Mad.', 'Ven., Indonesia, Ecuad.', 'Ghana, Domin. Rep',
                            'Peru, Belize', 'Guat., D.R., Peru, Mad., PNG']
                  }

In [None]:
def location_in_list(location, locations_dict=broad_locations):
    """Map `location` to the first region of `locations_dict` that lists it.

    `locations_dict` maps a region name to a list of member locations and
    defaults to `broad_locations`. Returns the region name, or 'Other' when
    no list contains `location`.
    """
    matching_regions = (
        region
        for region, members in locations_dict.items()
        if location in members
    )
    return next(matching_regions, 'Other')

df_train['Bean origin world'] = df_train['Broad Bean Origin'].apply(lambda x: location_in_list(x, broad_locations_bean))
df_test['Bean origin world'] = df_test['Broad Bean Origin'].apply(lambda x: location_in_list(x, broad_locations_bean))

In [None]:
df_train['Bean origin world'].value_counts()

In [None]:
df_train.groupby('Bean origin world').agg({'Rating': 'mean'})

In [None]:
def multiple_names(string):
    """Return 1 if `string` contains a ',' or '&' separator, otherwise 0."""
    separators = (',', '&')
    return int(any(sep in string for sep in separators))

df_train['Multiple bean origins'] = df_train['Broad Bean Origin'].apply(lambda x: multiple_names(x))
df_test['Multiple bean origins'] = df_test['Broad Bean Origin'].apply(lambda x: multiple_names(x))

In [None]:
df_train.groupby('Multiple bean origins')\
    .agg({'Rating': ['mean', 'std'], 'Multiple bean origins': 'count'})
    #.rename({'Multiple bean origins': 'count'}, axis=1)

In [None]:
df_train.head()

In [None]:
# Countries producing the best chocolate (according to internet)
# ('Tahiland' was a typo and could never match a real location value.)
best_production_countries = ['Belgium', 'Switzerland', 'Italy', 'Germany', 'Austria', 'Poland',
                             'Ecuador', 'Japan', 'New Zealand', 'Sweden', 'Spain', 'Mexico', 'Brazil',
                             'India', 'Peru', 'Australia', 'U.K.',
                             'Indonesia', 'Thailand', 'Philippines', 'Ivory Coast', 'U.S.A.']

In [None]:
# Listed countries that never occur as a 'Company Location' in the train set.
# np.isin replaces the deprecated np.in1d (and matches its use elsewhere in this notebook).
np.array(best_production_countries)[~np.isin(best_production_countries, df_train['Company Location'].unique())]

In [None]:
# 1 when the company is located in one of `best_production_countries`, else 0.
df_train['Best Production'] = df_train['Company Location'].isin(best_production_countries).astype(int)
# The test feature must come from the test frame's own column; it was previously
# computed from df_train and then index-aligned onto df_test.
df_test['Best Production'] = df_test['Company Location'].isin(best_production_countries).astype(int)

In [None]:
df_train.groupby('Best Production').agg(MeanRating=('Rating', 'mean'),
                                        StdRating=('Rating', 'std'))

In [None]:
# Countries with the largest cocoa production.
# 'Ivory Coast' is the spelling used by the other lists in this notebook, so it
# is listed alongside 'Cote d’Ivoire' to let that country actually match.
top_amount_productrion_countries = ['Cote d’Ivoire', 'Ivory Coast', 'Ghana', 'Indonesia', 'Nigeria',
                                    'Ecuador', 'Cameroon', 'Brazil', 'Sierra Leone',
                                    'Peru', 'Dominican Republic']

In [None]:
# 1 when the bean origin is one of `top_amount_productrion_countries`, else 0.
df_train['Top Production Country'] = df_train['Broad Bean Origin'].isin(top_amount_productrion_countries).astype(int)
# Use the test frame's own column (it was previously derived from df_train).
df_test['Top Production Country'] = df_test['Broad Bean Origin'].isin(top_amount_productrion_countries).astype(int)

In [None]:
df_train.groupby('Top Production Country')\
    .agg(MeanRating=('Rating', 'mean'),
         StdRating=('Rating', 'std'),
         Count=('Top Production Country', 'count'))

In [None]:
plt.figure(figsize = (14,12))
plt.rcParams.update({
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
})
# Restrict to numeric columns: pandas >= 2.0 raises when corr() meets string columns.
sns.heatmap(df_train.select_dtypes(include='number').corr(), annot=True, cmap="RdYlGn")

plt.show()

In [None]:
best_beans_list = ['Criollo', 'Trinitario']

def bean_in_list(bean_type, best_beans=best_beans_list):
    """Return 1 if any name from `best_beans` occurs inside `bean_type`, else 0.

    Matching is by substring, so a combined value such as
    'Criollo, Trinitario' also counts.
    """
    return 1 if any(name in bean_type for name in best_beans) else 0

In [None]:
# 1 when the bean type mentions one of `best_beans_list`, else 0.
df_train['Best Bean Type'] = df_train['Bean Type'].apply(lambda x: bean_in_list(x, best_beans_list))
# Use the test frame's own column (it was previously derived from df_train).
# Missing values are filled first because `bean_in_list` does substring matching
# and would fail on a NaN.
df_test['Best Bean Type'] = df_test['Bean Type'].fillna('Unknown').apply(lambda x: bean_in_list(x, best_beans_list))

In [None]:
df_train.groupby('Best Bean Type')['Rating'].mean()

In [None]:
best_beans_countries = ['Ecuador', 'Ivory Coast']

df_train['Broad Bean Origin'].apply(lambda x: int(any([i in x for i in best_beans_countries])))

In [None]:
df_train_short = df_train.drop(['Company', 'Specific Bean Origin'], axis=1)
df_test_short = df_test.drop(['Company', 'Specific Bean Origin'], axis=1)

## 3.2 Encoding categorical features

In [None]:
print('Number of unique values in categorical columns')

for col in df_train_short.columns:
    if df_train_short[col].dtype == 'O':
        print(f'{col}: {df_train_short[col].nunique()}')

In [None]:
XC = df_train_short.apply(lambda x: x.astype("category") if x.dtype == "object" else x)
cramersv = am.CramersV(XC)
cramersv.fit()

## 3.3 Write target encoder

In [None]:
class MyTargetEncoder():
    """Mean-target encoder for categorical columns.

    For every configured column, each category is replaced by the mean of the
    target over the training rows with that category. Categories that were not
    seen during `fit` are replaced by the overall target mean.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to encode.
    verbose : bool, default True
        When True, print the learned encodings and unseen categories
        (the original behaviour). Pass False to keep the output quiet.

    Attributes
    ----------
    dict_cols : dict
        column name -> {category: target mean, ..., 'Mean': overall target mean}.
    """

    def __init__(self, columns, verbose=True):
        self.columns = columns
        self.verbose = verbose
        self.dict_cols = {k: dict() for k in self.columns}
        self._log(", ".join(self.columns))

    def _log(self, *args):
        """Print `args` only when verbose output is enabled."""
        if self.verbose:
            print(*args)

    def fit(self, X, y):
        """Learn the per-category target means from `X` and `y`.

        Each call rebuilds the mappings from scratch, so categories from an
        earlier fit do not linger. Returns `self` to allow call chaining.
        """
        overall_mean = y.mean()
        for col in self.columns:
            self._log('-'*50, '\n', col, '\n', '-'*50)
            mapping = {}
            for i in X[col].unique():
                mapping[i] = y[X[col]==i].mean()
                self._log(f'{i}: {mapping[i]}')
            # Fallback value for categories that are unseen at transform time.
            mapping['Mean'] = overall_mean
            self._log('Mean:', overall_mean)
            self.dict_cols[col] = mapping
        return self

    def transform(self, X):
        """Return a copy of `X` with the configured columns target-encoded.

        The input frame is left untouched; encoded columns come back as floats.
        """
        X = X.copy()
        for col in self.columns:
            self._log(col)
            mapping = self.dict_cols[col]
            values_not_avaliable = X.loc[~X[col].isin(list(mapping)), col].unique()
            self._log('Not avliable:', values_not_avaliable)
            # map() yields NaN for unseen categories; those get the overall mean.
            X[col] = X[col].map(mapping).fillna(mapping['Mean']).astype(float)
        return X

In [None]:
# Text (object-dtype) columns of the shortened train frame; these get target-encoded.
# (`df_train_s` was an undefined name and raised a NameError.)
obj_cols = df_train_short.columns[df_train_short.dtypes == object]
obj_cols

In [None]:
my_encoder = MyTargetEncoder(obj_cols)
my_encoder.fit(df_train_short.drop('Rating', axis=1), df_train_short['Rating'])
df_train_new = my_encoder.transform(df_train_short.drop('Rating', axis=1))
df_train_new['Rating'] = df_train_short['Rating']
df_train_new.head()

In [None]:
df_test_new = df_test_short.copy()
df_test_new.head()

In [None]:
df_test_new = my_encoder.transform(df_test_new[:])

In [None]:
df_test_new.head(10)

In [None]:
df_test_new['Company Location'].value_counts()

# 4. Building models

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

## 4.1 Decision Tree Regressor

In [None]:
X = df_train_new.drop('Rating', axis=1)
y = df_train_new['Rating']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree can give different metrics on every run.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)

y_pred = model_tree.predict(X_test)

print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 score:', r2_score(y_test, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored exhaustively by the search below.
decision_tree_params = {'max_depth': np.arange(2, 10),
                        'min_samples_leaf': np.arange(2, 20),
                        'min_samples_split': np.arange(2, 40)}

# The estimator is seeded so the selected parameters and score are reproducible.
gs_tree = GridSearchCV(DecisionTreeRegressor(random_state=42),
                       decision_tree_params,
                       scoring='r2',
                       cv=3,
                       n_jobs=-1)

gs_tree.fit(X_train, y_train)
print(gs_tree.best_params_)
print(gs_tree.best_score_)

In [None]:
model_tree = gs_tree.best_estimator_

pd.DataFrame({'Feature': model_tree.feature_names_in_, 'Importance': model_tree.feature_importances_})\
    .sort_values(by='Importance', ascending=False)

In [None]:
model_tree.fit(X, y)

In [None]:
pred = model_tree.predict(df_test_new)

In [None]:
# Build the submission in its own frame. Adding 'id' and 'Rating' columns to
# df_test_new would change the feature matrix and make a re-run of the
# prediction cell fail.
submission = pd.DataFrame({'id': np.arange(len(df_test)), 'Rating': pred})

submission[['id','Rating']].to_csv("tree_bad_submission.csv", index=False)

In [None]:
pd.read_csv('tree_bad_submission.csv').head(20)

## 5.1 Decision tree test transformations

In [None]:
os.listdir()

In [None]:
pd.read_csv('choco_sample_submission.csv')

In [None]:
pd.read_csv('chocolate_test_new.csv')