In [None]:
# https://erlemar.github.io/
# https://github.com/Erlemar/Erlemar.github.io/blob/master/Notebooks/GGG.ipynb

In [171]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB

In [144]:
os.listdir()

['Untitled.ipynb',
 'test.csv',
 '.jovianrc',
 'train.csv',
 '.ipynb_checkpoints',
 'sample_submission.csv']

In [145]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# EDA

In [146]:
# lookin' good, ghouls
train.isna().sum()

id               0
bone_length      0
rotting_flesh    0
hair_length      0
has_soul         0
color            0
type             0
dtype: int64

In [147]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             371 non-null    int64  
 1   bone_length    371 non-null    float64
 2   rotting_flesh  371 non-null    float64
 3   hair_length    371 non-null    float64
 4   has_soul       371 non-null    float64
 5   color          371 non-null    object 
 6   type           371 non-null    object 
dtypes: float64(4), int64(1), object(2)
memory usage: 20.4+ KB


In [148]:
train.describe(include='all')

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
count,371.0,371.0,371.0,371.0,371.0,371,371
unique,,,,,,6,3
top,,,,,,white,Ghoul
freq,,,,,,137,129
mean,443.67655,0.43416,0.506848,0.529114,0.471392,,
std,263.222489,0.132833,0.146358,0.169902,0.176129,,
min,0.0,0.061032,0.095687,0.1346,0.009402,,
25%,205.5,0.340006,0.414812,0.407428,0.348002,,
50%,458.0,0.434891,0.501552,0.538642,0.466372,,
75%,678.5,0.517223,0.603977,0.647244,0.60061,,


In [149]:
train.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,4,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,5,0.566117,0.875862,0.418594,0.636438,green,Ghost


In [150]:
# Assess the mean value for each numerical column and the association with type
fig = make_subplots(rows=1, cols=4, shared_yaxes=True, subplot_titles=(train.loc[:, train.dtypes==float].columns))
num = 1

for i in train.loc[:, train.dtypes==float].columns:
    x = train.groupby('type')[i].mean().index
    y = train.groupby('type')[i].mean().values
    fig.add_trace(go.Bar(x=x, y=y),1, num)
    num += 1
    
fig.update_layout(showlegend=False, title_text="Numerical Columns per Type")

fig.show()

In [151]:
train['color'].value_counts()

white    137
clear    120
green     42
black     41
blue      19
blood     12
Name: color, dtype: int64

In [152]:
train.groupby(['color', 'type'])['id'].count()

color  type  
black  Ghost     14
       Ghoul     14
       Goblin    13
blood  Ghost      6
       Ghoul      4
       Goblin     2
blue   Ghost      6
       Ghoul      6
       Goblin     7
clear  Ghost     32
       Ghoul     42
       Goblin    46
green  Ghost     15
       Ghoul     13
       Goblin    14
white  Ghost     44
       Ghoul     50
       Goblin    43
Name: id, dtype: int64

In [153]:
train.columns[0]

'id'

In [154]:
train.drop(columns='id')

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,0.575560,0.425868,0.531401,0.439899,green,Goblin
2,0.467875,0.354330,0.811616,0.791225,black,Ghoul
3,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,0.566117,0.875862,0.418594,0.636438,green,Ghost
...,...,...,...,...,...,...
366,0.458132,0.391760,0.660590,0.635689,blue,Goblin
367,0.331936,0.564836,0.539216,0.551471,green,Ghost
368,0.481640,0.501147,0.496446,0.544003,clear,Ghoul
369,0.294943,0.771286,0.583503,0.300618,clear,Ghost


In [155]:
fig = px.scatter_matrix(train, 
                        dimensions=['bone_length', 'rotting_flesh', 'hair_length', 'has_soul'], 
                        color='type')
fig.show()

# hair length and bone length seem to have a relationship, as well as hair length and has soul

# Data Prep

In [156]:
# Save test id for later
test_id = test['id']

train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

In [157]:
col = 'color'

dummies = pd.get_dummies(train[col], drop_first=False)
dummies = dummies.add_prefix("{}_".format(col))
train.drop(col, axis=1, inplace=True)
train = train.join(dummies)

dummies = pd.get_dummies(test[col], drop_first=False)
dummies = dummies.add_prefix("{}_".format(col))
test.drop(col, axis=1, inplace=True)
test = test.join(dummies)

In [158]:
X_train = train.drop('type', axis=1)
le = LabelEncoder()
y_train = le.fit_transform(train.type.values)
X_test = test

<IPython.core.display.Javascript object>

In [159]:
clf = RandomForestClassifier(n_estimators = 200)
clf = clf.fit(X_train, y_train)
indicies = np.argsort(clf.feature_importances_)[::-1]

print('Feature Ranking')
for i in range(X_train.shape[1]):
    print('%d. feature %d %s (%f)' % (i+1, indicies[i], X_train.columns[indicies[i]], clf.feature_importances_[indicies[i]]))

<IPython.core.display.Javascript object>

Feature Ranking
1. feature 2 hair_length (0.297188)
2. feature 3 has_soul (0.269235)
3. feature 0 bone_length (0.190779)
4. feature 1 rotting_flesh (0.187409)
5. feature 7 color_clear (0.012686)
6. feature 9 color_white (0.012569)
7. feature 4 color_black (0.009953)
8. feature 8 color_green (0.009641)
9. feature 6 color_blue (0.007077)
10. feature 5 color_blood (0.003462)


In [160]:
best_feat = X_train.columns[indicies[0:4]]

X = X_train[best_feat]
Xt = X_test[best_feat]

# Train the Model

In [166]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y_train, test_size=0.20, random_state=36)

<IPython.core.display.Javascript object>

In [177]:
forest = RandomForestClassifier()

parameter_grid = {'n_estimators' : [10,20,150], 
                  'criterion' : ['gini', 'entropy'], 
                  'max_features' : ['auto', 'sqrt', 'log2'], 
                  'max_depth' : [None, 5, 100],
                  'min_samples_split' : [2, 5, 7],
                  'min_weight_fraction_leaf' : [0.0, 0.1],
                  'max_leaf_nodes' : [40, 80],
                 }

grid_search = GridSearchCV(forest, param_grid = parameter_grid, scoring='accuracy', cv=StratifiedKFold(5))
grid_search.fit(X, y_train)
print('Best Score: {}'.format(grid_search.best_score_))
print('Best Parameters: {}'.format(grid_search.best_params_))

Best Score: 0.7464864864864865
Best Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'max_leaf_nodes': 80, 'min_samples_split': 7, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 20}


In [188]:
clf = RandomForestClassifier(n_estimators = 20, 
                  criterion = 'gini', 
                  max_features = 'log2', 
                  max_depth = None,
                  min_samples_split = 7,
                  min_weight_fraction_leaf = 0.0,
                  max_leaf_nodes = 80)

calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
calibrated_clf.fit(Xtrain, ytrain)
y_val = calibrated_clf.predict_proba(Xtest)

print("Validation accuracy: ", sum(pd.DataFrame(y_val, columns=le.classes_).idxmax(axis=1).values
                                   == le.inverse_transform(ytest))/len(ytest))

Validation accuracy:  0.72


In [189]:
svc = svm.SVC(kernel='linear')
svc.fit(Xtrain, ytrain)
y_val_s = svc.predict(Xtest)
print("Validation accuracy: ", sum(le.inverse_transform(y_val_s) == le.inverse_transform(ytest))/len(ytest))

Validation accuracy:  0.7466666666666667


In [190]:
#The last model is logistic regression
logreg = LogisticRegression()

parameter_grid = {'solver' : ['newton-cg', 'lbfgs'],
                  'multi_class' : ['ovr', 'multinomial'],
                  'C' : [0.005, 0.01, 1, 10, 100, 1000],
                  'tol': [0.0001, 0.001, 0.005]
                 }

grid_search = GridSearchCV(logreg, param_grid=parameter_grid, cv=StratifiedKFold(5))
grid_search.fit(Xtrain, ytrain)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.7632768361581921
Best parameters: {'C': 1, 'multi_class': 'multinomial', 'solver': 'newton-cg', 'tol': 0.0001}


In [191]:
log_reg = LogisticRegression(C = 1, tol = 0.0001, solver='newton-cg', multi_class='multinomial')
log_reg.fit(Xtrain, ytrain)
y_val_l = log_reg.predict_proba(Xtest)
print("Validation accuracy: ", sum(pd.DataFrame(y_val_l, columns=le.classes_).idxmax(axis=1).values
                              == le.inverse_transform(ytest))/len(ytest))


Validation accuracy:  0.76


In [192]:
clf = RandomForestClassifier(n_estimators=20, n_jobs=-1, criterion = 'gini', max_features = 'sqrt',
                             min_samples_split=2, min_weight_fraction_leaf=0.0,
                             max_leaf_nodes=40, max_depth=100)

calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
log_reg = LogisticRegression(C = 1, tol = 0.0001, solver='newton-cg', multi_class='multinomial')
gnb = GaussianNB()

In [193]:
calibrated_clf1 = CalibratedClassifierCV(RandomForestClassifier())
log_reg1 = LogisticRegression()
gnb1 = GaussianNB()

In [194]:
Vclf1 = VotingClassifier(estimators=[('LR', log_reg1), ('CRF', calibrated_clf1),
                                     ('GNB', gnb1)], voting='hard')
Vclf = VotingClassifier(estimators=[('LR', log_reg), ('CRF', calibrated_clf),
                                     ('GNB', gnb)], voting='soft', weights=[1,1,1])

In [196]:
hard_predict = le.inverse_transform(Vclf1.fit(X, y_train).predict(Xt))
soft_predict = le.inverse_transform(Vclf.fit(X, y_train).predict(Xt))

In [197]:
#Let's see the differences:
for i in range(len(hard_predict)):
    if hard_predict[i] != soft_predict[i]:
        print(i, hard_predict[i], soft_predict[i])

24 Ghoul Goblin
57 Ghoul Goblin
76 Ghoul Goblin
100 Ghoul Goblin
102 Ghoul Goblin
120 Ghoul Goblin
134 Ghost Goblin
150 Ghoul Goblin
162 Goblin Ghoul
204 Ghoul Goblin
238 Goblin Ghoul
258 Goblin Ghost
293 Ghoul Goblin
342 Ghoul Goblin
347 Ghoul Goblin
367 Ghoul Goblin
390 Ghoul Goblin
452 Ghoul Goblin
453 Ghoul Goblin
