In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

poke = pd.read_csv('Datacamp CSV/Pokemon_Machine_Learning.csv', index_col = 0)

# Get dummies (in a real world situation, have to do this separately for the final test)
poke = pd.get_dummies(poke, columns = ['Ability'], drop_first = True)

# Split into final test and initial train set
holdout = poke.loc[:150, :]
training = poke[~poke['name'].isin(holdout['name'])]

In [2]:
from sklearn.model_selection import train_test_split

# Get target variable
X = training.drop(['name', 'type'], axis = 1)
y = training['type']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, 
#                                                     stratify = y, random_state = 42)

In [111]:
from sklearn.preprocessing import FunctionTransformer

# Dummy columnes
dummy_col = training.iloc[:, 3:].columns

# Tested - works
get_text_data = FunctionTransformer(lambda x: x['Pokedex Entry'])
get_dummies_data = FunctionTransformer(lambda x: x[dummy_col])

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC, SVC

alphanumeric = '[A-Za-z0-9]+(?=\\s+)'

dummy_features = Pipeline([('selector', get_dummies_data)])

text_features = Pipeline([('selector', get_text_data), 
                          ('vectorizer', CountVectorizer(token_pattern = alphanumeric,
                                                         stop_words = 'english')), 
                          ('transformer', TfidfTransformer())])

# Combine the two arrays into a single function
feature_processing = FeatureUnion(transformer_list = [('dummy_features', dummy_features),
                                                     ('text_features', text_features)])

# Full pipeline
pipeline = Pipeline([
    ('union', feature_processing),
    ('clf', LogisticRegression(C = 1.0, solver = 'liblinear'))    
    # Using multinomial instead of ovr doesn't seem to make much difference
    # Has a slightly lower accuracy but slightly lower log_loss
    # ('clf', LogisticRegression(C = 1.0, multi_class = 'multinomial'))
    
    # Default SVC gives the lowest logloss, but takes the longest time
    # ('clf', SVC(gamma = 0.1, probability = True))
    
    # SGDClassifier is similar to both LR and SVM, but scales much better to bigger datasets
    # ('clf', SGDClassifier(loss = 'log', alpha = 0.001, penalty = 'l2'))
    ])

pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('dummy_features',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  FunctionTransformer(accept_sparse=False,
                                                                                      check_inverse=True,
                                                                                      func=<function <lambda> at 0x00000000131CE828>,
                                                                                      inv_kw_args=None,
                                                                                      inverse_func=None,
                                                                                      kw_args=None,
                                     

In [112]:
predictions = pipeline.predict(holdout.drop(['name', 'type'], axis = 1))

# # this is all the pokemon
# # This dataset has some problems with types - not great
# pokemon = pd.read_csv('pokedex_entries.csv')
# pokemon = pokemon[['name', 'type1', 'type2']]
# # Select the holdout set of pokemon
# pokemon = pokemon.loc[:150, :]

pokemon = pd.read_csv('Datacamp CSV/pokedex_(Update_05.20).csv', index_col = 0)
pokemon = pd.merge(pokemon, holdout, how = 'right')[['name', 'type_1', 'type_2']]
pokemon['type_1'] = pokemon['type_1'].str.lower()
pokemon['type_2'] = pokemon['type_2'].str.lower()
pokemon['Predicted Type'] = predictions
pokemon = pokemon.rename(columns = {'type_1':'type1', 'type_2':'type2'})

# from sklearn.metrics import log_loss
# predict_proba = pipeline.predict_proba(holdout.drop(['name', 'type'], axis = 1))
# print('Log loss: {:.3f}'.format(log_loss(pokemon['type1'], predict_proba, 
#                                          labels = pipeline['clf'].classes_)))

# Log_loss to beat is around 2.84
# We're doing VERY well compared to random guessing
# Note however, that this logloss is only calculated on the first type.
# So our higher accuracy may not correspond correctly to the log_loss

# LogisticRegression C = 1.0, liblinear, ovr --> 1.770
# LogisticRegression C = 1.0, lbfgs, multinomial --> 1.740
# LinearSVC --> 1.701
# SVC --> 1.698
# SVC, RBF, gamma = 0.1 --> 1.657

In [113]:
# Double counted matrix

test = pd.melt(pokemon, 
                  id_vars = ['name', 'Predicted Type'], 
                  value_vars=['type1', 'type2'], 
                  value_name = 'Actual Type').dropna()

cat_type = pd.CategoricalDtype(categories = poke['type'].unique().tolist())

test = test[['Predicted Type', 'Actual Type']].astype(cat_type)

confusion_matrix = pd.crosstab(test['Actual Type'], test['Predicted Type'], dropna = False)

%matplotlib qt
sns.set_context('notebook', font_scale=1.2)
plt.figure()
sns.heatmap(confusion_matrix, annot = True)
plt.title('Total Pokemon: 151 \n Accuracy: {:.2f}'.format(np.diag(confusion_matrix.values).sum() / 151))
plt.show()

plt.subplots_adjust(
top=0.944,
bottom=0.158,
left=0.081,
right=0.986,
hspace=0.2,
wspace=0.2
)

In [6]:
# Single counted matrix
score = pipeline.score(holdout.drop(['name', 'type'], axis = 1), holdout['type'])

cat_type = pd.CategoricalDtype(categories = poke['type'].unique().tolist())

pokemon[['type1', 'Predicted Type']] = pokemon[['type1', 'Predicted Type']].astype(cat_type)

matrix = pd.crosstab(pokemon['type1'], 
                               pokemon['Predicted Type'], 
                               rownames = ['Actual Type'], 
                               dropna = False)

%matplotlib qt
sns.set_context('notebook', font_scale=1.2)
plt.figure()
sns.heatmap(matrix, annot = True)
plt.title('Total Pokemon: {0} \n Accuracy: {1:.2f}'.format(len(holdout), np.diag(confusion_matrix.values).sum() / 151))
plt.show()

plt.subplots_adjust(
top=0.944,
bottom=0.158,
left=0.081,
right=0.986,
hspace=0.2,
wspace=0.2
)

In [82]:
from sklearn.metrics import log_loss

def calculate_log_loss(class_ratio,multi=10000):
    
    if sum(class_ratio)!=1.0:
        print("warning: Sum of ratios should be 1 for best results")
        class_ratio[-1]+=1-sum(class_ratio)  # add the residual to last class's ratio
    
    actuals=[]
    for i,val in enumerate(class_ratio):
        actuals=actuals+[i for x in range(int(val*multi))]
        

    preds=[]
    for i in range(multi - 1):
        preds+=[class_ratio]

    return (log_loss(actuals, preds))

calculate_log_loss((y.value_counts() / len(y)).values.round(2), multi = 10000)



2.8475944823179224

In [98]:
help(pipeline['clf'])

Help on SVC in module sklearn.svm._classes object:

class SVC(sklearn.svm._base.BaseSVC)
 |  SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
 |  
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time scales at least
 |  quadratically with the number of samples and may be impractical
 |  beyond tens of thousands of samples. For large datasets
 |  consider using :class:`sklearn.svm.LinearSVC` or
 |  :class:`sklearn.linear_model.SGDClassifier` instead, possibly after a
 |  :class:`sklearn.kernel_approximation.Nystroem` transformer.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `gamma`, `coef0` and `degree` affect each
 |  ot