In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('E:\Sem6\Machine Learning\Project\pokemon_alopez247.csv')

In [4]:
df.head()

Unnamed: 0,Number,Name,Type_1,Type_2,Total,HP,Attack,Defense,Sp_Atk,Sp_Def,...,Color,hasGender,Pr_Male,Egg_Group_1,Egg_Group_2,hasMegaEvolution,Height_m,Weight_kg,Catch_Rate,Body_Style
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,...,Green,True,0.875,Monster,Grass,False,0.71,6.9,45,quadruped
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,...,Green,True,0.875,Monster,Grass,False,0.99,13.0,45,quadruped
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,...,Green,True,0.875,Monster,Grass,True,2.01,100.0,45,quadruped
3,4,Charmander,Fire,,309,39,52,43,60,50,...,Red,True,0.875,Monster,Dragon,False,0.61,8.5,45,bipedal_tailed
4,5,Charmeleon,Fire,,405,58,64,58,80,65,...,Red,True,0.875,Monster,Dragon,False,1.09,19.0,45,bipedal_tailed


In [5]:
for col in df.columns:
    if df[col].dtype == int:
        df[col] = df[col].astype(float)

In [6]:
# we want to predict if pokemon is legendary or not
df['isLegendary'].value_counts()

False    675
True      46
Name: isLegendary, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
def get_arrays(df):
    X = np.array(df[['HP', 'Attack', 'Defense', 'Sp_Atk', 'Sp_Def']])
    y = np.array(df['isLegendary'])
    
    return X, y

X_train, y_train = get_arrays(df_train)
X_test, y_test = get_arrays(df_test)

X_train.shape, y_train.shape

((576, 5), (576,))

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)

In [10]:
model = pipeline.fit(X_train, y_train)
model.predict(X_train)[:5]




array([False, False, False, False, False], dtype=bool)

In [11]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train, model.predict(X_train))



array([[532,   6],
       [ 21,  17]])

In [12]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train, model.predict(X_train))



0.71810800234787719

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

def cross_validate_auc(pipeline, X_train, y_train):
    results = cross_val_score(
        pipeline,
        X_train,
        y_train,
        scoring=make_scorer(roc_auc_score),
        cv=10,
    )

    return np.mean(results)
    
cross_validate_auc(pipeline, X_train, y_train)



0.71526030747728864

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class PandasSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, selected_columns):
        self.selected_columns = selected_columns
    
    def fit(self, df, *args):
        return self

    def transform(self, df):
        return np.array(df[self.selected_columns])

In [15]:
pipeline = make_pipeline(
    PandasSelector(['HP', 'Attack', 'Defense', 'Sp_Atk', 'Sp_Def']),
    StandardScaler(),
    LogisticRegression()
)

model = pipeline.fit(df_train, y_train)
model.predict(df_train)[:5]



array([False, False, False, False, False], dtype=bool)

In [16]:
class StringConverter(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.map = {} # column : string : int
    
    def fit(self, X, *args):
        for col in range(X.shape[1]):
            self.map[col] = {}
            idx = 1
            for row in range(X.shape[0]):
                s = X[row, col]
                if s not in self.map[col]:
                    self.map[col][s] = idx
                    idx += 1
        return self

    def transform(self, X):
        X_int = np.zeros(shape=X.shape)
        for col in range(X.shape[1]):
            for row in range(X.shape[0]):
                s = X[row, col]
                X_int[row, col] = self.map[col].get(s, 0)

        return X_int

In [19]:
np.mean(df_train['Pr_Male'].isnull())

0.1076388888888889

In [22]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__class_weight': [None, 'balanced'],
}

grid = GridSearchCV(
    pipeline,
    parameters,
    scoring=make_scorer(roc_auc_score),
).fit(df_train, y_train)

print('Best params: {}'.format(grid.best_params_))
print('Best AUC: {:.3f}'.format(grid.best_score_))

final_model = grid.best_estimator_



Best params: {'logisticregression__C': 1, 'logisticregression__class_weight': 'balanced'}
Best AUC: 0.952




In [23]:
roc_auc_score(y_test, final_model.predict(df_test))



0.90100364963503643

In [24]:
from sklearn.externals import joblib

joblib.dump(final_model, 'final_model.pkl');

In [27]:
loaded_model = joblib.load('final_model.pkl')
roc_auc_score(y_test, final_model.predict(df_test))




0.90100364963503643