# Ricky's Final Project - Part 4

### Pokedex Upgrade - Predicting Catch Rates and Legendary Status of Unknown Pokemon

**Data**

Saved previous dataframe from part 3 as a new CSV file and pulled that file in to start for project 4


In [817]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

%matplotlib inline

plt.style.use('ggplot')

In [818]:
# Imported cleaned up data from part 3
poke_data = pd.read_csv(os.path.join('.', 'Revised_Pokemon_Data.csv'), index_col = 0)

In [819]:
""""To run Random Forest I needed to clean up the data further and created additional features to use 
for modeling such as if Pokemon has a second categorical"""

# Additional feature: to use for modeling such as if Pokemon has a second categorical
def has_2nd_type(text_in):
    try:
        if ' ' in text_in:
            return 0
        else:
            return 1
    except: 
        return 0
        
poke_data['2nd_Type'] = poke_data['Type_2'].map(has_2nd_type)

# Additional features: Sum of their offensive and defensive stats
poke_data['Offensive_Stats'] = poke_data['Attack'] + poke_data['Sp_Atk'] + poke_data['Speed']
poke_data['Defensive_Stats'] = poke_data['HP'] + poke_data['Defense'] + poke_data['Sp_Def']

# Fixing up a the Pr_Male field
poke_data['Pr_Male'].fillna(value=0, inplace=True) 
del poke_data['Type_2']

# Used on hot coding to convert body type and type to dummy variables
poke_data2 = pd.concat([poke_data, (pd.get_dummies(poke_data['Body_Style'], prefix='Body Type', drop_first=True))], axis=1)
poke_data2 = pd.concat([poke_data2, (pd.get_dummies(poke_data['Type_1'], prefix='Type', drop_first=True))], axis=1)
poke_data3 = poke_data2.copy()

print [[poke_data.head()]]
print "===================================================================================================="
print [[poke_data2.head()]]

[[       Type_1  Total  HP  Attack  Defense  Sp_Atk  Sp_Def  Speed  Color  \
Number                                                                    
1       Grass    318  45      49       49      65      65     45  Green   
2       Grass    405  60      62       63      80      80     60  Green   
3       Grass    525  80      82       83     100     100     80  Green   
4        Fire    309  39      52       43      60      50     65    Red   
5        Fire    405  58      64       58      80      65     80    Red   

        Pr_Male Egg_Group_1  Height_m  Weight_kg  Catch_Rate      Body_Style  \
Number                                                                         
1         0.875     Monster      0.71        6.9          45       quadruped   
2         0.875     Monster      0.99       13.0          45       quadruped   
3         0.875     Monster      2.01      100.0          45       quadruped   
4         0.875     Monster      0.61        8.5          45  bipedal_ta

In [820]:
X_feat_test = poke_data2
y_feat_test = poke_data2['Legendary_True']
remove = ['Legendary_True', 'Body_Style', 'Color', 'Egg_Group_1', 'Type_1']
X_feat_test.drop(remove, axis = 1, inplace=True)


In [821]:
#Run Feature_Importance from RF
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 20)
    
model.fit(X_feat_test, y_feat_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [822]:
features = X_feat_test.columns
feature_importances = model.feature_importances_

features_df = pd.DataFrame({'Features': features, 'Importance Score': feature_importances})
features_df.sort_values('Importance Score', inplace=True, ascending=False)

important_feat = features_df.head(100)
important_feat

Unnamed: 0,Features,Importance Score
0,Total,0.26735
14,Offensive_Stats,0.136424
10,Catch_Rate,0.11175
11,Gender_True,0.095178
9,Weight_kg,0.081764
15,Defensive_Stats,0.043611
1,HP,0.041525
4,Sp_Atk,0.041114
2,Attack,0.035283
8,Height_m,0.028573


In [823]:
# It appears that both Type and Body Type doesn't play a big role in regards to predicting a legendary classification. This
# aligns with what I learn in earlier EDA. I will remove those from my random forest to help save time when running.
poke_data.head(1)

Unnamed: 0_level_0,Type_1,Total,HP,Attack,Defense,Sp_Atk,Sp_Def,Speed,Color,Pr_Male,Egg_Group_1,Height_m,Weight_kg,Catch_Rate,Body_Style,Gender_True,Legendary_True,hasMegaEvolution_True,2nd_Type,Offensive_Stats,Defensive_Stats
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Grass,318,45,49,49,65,65,45,Green,0.875,Monster,0.71,6.9,45,quadruped,1.0,0.0,0.0,1,159,159


In [824]:
X = poke_data
y = poke_data['Legendary_True']
remove = ['Legendary_True', 'Body_Style', 'Color', 'Egg_Group_1', 'Type_1']
X.drop(remove, axis = 1, inplace=True)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 20)
model.fit(X, y)

features = X.columns
feature_importances = model.feature_importances_

features_df = pd.DataFrame({'Features': features, 'Importance Score': feature_importances})
features_df.sort_values('Importance Score', inplace=True, ascending=False)

important_feat = features_df.head(100)
important_feat

Unnamed: 0,Features,Importance Score
10,Catch_Rate,0.369571
0,Total,0.142718
14,Offensive_Stats,0.106072
8,Height_m,0.074602
11,Gender_True,0.069429
9,Weight_kg,0.046288
15,Defensive_Stats,0.044971
4,Sp_Atk,0.039785
2,Attack,0.032586
7,Pr_Male,0.026009


In [825]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

In [826]:
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=1)

In [827]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
# transform our training features
X_train_std = stdsc.fit_transform(X_train)
# transform the testing features in the same way
X_test_std = stdsc.transform(X_test)

In [828]:
from sklearn.model_selection import GridSearchCV
rf_model = RandomForestClassifier()        

params = {'n_estimators': [10,20,30,40,50], 'max_depth': range(3,5)}
clf = GridSearchCV(rf_model, params, cv=cv, scoring='f1_macro')
output = clf.fit(X_train, y_train)

In [829]:
clf.best_params_

{'max_depth': 3, 'n_estimators': 10}

In [830]:
clf.best_score_

0.93533969629982994

In [831]:
best_rf = clf.best_estimator_
print best_rf
print rf_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [832]:
###best_log.score(X_test_std, y_test)

In [833]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(best_rf, X_train_std, y_train, cv=10, scoring='accuracy')
print('CV Accuracy {}, Average Accuracy {}'.format(scores, scores.mean()))
scores

CV Accuracy [ 0.98181818  0.94545455  0.96363636  1.          0.98148148  0.98148148
  1.          1.          0.96226415  1.        ], Average Accuracy 0.981613620482


array([ 0.98181818,  0.94545455,  0.96363636,  1.        ,  0.98148148,
        0.98148148,  1.        ,  1.        ,  0.96226415,  1.        ])

In [834]:
# Null accuracy.....How do you find the null f1 Score
1-y.mean()

0.9361997226074896

In [835]:
scores_f1 = cross_val_score(best_rf, X_train_std, y_train, cv=10, scoring='f1_macro')
print('CV F1 Score {}, Average F1 Score {}'.format(scores_f1, scores_f1.mean()))

CV F1 Score [ 1.          0.89        0.82371795  1.          0.92362093  0.89514563
  0.92352092  0.92352092  0.92352092  0.8950495 ], Average F1 Score 0.919809678882


In [836]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
pred = output.predict(X_test)
score1 = accuracy_score(y_test,pred)
print score1
score2 = f1_score(y_test,pred)
print score2

0.994475138122
0.96


**REMOVING STATS FROM THE PICTURE**

In [837]:
poke_data3.head()

Unnamed: 0_level_0,Type_1,Total,HP,Attack,Defense,Sp_Atk,Sp_Def,Speed,Color,Pr_Male,Egg_Group_1,Height_m,Weight_kg,Catch_Rate,Body_Style,Gender_True,Legendary_True,hasMegaEvolution_True,2nd_Type,Offensive_Stats,Defensive_Stats,Body Type_bipedal_tailless,Body Type_four_wings,Body Type_head_arms,Body Type_head_base,Body Type_head_legs,Body Type_head_only,Body Type_insectoid,Body Type_multiple_bodies,Body Type_quadruped,Body Type_serpentine_body,Body Type_several_limbs,Body Type_two_wings,Body Type_with_fins,Type_Dark,Type_Dragon,Type_Electric,Type_Fairy,Type_Fighting,Type_Fire,Type_Flying,Type_Ghost,Type_Grass,Type_Ground,Type_Ice,Type_Normal,Type_Poison,Type_Psychic,Type_Rock,Type_Steel,Type_Water
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
1,Grass,318,45,49,49,65,65,45,Green,0.875,Monster,0.71,6.9,45,quadruped,1.0,0.0,0.0,1,159,159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Grass,405,60,62,63,80,80,60,Green,0.875,Monster,0.99,13.0,45,quadruped,1.0,0.0,0.0,1,202,203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Grass,525,80,82,83,100,100,80,Green,0.875,Monster,2.01,100.0,45,quadruped,1.0,0.0,1.0,1,262,263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Fire,309,39,52,43,60,50,65,Red,0.875,Monster,0.61,8.5,45,bipedal_tailed,1.0,0.0,0.0,0,177,132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Fire,405,58,64,58,80,65,80,Red,0.875,Monster,1.09,19.0,45,bipedal_tailed,1.0,0.0,0.0,0,224,181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [838]:
remove_stats = ['Total','HP','Attack','Defense','Sp_Atk','Sp_Def','Speed','2nd_Type','Offensive_Stats','Defensive_Stats',
                'hasMegaEvolution_True','Catch_Rate','Type_1','Egg_Group_1','Body_Style','Color','Pr_Male']
poke_data3.drop(remove_stats, axis=1, inplace=True)


In [839]:
poke_data3.head()

Unnamed: 0_level_0,Height_m,Weight_kg,Gender_True,Legendary_True,Body Type_bipedal_tailless,Body Type_four_wings,Body Type_head_arms,Body Type_head_base,Body Type_head_legs,Body Type_head_only,Body Type_insectoid,Body Type_multiple_bodies,Body Type_quadruped,Body Type_serpentine_body,Body Type_several_limbs,Body Type_two_wings,Body Type_with_fins,Type_Dark,Type_Dragon,Type_Electric,Type_Fairy,Type_Fighting,Type_Fire,Type_Flying,Type_Ghost,Type_Grass,Type_Ground,Type_Ice,Type_Normal,Type_Poison,Type_Psychic,Type_Rock,Type_Steel,Type_Water
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
1,0.71,6.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.99,13.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.01,100.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.61,8.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.09,19.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [840]:
X2 = poke_data3
y2 = poke_data3['Legendary_True']
remove2 = ['Legendary_True']
X2.drop(remove2, axis = 1, inplace=True)

from sklearn.ensemble import RandomForestClassifier
model2 = RandomForestClassifier(n_estimators = 20)
model.fit(X2, y2)

features = X2.columns
feature_importances = model.feature_importances_

features_df = pd.DataFrame({'Features': features, 'Importance Score': feature_importances})
features_df.sort_values('Importance Score', inplace=True, ascending=False)

important_feat = features_df.head(100)
important_feat2 = important_feat.iloc[0:-20]
important_feat

Unnamed: 0,Features,Importance Score
2,Gender_True,0.259153
1,Weight_kg,0.245525
0,Height_m,0.218118
17,Type_Dragon,0.029346
5,Body Type_head_arms,0.028064
11,Body Type_quadruped,0.022564
14,Body Type_two_wings,0.020819
8,Body Type_head_only,0.016431
3,Body Type_bipedal_tailless,0.015996
29,Type_Psychic,0.015692


In [846]:
features = important_feat2.iloc[:,0]

2                    Gender_True
1                      Weight_kg
0                       Height_m
17                   Type_Dragon
5            Body Type_head_arms
11           Body Type_quadruped
14           Body Type_two_wings
8            Body Type_head_only
3     Body Type_bipedal_tailless
29                  Type_Psychic
32                    Type_Water
26                      Type_Ice
25                   Type_Ground
Name: Features, dtype: object

In [849]:
X2_features = X2[features]
X2_features.head()

Unnamed: 0_level_0,Height_m,Weight_kg,Gender_True,Body Type_bipedal_tailless,Body Type_four_wings,Body Type_head_arms,Body Type_head_base,Body Type_head_legs,Body Type_head_only,Body Type_insectoid,Body Type_multiple_bodies,Body Type_quadruped,Body Type_serpentine_body,Body Type_several_limbs,Body Type_two_wings,Body Type_with_fins,Type_Dark,Type_Dragon,Type_Electric,Type_Fairy,Type_Fighting,Type_Fire,Type_Flying,Type_Ghost,Type_Grass,Type_Ground,Type_Ice,Type_Normal,Type_Poison,Type_Psychic,Type_Rock,Type_Steel,Type_Water
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
1,0.71,6.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.99,13.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.01,100.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.61,8.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.09,19.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [850]:
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2_features, y2, stratify=y2, random_state=1)

In [851]:
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=1)

In [852]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
# transform our training features
X_train_std2 = stdsc.fit_transform(X_train2)
# transform the testing features in the same way
X_test_std2 = stdsc.transform(X_test2)

In [853]:
from sklearn.model_selection import GridSearchCV
rf_model2 = RandomForestClassifier()        

params = {'n_estimators': [80,90,100], 'max_depth': range(3,6), 'max_features': (5,15)}
clf2 = GridSearchCV(rf_model2, params, cv=cv, scoring='f1_macro')
output2 = clf2.fit(X_train_std2, y_train2)

In [854]:
output2

GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=1, test_size=0.3, train_size=None),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [80, 90, 100], 'max_features': (5, 15), 'max_depth': [3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1_macro', verbose=0)

In [855]:
clf2.best_params_


{'max_depth': 3, 'max_features': 15, 'n_estimators': 80}

In [856]:
clf2.best_score_

0.84333227414202405

In [857]:
best_params = clf2.best_estimator_

In [858]:
scores1 = cross_val_score(best_params, X_train_std2, y_train2, cv=10, scoring='f1_macro')
print('CV Accuracy {}, Average Accuracy {}'.format(scores1, scores1.mean()))
scores

CV Accuracy [ 0.82371795  0.82371795  0.92371706  0.92371706  0.74038462  0.89514563
  1.          0.82333333  0.74019608  0.74019608], Average Accuracy 0.843412575336


array([ 0.98181818,  0.94545455,  0.96363636,  1.        ,  0.98148148,
        0.98148148,  1.        ,  1.        ,  0.96226415,  1.        ])

In [859]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
pred2 = output2.predict(X_test_std2)
score3 = accuracy_score(y_test2,pred2)
print score3
score4 = f1_score(y_test2,pred2)
print score4

0.983425414365
0.857142857143


**Logistic Reg**

In [716]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
logreg = LogisticRegression(penalty='l2', C=10)      

params2 = {'penalty': ['l1'], 'C': range(1,10), 'class_weight': [None, 'balanced']}
logit = GridSearchCV(logreg, params2, cv=cv, scoring='f1_macro')
output3 = logit.fit(X_train_std2, y_train2)

In [717]:
logit.best_params_

{'C': 1, 'class_weight': None, 'penalty': 'l1'}

In [718]:
best_logit_param = logit.best_estimator_

In [719]:
logit.best_score_

0.80564415569949466

In [720]:
scores_log = cross_val_score(best_logit_param, X_train_std2, y_train2, cv=10, scoring='f1_macro')
print('CV Accuracy {}, Average F1 Score {}'.format(scores_log, scores_log.mean()))
scores_log

CV Accuracy [ 0.82371795  0.73039216  0.92371706  0.81848185  0.7708628   0.74038462
  0.81786942  0.82333333  0.82333333  0.74019608], Average F1 Score 0.801228859026


array([ 0.82371795,  0.73039216,  0.92371706,  0.81848185,  0.7708628 ,
        0.74038462,  0.81786942,  0.82333333,  0.82333333,  0.74019608])

In [721]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
pred3 = output3.predict(X_test_std2)
score5 = accuracy_score(y_test2,pred3)
print score5
score6 = f1_score(y_test2,pred3)
print score6

0.977900552486
0.8


In [722]:
best_logit_param.coef_

array([[ 0.25179182,  0.50097449, -1.38777341,  0.23208932,  0.        ,
         0.33555328, -0.19927519, -0.09561123, -0.02034407,  0.        ,
        -0.46163569,  0.46642182, -0.02527748,  0.        ,  0.54934573,
         0.        ,  0.20666204,  0.59725111,  0.        ,  0.        ,
         0.        ,  0.47137639,  0.26200017,  0.        ,  0.29982146,
        -0.19542525, -0.01938092, -0.03195328,  0.        ,  0.3800351 ,
         0.        ,  0.        ,  0.        ]])

In [860]:
pd.DataFrame({'features': X2.columns, 'coefficients': best_logit_param.coef_[0], 'log_odds': np.exp(best_logit_param.coef_[0])})

Unnamed: 0,coefficients,features,log_odds
0,0.251792,Height_m,1.286328
1,0.500974,Weight_kg,1.650329
2,-1.387773,Gender_True,0.249631
3,0.232089,Body Type_bipedal_tailless,1.261232
4,0.0,Body Type_four_wings,1.0
5,0.335553,Body Type_head_arms,1.398714
6,-0.199275,Body Type_head_base,0.819324
7,-0.095611,Body Type_head_legs,0.908817
8,-0.020344,Body Type_head_only,0.979861
9,0.0,Body Type_insectoid,1.0
