In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# os.listdir returns entries in arbitrary order; sort them so the printed
# listing is stable from run to run.
print(sorted(os.listdir("../input")))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the Pokemon dataset from the input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../input/Pokemon.csv")

Let's check the data out. How many Pokémon are in the dataset?

In [None]:
# Number of rows (one per Pokemon entry) in the dataset.
data.shape[0]

In [None]:
# Preview the first nine rows of the raw data.
data.head(n=9)

In [None]:
# Show the column labels available in the dataset.
data.columns

So we have 13 features (columns) in our data.

We can see that not all Pokémon have dual types (e.g. Charmander), so let's replace any NaN values in the Type 2 column with the string 'None'.

In [None]:
# Single-type Pokemon have NaN in 'Type 2'; label them with the string 'None'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection: in-place fills on a selected column
# are chained assignment, which emits a FutureWarning on recent pandas and
# leaves `data` unchanged under Copy-on-Write.
data['Type 2'] = data['Type 2'].fillna('None')

Check the data to see that the NaN values have been updated.

In [None]:
# Preview the first nine rows again to inspect the 'Type 2' column.
data.head(n=9)

Nice! Let's start doing some visualisation to understand our data.

1. How much of each primary type are there?

(pandas.Series.value_counts returns object containing counts of unique values)

In [None]:
# Count how many Pokemon have each primary type and draw the counts as bars.
# The Axes is captured so the figure can be given a title and axis labels;
# the trailing semicolon keeps the return value out of the cell output.
ax = data['Type 1'].value_counts().plot.bar()
ax.set(title='Pokemon per primary type', xlabel='Type 1', ylabel='Number of Pokemon');

Woaahhh, so many water types!! We can see that there are not many primary flying types. Hmm, why don't we look at the Type 2 count too?

In [None]:
# Count how many Pokemon have each secondary type and draw the counts as bars.
# The Axes is captured so the figure can be given a title and axis labels;
# the trailing semicolon keeps the return value out of the cell output.
ax = data['Type 2'].value_counts().plot.bar()
ax.set(title='Pokemon per secondary type', xlabel='Type 2', ylabel='Number of Pokemon');

So we can see that having a second type is less common than you might expect among all the 800 Pokémon, with almost 50% having no second type at all.

In Pokémon, the Legendary Pokémon were always the coolest. Let's see how many there are.

In [None]:
# Count legendary vs. non-legendary Pokemon and draw the counts as bars.
# The Axes is captured so the figure can be given a title and axis labels;
# the trailing semicolon keeps the return value out of the cell output.
ax = data['Legendary'].value_counts().plot.bar()
ax.set(title='Legendary vs. non-legendary Pokemon', xlabel='Legendary', ylabel='Number of Pokemon');

Damn! The legendary Pokémon live up to their name of rarity, with fewer than 1/8 of Pokémon holding that status.

I wonder if any of the other features in the dataset can indicate whether a pokemon is legendary or not!

Let's use a tree-based model to tackle this problem! (The code below fits a random forest, i.e. an ensemble of decision trees.)
* Decision tree learners create biased trees if some classes dominate. It is therefore recommended to balance the dataset prior to fitting the model.

In [None]:
from sklearn.model_selection import train_test_split

# We will only use the pokemon battle stats + types to determine whether it is
# legendary or not; 'Legendary' itself is kept as the label column.
featureColumns = ['Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
                  'Sp. Atk', 'Sp. Def', 'Speed', 'Legendary']

# Split the rows by class, keeping only the selected columns.
legendaryPokemon = data.loc[data['Legendary'] == True, featureColumns]
normalPokemon = data.loc[data['Legendary'] == False, featureColumns]

# Randomly down-sample the non-legendary pokemon to reduce the class imbalance.
# A fixed random_state makes the sample the same on every run.
sampledNormalPokemon = normalPokemon.sample(100, random_state=47)

# Stack both classes and one-hot encode the string-valued type columns.
x = pd.concat([legendaryPokemon, sampledNormalPokemon])
x = pd.get_dummies(x)

# Pull out the 'Legendary' column as the training labels and drop it from the
# features. The column is addressed by name (after get_dummies it is no longer
# the last column), and `columns=` is used because passing the axis
# positionally to drop() was removed in pandas 2.0.
y = x['Legendary']
x = x.drop(columns='Legendary')

In [None]:
# One-hot encode the full (un-sampled) set of non-legendary pokemon and
# preview the first five rows.
testNormalPokemon = pd.get_dummies(data=normalPokemon)
testNormalPokemon.head(n=5)

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=47,
)

In [None]:
# Now that we have split our train/test data, let's increase the amount of
# Legendary pokemon in the training data by creating synthetic examples using
# the SMOTE algorithm. Only the training split is resampled here.
from imblearn.over_sampling import SMOTE

# A sampling ratio of 1.0 will equally balance the binary classes.
sm = SMOTE(random_state=15, sampling_strategy=1.0)
# fit_resample is the current name of this method; the older fit_sample alias
# was deprecated and later removed from imbalanced-learn.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


In [None]:

# Dimensions of the resampled training features.
X_train_res.shape

In [None]:
# Count how many resampled training labels are marked Legendary.
legendary_mask = y_train_res == True
legendary_mask.sum()

In [None]:
from sklearn.ensemble import RandomForestClassifier  # for random forest classifier

# A forest of 100 trees, each limited to a depth of 7. The fixed random_state
# makes the fitted forest (and therefore the reported accuracy, feature
# importances and any individual tree inspected later) reproducible.
model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=47)

In [None]:
# Fit the random forest classifier on the resampled training data.
model.fit(X=X_train_res, y=y_train_res)


In [None]:
# Predict labels for the held-out test set.
y_pred = model.predict(X=X_test)

In [None]:
# Accuracy metric from the sklearn.metrics library.
from sklearn.metrics import accuracy_score

# Score the model on the (resampled) data it was trained on and on the
# held-out test predictions, then report both.
train_predictions = model.predict(X_train_res)
train_accuracy = accuracy_score(y_true=y_train_res, y_pred=train_predictions)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Accuracy Score on train data: ', train_accuracy)
print('Accuracy Score on test data: ', test_accuracy)

In [None]:
# Feature importance: pair each training column with the forest's importance
# score (rounded to 3 decimals), rank from most to least important, and show
# the top ten.
importances = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': np.round(model.feature_importances_, 3),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances.head(10)

In [None]:
# Pie chart of the 'importance' column, one wedge per feature.
plot = importances.plot(kind='pie', y='importance', figsize=(10, 10))

In [None]:
import sklearn.tree
import graphviz

# Extract a single tree from the fitted forest to visualise.
estimator = model.estimators_[4]

# Export the tree as Graphviz DOT source (out_file=None returns it as a string).
# Feature names come from the training frame, matching the feature-importance
# table, and are passed as a plain list. class_names are given in ascending
# order of the class labels (False -> 'normal', True -> 'legendary').
dot_data = sklearn.tree.export_graphviz(
    estimator,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=['normal', 'legendary'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)

graph