## Playing with Pandas, Seaborn and Machine Learning with Pokemon

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Loading the Pokemon DataSet

In [None]:
# Read the raw Pokemon table from the CSV in the input directory.
pokemon_csv = '../input/Pokemon.csv'
df = pd.read_csv(pokemon_csv)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Step 1 - Fill missing (NaN) Type 2 values with the Pokemon's Type 1

In [None]:
# Fall back to the primary type wherever the secondary type is missing.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df['Type 2'] selection: the in-place call
# on a selected column is deprecated and leaves df untouched when pandas
# copy-on-write is active.
df['Type 2'] = df['Type 2'].fillna(df['Type 1'])

Step 2 - Splitting the name so that Mega forms read "Mega" followed by the Pokemon's name

In [None]:
def sep(x):
    """Return the part of ``x`` that precedes the first ``'Mega '`` marker.

    A string that does not contain ``'Mega '`` is returned unchanged.
    """
    base_name, _, _ = x.partition('Mega ')
    return base_name

In [None]:
# Keep only the text before 'Mega ' in every name (sep is passed directly,
# no wrapping lambda needed).
df['Name'] = df['Name'].map(sep)

In [None]:
# After `sep`, a row whose name equals the name in the row directly above it
# is treated as a Mega form of that Pokemon and gets a 'Mega ' prefix.
# The mask is computed once, before any name is rewritten, so a second
# consecutive duplicate is still compared against the plain base name rather
# than against an already prefixed 'Mega ...' value (the row-by-row loop
# skipped that case and left such rows with the base name).
is_mega = df['Name'].eq(df['Name'].shift())

# One .loc assignment replaces the chained df['Name'][i] = ... writes, which
# raise SettingWithCopyWarning and do not update df under copy-on-write.
df.loc[is_mega, 'Name'] = 'Mega ' + df.loc[is_mega, 'Name']

In [None]:
# Preview the frame after the name clean-up.
df.head()

Step 3 - Remove the # column and use the Pokemon's name as the index

In [None]:
# Discard the '#' column.
df = df.drop(columns='#')

In [None]:
# Use the Pokemon name as the row index.
df = df.set_index('Name')

In [None]:
# Preview the frame now that it is indexed by name.
df.head()

Let's check the best Pokemon for each stat

Max Total

In [None]:
# Rows whose Total equals the column maximum (ties included).
top_total = df['Total'].max()
df.loc[df['Total'] == top_total]

Max HP

In [None]:
# Rows whose HP equals the column maximum (ties included).
top_hp = df['HP'].max()
df.loc[df['HP'] == top_hp]

Max Attack

In [None]:
# Rows whose Attack equals the column maximum (ties included).
top_attack = df['Attack'].max()
df.loc[df['Attack'] == top_attack]

Max defense 

In [None]:
# Rows whose Defense equals the column maximum (ties included).
top_defense = df['Defense'].max()
df.loc[df['Defense'] == top_defense]

Max Sp.Atk

In [None]:
# Rows whose Sp. Atk equals the column maximum (ties included).
top_sp_atk = df['Sp. Atk'].max()
df.loc[df['Sp. Atk'] == top_sp_atk]

Max Sp.Def

In [None]:
# Rows whose Sp. Def equals the column maximum (ties included).
top_sp_def = df['Sp. Def'].max()
df.loc[df['Sp. Def'] == top_sp_def]

In [None]:
# Max Speed: rows whose Speed equals the column maximum (ties included).
df[df['Speed'] == df['Speed'].max()]

Brief statistical description

In [None]:
# Summary statistics for the frame's columns.
df.describe()

Find the Pokemon with the highest Total within each Type 1 and collect them in a DataFrame

In [None]:
# Group Total by primary type once and reuse the grouping for both lookups.
total_by_type = df.groupby('Type 1')['Total']
a = total_by_type.idxmax()   # index label (Pokemon name) of the top Total per type
b = total_by_type.max()      # the top Total per type

In [None]:
# Combine the per-type winner and its Total into one table keyed by Type 1.
c = a.to_frame('Name').assign(Total=b)
c

Sort the DataFrame by Total

In [None]:
# Ascending order: the type with the weakest top Pokemon comes first.
c.sort_values('Total')

Distribution of types

In [None]:
# Bar chart with the number of Pokemon per primary type.
plt.figure(figsize=(11, 6))
sns.set()
ax = sns.countplot(data=df, x='Type 1')
ax.set_title('Distribution of pokemon types')
# Tilt the type names so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()

Histogram of Total stats

In [None]:
# Density-normalised histogram of Total with a KDE curve on top.
# sns.distplot is deprecated; histplot with kde=True, stat='density' and
# kde_kws=dict(cut=3) is the replacement seaborn documents for it.
plt.figure(figsize=(11,6))
sns.histplot(df['Total'], kde=True, stat='density', kde_kws=dict(cut=3), color='r')
plt.show()

Let's make a boxplot to compare Total stats across Type 1

In [None]:
# Box plot of Total for every primary type.
sns.set()
plt.figure(figsize=(14, 8))
ax = sns.boxplot(data=df, x='Type 1', y='Total')
# Tilt the type names so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()

Count of Pokemon by generation

In [None]:
# Bar chart with the number of Pokemon per generation.
plt.figure(figsize=(11, 6))
sns.set()
ax = sns.countplot(data=df, x='Generation')
ax.set_title('Count of pokemon by generations')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()

Creating a matrix of correlation

In [None]:
# Annotated correlation heatmap for the columns at positions 2..8
# (presumably the numeric stat columns - verify against df.columns).
plt.figure(figsize=(8, 8))
j = df.iloc[:, 2:9]
corr_matrix = j.corr()
sns.heatmap(corr_matrix, annot=True, cmap='jet', robust=True, square=True)

## Using machine learning to determine whether a Pokemon is legendary

First, let's convert the True/False values in Legendary to integers (True becomes 0, False becomes 1)

In [None]:
def legend(x):
    """Encode a legendary flag as an integer.

    Returns 0 when ``x == True`` and 1 when ``x == False``. Note that this
    is the reverse of the usual 1-for-True convention. Any value equal to
    neither yields ``None``.
    """
    for flag, code in ((True, 0), (False, 1)):
        if x == flag:
            return code
    return None

In [None]:
# Replace the boolean flag with legend()'s integer code (True -> 0, False -> 1).
# Re-running this cell on already encoded values swaps them, because legend()
# compares with ==, and 0 == False and 1 == True.
df['Legendary'] = df['Legendary'].map(legend)

In [None]:
# Preview the frame after encoding Legendary.
df.head()

In [None]:
# Feature matrix: the first ten columns of df as a NumPy array.
X = df.iloc[:, :10].to_numpy()

In [None]:
# Target vector: the column at position 10 (presumably Legendary - verify).
y = df.iloc[:, 10].to_numpy()

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scale feature columns 2..8 into [0, 1] and write them back into X.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# the held-out folds influence the scaling.
scaler = MinMaxScaler(feature_range = (0, 1))
scaled_stats = scaler.fit_transform(X[:,2:9])
X[:,2:9] = scaled_stats

In [None]:
# Wrap the feature array in a DataFrame (integer column labels) so that
# get_dummies can be applied. This rebinds `a`, used earlier for the per-type idxmax.
a = pd.DataFrame(data=X)

Getting dummy variables for pokemon types and generation

In [None]:
# One-hot encode the columns at positions 0, 1 and 9; the remaining columns
# pass through unchanged.
b = pd.get_dummies(a, columns=[0,1,9])

# Cast the whole matrix to float once. `a` was built from an array that also
# held non-numeric columns, so without the cast the estimators would each
# have to coerce the data themselves (and the dummy columns may be bool).
X = b.to_numpy(dtype=float)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Number of folds
num_folds = 10
seed = 7

# Number of trees
num_trees = 100

# Shuffled K-fold splitter. `shuffle` must be passed by keyword: every KFold
# argument after n_splits is keyword-only in current scikit-learn, so
# KFold(num_folds, True, ...) raises TypeError there.
kfold = KFold(n_splits = num_folds, shuffle = True, random_state = seed)

# Creating the model
modelo = GradientBoostingClassifier(n_estimators = num_trees, random_state = seed)

# Cross Validation: one accuracy score per fold
resultado = cross_val_score(modelo, X, y, cv = kfold)

# Print the mean fold accuracy as a percentage
print("Accuracy: %.3f" % (resultado.mean() * 100))

Using PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Choosing attributes: variance captured by the first 4 principal components.
# random_state is fixed because PCA's default svd_solver='auto' may select a
# randomized solver, which would make the results vary between runs.
pca = PCA(n_components = 4, random_state = seed)
fit = pca.fit(X)


print("Variance: %s" % fit.explained_variance_ratio_)
print(np.sum(fit.explained_variance_ratio_))

# Total explained variance ratio for 1..24 components
# (x = number of components, p = summed ratio).
p = []
x = []
for i in range(1,25):
    pca = PCA(n_components = i, random_state = seed)
    fit = pca.fit(X)
    x.append(i)
    p.append(np.sum(fit.explained_variance_ratio_))

# `pca` is now the last model fitted in the loop (24 components), so x_pca
# has 24 columns - not the 4 components inspected at the top of this cell.
x_pca = pca.transform(X)

In [None]:
# Summed explained-variance ratio (p) against number of components (x).
plt.scatter(x, p)
plt.grid(True)
plt.show()

In [None]:
# Folds
num_folds = 10
seed = 7

# Number of trees
num_trees = 100

# Shuffled K-fold splitter. `shuffle` must be passed by keyword: every KFold
# argument after n_splits is keyword-only in current scikit-learn, so
# KFold(num_folds, True, ...) raises TypeError there.
kfold = KFold(n_splits = num_folds, shuffle = True, random_state = seed)

# model
modelo = GradientBoostingClassifier(n_estimators = num_trees, random_state = seed)

# Cross Validation on the PCA-projected features: one accuracy score per fold
resultado = cross_val_score(modelo, x_pca, y, cv = kfold)

# Printing result with PCA (mean fold accuracy as a percentage)
print("Accuracy: %.3f" % (resultado.mean() * 100))

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost classifier with the same tree count and seed as the models above.
modelo = XGBClassifier(random_state = seed, n_estimators = num_trees)

# Score the model on the one-hot encoded features, one accuracy per fold.
resultado = cross_val_score(estimator = modelo, X = X, y = y, cv = kfold)

# Report the mean fold accuracy as a percentage.
print("Acurácia: %.3f" % (100 * resultado.mean()))

In [None]:
# XGBoost classifier with the same tree count and seed as the models above.
modelo = XGBClassifier(random_state = seed, n_estimators = num_trees)

# Score the model on the PCA-projected features, one accuracy per fold.
resultado = cross_val_score(estimator = modelo, X = x_pca, y = y, cv = kfold)

# Report the mean fold accuracy (with PCA) as a percentage.
print("Accuracy: %.3f" % (100 * resultado.mean()))

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Model: multilayer perceptron with a single hidden layer of 500 units.
# random_state is set (as for the other models in this notebook) so that the
# weight initialisation, and therefore the reported accuracy, is reproducible.
modelo = MLPClassifier(hidden_layer_sizes=(500,),max_iter=3000,tol=1e-7,solver='adam',
                       random_state=seed)

# Cross Validation: one accuracy score per fold
resultado = cross_val_score(modelo, X, y, cv = kfold)

# Print result (mean fold accuracy as a percentage)
print("Accuracy: %.3f" % (resultado.mean() * 100))

In [None]:
# model: multilayer perceptron with a single hidden layer of 500 units.
# random_state is set (as for the other models in this notebook) so that the
# weight initialisation, and therefore the reported accuracy, is reproducible.
modelo = MLPClassifier(hidden_layer_sizes=(500,),max_iter=3000,tol=1e-7,solver='adam',
                       random_state=seed)

# Cross Validation on the PCA-projected features: one accuracy score per fold
resultado = cross_val_score(modelo, x_pca, y, cv = kfold)

# Printing result with PCA (mean fold accuracy as a percentage)
print("Accuracy: %.3f" % (resultado.mean() * 100))