In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the Pokemon dataset CSV from the Kaggle input directory.
# Source: https://www.kaggle.com/alopez247/pokemon
pokemon_data = pd.read_csv(filepath_or_buffer='../input/pokemon-dataset/Pokemon Data.csv')

In [3]:
# Work on an independent copy so the cleaning steps below can never alter the
# raw frame returned by read_csv (pd.DataFrame(df) re-wraps the same data
# instead of copying it).
poke = pokemon_data.copy()
poke.head()

In [4]:
# Count the missing values in every column.
poke.isna().sum()

In [5]:
# The shape (rows, columns) shows that the dataset is small. Because of that we
# simply drop the columns that are mostly null: filling them in could make
# such a small dataset a little biased.
poke.shape

In [6]:
# Type_2, Egg_Group_2 and Pr_Male contain null values; look at how the
# non-null Pr_Male values are distributed.
poke.loc[:, 'Pr_Male'].value_counts()

In [7]:
# Type_2 and Egg_Group_2 have so many nulls that we will drop those columns
# (they could be imputed with other methods, but for simplicity we won't do
# that here). Pr_Male had only 77 missing values, so we fill those with 0.5.
#
# The filled Series is assigned back instead of calling fillna(inplace=True) on
# poke['Pr_Male']: the in-place form operates on an intermediate selection,
# emits a FutureWarning in pandas 2.x and leaves `poke` unchanged once
# Copy-on-Write is enabled.

poke['Pr_Male'] = poke['Pr_Male'].fillna(0.500)
poke['Pr_Male'].isnull().sum()

In [8]:
# Show the last five rows after filling Pr_Male.
poke.tail(n=5)

In [9]:
# Remove the two mostly-null columns; the result is a new frame and `poke`
# itself is left untouched.

new_poke = poke.drop(columns=['Type_2', 'Egg_Group_2'])

In [10]:
# Use the summary statistics to understand the type and range of values in
# each column.

new_poke.describe()
# Min-max scaling example: x_scaled = (x - min) / (max - min). For a feature
# ranging over (20, 20000) and x = 300 this gives (300 - 20) / 19980, which is
# a very small value.

In [11]:
# Correlation heatmap of the numeric and boolean features.
# new_poke still contains string columns (e.g. Name, Type_1), and
# DataFrame.corr() raises on those in pandas >= 2.0 because numeric_only now
# defaults to False. Selecting the numeric/bool columns first works on every
# pandas version and matches what older versions did implicitly.
plt.figure(figsize=(15,15))
numeric_poke = new_poke.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_poke.corr(), annot=True, cmap='viridis', linewidths=.5)

In [12]:
# The heatmap above shows how strongly each feature is correlated with every other feature. A high correlation between two features
# means that one of them adds little new information for the model when predicting.
# It is up to you to decide which correlation value counts as "high" and which of the two features should be removed.

In [13]:
# From the describe() table above it is clear that different features have very different ranges of values, which can make training harder
# for a model, so they are usually rescaled, e.g. with scikit-learn's StandardScaler class, which can be applied later on.

In [14]:
# Next, work out which columns are categorical: they often hold a lot of
# signal, and feature engineering is usually done on them.

new_poke.loc[:, 'Type_1'].value_counts()

In [15]:
# Number of rows per Generation value.
new_poke.loc[:, 'Generation'].value_counts()

In [16]:
# Number of rows per Color value.
new_poke.loc[:, 'Color'].value_counts()

In [17]:
# Number of rows per Egg_Group_1 value.
new_poke.loc[:, 'Egg_Group_1'].value_counts()

In [18]:
# Number of rows per Body_Style value.
new_poke.loc[:, 'Body_Style'].value_counts()

In [19]:
# There are more categorical columns that I have not listed here. Can you find them?
# Can you work out why I have not listed them?

In [20]:
# Feature Engineering
#
# Merge related primary types into broader categories so that Type_1 has fewer
# levels to work with afterwards.
#
# The mapping is applied to the Type_1 column only. Calling replace() on the
# whole DataFrame would also rewrite these strings wherever they occur in any
# other column (a 'Bug' or 'Fairy' value outside Type_1 would be silently
# changed as well).
type_merge_map = {
    'Ice': 'Water',
    'Bug': 'Grass',
    'Ground': 'Rock',
    'Psychic': 'Dark',
    'Ghost': 'Dark',
    'Fairy': 'Dark',
    'Steel': 'Electric',
}

# assign() returns a new frame, so new_poke keeps its original Type_1 values.
poke_type1 = new_poke.assign(Type_1=new_poke['Type_1'].replace(type_merge_map))

poke_type1['Type_1'].value_counts()

In [21]:
# Preview the frame after merging the types.
poke_type1.head(n=5)

In [22]:
# Frequency-encode Body_Style: ref1 maps each body style to the number of rows
# that have it, and Body_Style_new stores that count for every row.
ref1 = dict(
    poke_type1['Body_Style'].value_counts()
)

poke_type1['Body_Style_new'] = poke_type1['Body_Style'].map(arg=ref1)

In [23]:
# What happened above: each body style was replaced by its value count (the
# number of rows with that body style). Compare with the original column below.
poke_type1.loc[:, 'Body_Style_new'].head(n=5)

In [24]:
# The original Body_Style labels for the same rows.
poke_type1.loc[:, 'Body_Style'].head(n=5)

In [25]:
# encoding data - features like Type_1 and Color

In [26]:
# One-hot encode Type_1 and Color, then append the indicator columns to the
# right of the existing frame (types first, colours second).
types_poke = pd.get_dummies(poke_type1['Type_1'])
color_poke = pd.get_dummies(poke_type1['Color'])

X = pd.concat(
    [poke_type1, types_poke, color_poke],
    axis=1,
)

X.head()

In [27]:
# We have now built some new features and encoded the categorical ones; what is
# left is to remove the redundant columns. List the columns to decide which.
X.columns

In [28]:
# Drop Number and Name, the Type_1 and Color columns that were one-hot encoded
# above, and Egg_Group_1; then check the resulting shape.
redundant_columns = ['Number', 'Name', 'Type_1', 'Color', 'Egg_Group_1']
X_ = X.drop(columns=redundant_columns)
X_.shape

In [29]:
# Shape of the frame before the drop, for comparison with X_.shape.
X.shape

In [30]:
# creating and training our model

In [31]:
# The target is isLegendary; keep it in y. The feature matrix is everything
# else except the raw Body_Style labels (Body_Style_new stays in).
y = X_.loc[:, 'isLegendary']
X_final = X_.drop(columns=['isLegendary', 'Body_Style'])

In [32]:
# Final list of feature columns that will be passed to the model.
X_final.columns

In [33]:
# Preview the first rows of the feature matrix.
X_final.head(n=5)

In [34]:
# Hold out 20% of the rows for testing.
# random_state fixes the shuffle so the split (and every score below) is the
# same on each run; stratify=y keeps the share of legendary Pokemon equal in
# the train and test parts, which matters if that class is rare.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)

In [35]:
# Random forest with 500 trees; the fixed seed makes training repeatable.
random_model = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
)

In [36]:
# Train the model on the training split and predict the held-out rows.
model_final = random_model.fit(Xtrain, ytrain)

y_pred = model_final.predict(Xtest)

# Accuracy on the training data itself, as a percentage rounded to two
# decimals (the score on the test split is computed in the next cell).
random_model_accuracy = round(100 * model_final.score(Xtrain, ytrain), 2)
print(random_model_accuracy, '%')

In [37]:
# Accuracy on the held-out test split, as a percentage rounded to two decimals.
random_model_accuracy1 = round(100 * random_model.score(Xtest, ytest), 2)
print(random_model_accuracy1, '%')

In [38]:
# Show what kind of container holds the test labels, then take a NumPy copy of
# them so they can be indexed by position.
print(type(ytest))
Ytest = np.array(ytest, copy=True)

In [41]:
# Check the container type of the predictions.
type(y_pred)

In [43]:
# Manual cross-check of the test accuracy: count the positions where the
# prediction equals the true label and print that share as a percentage.
# The comparison is vectorised and built from ytest directly, so this cell no
# longer depends on the Ytest array created in a different cell.
matches = np.asarray(ytest) == np.asarray(y_pred)
count = int(np.count_nonzero(matches))

print((count/len(ytest))*100)

In [45]:
# Save the trained model to disk.
# The with-block closes the file handle (flushing the pickle) even if dump()
# raises; the bare open() used before was never closed.
import pickle
filename = 'Pokemon Data.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(model_final, model_file)

In [None]:
"""
# load the model from disk
filename = 'drive/MyDrive/Dataset/Models/pokemon_model.pickle'
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(Xtest, ytest)
"""