In [1]:
import pandas as pd
import numpy as np
import string

pokemon = pd.read_csv('pokedex_entries.csv')

df = pokemon[['name', 'abilities', 'type1', 'type2', 'Pokedex Entry']]

df = pd.melt(df, id_vars = ['name', 'Pokedex Entry', 'abilities'], value_vars = ['type1', 'type2']).dropna()

In [2]:
# Use str.split to split by commas into different columns
new = df['abilities'].str.split(',', expand = True)

# Add the new columns to existing dataframe
# We only want the first three columns, because additional columns come from new forms of
# the pokemon.
df['Ability1'] = new[0]
df['Ability2'] = new[1]
df['Ability3'] = new[2]

df


Unnamed: 0,name,Pokedex Entry,abilities,variable,value,Ability1,Ability2,Ability3
0,Bulbasaur,Bulbasaur can be seen napping in bright sunlig...,"['Overgrow', 'Chlorophyll']",type1,grass,['Overgrow','Chlorophyll'],
1,Ivysaur,There is a bud on this Pokémon’s back. To supp...,"['Overgrow', 'Chlorophyll']",type1,grass,['Overgrow','Chlorophyll'],
2,Venusaur,There is a large flower on Venusaur’s back. Th...,"['Overgrow', 'Chlorophyll']",type1,grass,['Overgrow','Chlorophyll'],
3,Charmander,The flame that burns at the tip of its tail is...,"['Blaze', 'Solar Power']",type1,fire,['Blaze','Solar Power'],
4,Charmeleon,Charmeleon mercilessly destroys its foes using...,"['Blaze', 'Solar Power']",type1,fire,['Blaze','Solar Power'],
...,...,...,...,...,...,...,...,...
1595,Pheromosa,"One of the dangerous Ultra Beasts, it has been...",['Beast Boost'],type2,fighting,['Beast Boost'],,
1597,Celesteela,It appeared from the Ultra Wormhole. Witnesses...,['Beast Boost'],type2,flying,['Beast Boost'],,
1598,Kartana,This Ultra Beast came from the Ultra Wormhole....,['Beast Boost'],type2,steel,['Beast Boost'],,
1599,Guzzlord,It has gobbled mountains and swallowed whole b...,['Beast Boost'],type2,dragon,['Beast Boost'],,


In [3]:
# Interesting way to remove punctuation
import string
abilities = ['Ability1', 'Ability2', 'Ability3']

# Fill nas with 'None', then strip punctuation away from each column
for ability in abilities:
    df[ability] = df[ability].fillna('None').apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# for abilities in pokemon[abilities]:
#     abilities = abilities.translate(str.maketrans('', '', string.punctuation))
#     print(abilities)

df

Unnamed: 0,name,Pokedex Entry,abilities,variable,value,Ability1,Ability2,Ability3
0,Bulbasaur,Bulbasaur can be seen napping in bright sunlig...,"['Overgrow', 'Chlorophyll']",type1,grass,Overgrow,Chlorophyll,
1,Ivysaur,There is a bud on this Pokémon’s back. To supp...,"['Overgrow', 'Chlorophyll']",type1,grass,Overgrow,Chlorophyll,
2,Venusaur,There is a large flower on Venusaur’s back. Th...,"['Overgrow', 'Chlorophyll']",type1,grass,Overgrow,Chlorophyll,
3,Charmander,The flame that burns at the tip of its tail is...,"['Blaze', 'Solar Power']",type1,fire,Blaze,Solar Power,
4,Charmeleon,Charmeleon mercilessly destroys its foes using...,"['Blaze', 'Solar Power']",type1,fire,Blaze,Solar Power,
...,...,...,...,...,...,...,...,...
1595,Pheromosa,"One of the dangerous Ultra Beasts, it has been...",['Beast Boost'],type2,fighting,Beast Boost,,
1597,Celesteela,It appeared from the Ultra Wormhole. Witnesses...,['Beast Boost'],type2,flying,Beast Boost,,
1598,Kartana,This Ultra Beast came from the Ultra Wormhole....,['Beast Boost'],type2,steel,Beast Boost,,
1599,Guzzlord,It has gobbled mountains and swallowed whole b...,['Beast Boost'],type2,dragon,Beast Boost,,


In [4]:
# Melt the ability columns just like the originals
poke = pd.melt(df, 
             id_vars = ['name', 'Pokedex Entry', 'value'], 
             value_vars = abilities, var_name = 'variable',
            value_name = 'Ability')

In [5]:
# Drop any rows that have no ability information.
poke = poke.replace('None', np.nan).dropna().drop('variable', axis = 1)

In [6]:
# One hot encode the abilities.

df = pd.get_dummies(poke, columns = ['Ability'], drop_first = True)

df

Unnamed: 0,name,Pokedex Entry,value,Ability_ Aftermath,Ability_ Analytic,Ability_ Anger Point,Ability_ Anticipation,Ability_ Arena Trap,Ability_ Aroma Veil,Ability_ Battle Armor,...,Ability_Vital Spirit,Ability_Volt Absorb,Ability_Water Absorb,Ability_Water Bubble,Ability_Water Compaction,Ability_Water Veil,Ability_White Smoke,Ability_Wimp Out,Ability_Wonder Guard,Ability_Wonder Skin
0,Bulbasaur,Bulbasaur can be seen napping in bright sunlig...,grass,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Ivysaur,There is a bud on this Pokémon’s back. To supp...,grass,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Venusaur,There is a large flower on Venusaur’s back. Th...,grass,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Charmander,The flame that burns at the tip of its tail is...,fire,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Charmeleon,Charmeleon mercilessly destroys its foes using...,fire,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3634,Togedemaru,"When it’s surprised or agitated, the 14 fur sp...",steel,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3636,Bruxish,When it unleashes its psychic power from the p...,psychic,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3637,Drampa,If a child it has made friends with is bullied...,dragon,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3639,Hakamo-o,It makes noise by clanging its scales together...,fighting,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# We need to separate out the dummies from the text. Create a function that splits on the 
# text.

from sklearn.preprocessing import FunctionTransformer

# Dummy columnes
dummy_col = df.iloc[:, 3:].columns

# Tested - works
get_text_data = FunctionTransformer(lambda x: x['Pokedex Entry'])
get_dummies_data = FunctionTransformer(lambda x: x[dummy_col])

# This should get me the correct data
# Problem with dummifying the data within the pipeline is that you need ALL values 
# on both sides of the initial train_test_split, otherwise the features will be of different
# lengths. 
# Aka, if one side doesn't have Poison Point, then there won't be a Poison Point feature
# on that side
# get_dummies_data = FunctionTransformer(lambda x: pd.get_dummies(x['Ability'], drop_first = True))

In [8]:
from sklearn.model_selection import train_test_split

# Get target variable
X = df.drop(['name', 'value'], axis = 1)
y = df['value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, 
                                                    stratify = y, random_state = 42)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

alphanumeric = '[A-Za-z0-9]+(?=\\s+)'

dummy_features = Pipeline([('selector', get_dummies_data)])

text_features = Pipeline([('selector', get_text_data), 
                          ('vectorizer', CountVectorizer(token_pattern = alphanumeric,
                                                         stop_words = 'english')), 
                          ('transformer', TfidfTransformer())])

# Combine the two arrays into a single function
feature_processing = FeatureUnion(transformer_list = [('dummy_features', dummy_features),
                                                     ('text_features', text_features)])

# Full pipeline
pipeline = Pipeline([
    ('union', feature_processing),
    # RandomForest incredibly slow, don't use
    # ('clf', RandomForestClassifier(max_features = 'sqrt', n_estimators = 1000))
    ('clf', LogisticRegression(C = 1.0, solver = 'liblinear'))    
    ])

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('dummy_features',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  FunctionTransformer(accept_sparse=False,
                                                                                      check_inverse=True,
                                                                                      func=<function <lambda> at 0x0000000007EC2E58>,
                                                                                      inv_kw_args=None,
                                                                                      inverse_func=None,
                                                                                      kw_args=None,
                                     

In [10]:
def my_custom_scorer_func(y_true, y_pred):
    score = []
    for x, i in enumerate(y_true.index):
        name = df.loc[i]['name']
        type1 = pokemon[pokemon['name'] == name]['type1'].item()
        type2 = pokemon[pokemon['name'] == name]['type2'].item()
        if (y_pred[x] == type1) | (y_pred[x] == type2):
            score.append(1)
        else:
            score.append(0)
    error = np.sum(score) / len(score)
    return error

from sklearn.metrics import make_scorer
my_scorer = make_scorer(my_custom_scorer_func, greater_is_better=True)

y_pred = pipeline.predict(X_test)
my_custom_scorer_func(y_test, y_pred)

0.9594594594594594

In [11]:
# # TUNING PAGE
# # Record scores

# # LogisticRegression -- 53% on Gen8
# # Tuned LogisticRegression -- 55%
# # RandomForest -- 41% on Gen8


# from sklearn.model_selection import GridSearchCV

# # param_grid = {
# #     'clf__C' : [100, 10, 1.0, 0.1, 0.01],
# #     'clf__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# # }

# param_grid = {
#     'clf__max_features' : ['sqrt', 'log2'],
#     'clf__n_estimators' : [10, 100, 1000]
# }

# search = GridSearchCV(pipeline, param_grid, scoring = my_scorer, cv = 3, verbose = 1, n_jobs = -1)
# search.fit(X_train, y_train)

# print("Best parameter CV score={:.3f}:".format(search.best_score_))
# print(search.best_params_)

In [12]:
# Slight problem - if we pre-dummify our data, then we can't really input new data.
# Perhaps make the dummy_features pipeline get our dummies for us <- Not the correct solution
# Instead, do this:

Pokedex8 = pd.read_csv('Datacamp CSV/PokeDex8.csv')

# Prep Galar data
# galar = pd.melt(gen8, id_vars = ['Name'], 
#                 value_vars = ['Ability1', 'Ability2', 'Hidden_Ability'], 
#                 value_name = 'Ability').drop('variable', axis = 1).dropna()

galar = Pokedex8[['Name', 'Ability1']]

# Dummify the Ability column
galar = galar.rename(columns={"Ability1": "Ability"})
galar = pd.get_dummies(galar, columns = ['Ability'], drop_first = True)

In [13]:
# Get only the Gen8 pokemon
gen8 = pd.read_csv('Datacamp CSV/gen8.csv', header = None, names=['Name'])
galar = pd.merge(galar, gen8, how = 'right')

In [14]:
# Now let's grab the pokedex entries.
import requests
from bs4 import BeautifulSoup

for i, x in enumerate(galar['Name']): 
    url = 'https://pokemondb.net/pokedex/' + x
    
    # Access the content in the URL
    html_content = requests.get(url).text

    # Parse the html content
    soup = BeautifulSoup(html_content, "lxml")
    
    # This finds the pokedex entries, then finds the table immediately after that.
    try:    
        table = soup.find('h2', text='Pokédex entries').find_next('table')
        flavor = table.find_all('td', attrs = {'class': 'cell-med-text'})

        # Specifically, we'll retrieve the latest 4 pokedex entries.
        # Note that this does NOT pull megas or different forms
        poketext = ' '.join([y.text for y in flavor[-2:]])
        
        # You can create a column like this:
        galar.loc[i, 'Pokedex Entry'] = poketext
        
    
    except:
        print('Scraping failed: ', x)
        pass
    
# Manually get the data
special = ['mr-rime']

special_index = galar[galar['Pokedex Entry'].isnull()].index

for i, x in enumerate(special_index): 
    url = 'https://pokemondb.net/pokedex/' + special[i]
    
    # Access the content in the URL
    html_content = requests.get(url).text

    # Parse the html content
    soup = BeautifulSoup(html_content, "lxml")
    
    # This finds the pokedex entries, then finds the table immediately after that.

    table = soup.find('h2', text='Pokédex entries').find_next('table')
    flavor = table.find_all('td', attrs = {'class': 'cell-med-text'})

    # Specifically, we'll retrieve the latest 4 pokedex entries.
    # Note that this does NOT pull megas or different forms
    poketext = ' '.join([x.text for x in flavor[-4:]])

    print(poketext)
    galar.loc[x, 'Pokedex Entry'] = poketext

Scraping failed:  MrRime
It’s highly skilled at tap-dancing. It waves its cane of ice in time with its graceful movements. Its amusing movements make it very popular. It releases its psychic power from the pattern on its belly.


In [67]:
# We can reindex the columns to make sure that we still have all the original dummy columns
# Note that during reindex, we want ALL the columns of the original dataframe. Also note that
# new values in the Ability column will be removed from the dataframe. We can think about how
# to resolve that later.
galar = galar.rename(columns = {'Name': 'name'})

galar = galar.reindex(columns = df.columns, fill_value = 0)

galar_test = galar.drop('name', axis = 1)

y_pred = pipeline.predict(galar_test)

# Grab the correct answers
Pokedex8 = Pokedex8.rename(columns = {'Name':'name'})
score_frame = pd.merge(galar, Pokedex8, how = 'left') [['name', 'Type1', 'Type2']]
score_frame['Predicted Type'] = y_pred

# Score the model - if it matches either of the pokemon's types, it's correct.
# We get around 40-50% for our models, pretty sweet!
print(((score_frame['Type1'] == score_frame['Predicted Type']) | (score_frame['Type2'] == score_frame['Predicted Type'])).sum() / len(score_frame))

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib qt

# For a heatmap, categorical variables provide more control
cat_type = pd.CategoricalDtype(categories= score_frame['Type1'].unique().tolist(), ordered=True)

# Convert to categorical
score_frame[['Type1', 'Predicted Type']] = score_frame[['Type1', 'Predicted Type']].astype(cat_type)

# Create the confusion matrix. Make sure to set dropna = False
confusion_matrix1 = pd.crosstab(score_frame['Type1'], 
                                score_frame['Predicted Type'], 
                                rownames=['Actual Type'], 
                                dropna = False) # This includes columns of 0s

# Create a Heatmap confusion matrix of our results. Very cool.
sns.set_context('notebook', font_scale=1.2)

plt.figure()
sns.heatmap(confusion_matrix1, annot = True)
plt.title('Total Pokemon: 81 \n Accuracy: {:.2f}'.format(((score_frame['Type1'] == score_frame['Predicted Type']) | (score_frame['Type2'] == score_frame['Predicted Type'])).sum() / len(score_frame)))
plt.show()

plt.subplots_adjust(
top=0.964,
bottom=0.158,
left=0.081,
right=0.986,
hspace=0.2,
wspace=0.2
)

0.5555555555555556


In [None]:
# Use this for a 'double counted' matrix - both pokemon types will be scored. This means that
# each pokemon with a dual typing will have one correct and one incorrect answer.

test = pd.melt(score_frame, 
        id_vars = ['name', 'Predicted Type'], 
        value_vars = ['Type1', 'Type2']).dropna()

test = test.astype(cat_type)

matrix = pd.crosstab(test['value'], 
           test['Predicted Type'],
           rownames = ['Actual Type'],
                    dropna = False)
plt.figure()
sns.heatmap(matrix, annot = True)

# Get sum of the matrix diagonal using this
# The sum of the confusion matrix diagonal of a double counted matrix gives the total number
# of correct answers.
np.diag(matrix.values).sum()

In [16]:
# pd.DataFrame({'Name': galar['Name'], 'Prediction': predictions})
predictions = pipeline.predict_proba(galar_test)

# Select top n results that you want. This also selects in the order that from best to worst
# guess. The results are selected in the form of their column indices
best_n = np.argsort(-predictions, axis=1)[:, :2]

# Dictionary that relates the index no. of each column to the class name
type_dictionary = {i: x for i, x in enumerate(pipeline.classes_)}

# Convert the indexes from best_n to their class name
results = np.vectorize(type_dictionary.get)(best_n)

# list(zip(galar['name'], results.tolist()))

[('Grookey', ['grass', 'flying']),
 ('Thwackey', ['grass', 'poison']),
 ('Rillaboom', ['grass', 'poison']),
 ('Scorbunny', ['fire', 'fighting']),
 ('Raboot', ['fire', 'fighting']),
 ('Cinderace', ['fire', 'fighting']),
 ('Sobble', ['water', 'normal']),
 ('Drizzile', ['water', 'ground']),
 ('Inteleon', ['water', 'ground']),
 ('Blipbug', ['bug', 'steel']),
 ('Dottler', ['bug', 'psychic']),
 ('Orbeetle', ['psychic', 'bug']),
 ('Rookidee', ['flying', 'normal']),
 ('Corvisquire', ['flying', 'normal']),
 ('Corviknight', ['flying', 'psychic']),
 ('Skwovet', ['normal', 'flying']),
 ('Greedent', ['normal', 'water']),
 ('Nickit', ['normal', 'dark']),
 ('Thievul', ['normal', 'dark']),
 ('Obstagoon', ['normal', 'water']),
 ('Wooloo', ['normal', 'fighting']),
 ('Dubwool', ['normal', 'fighting']),
 ('Chewtle', ['dragon', 'water']),
 ('Drednaw', ['dragon', 'rock']),
 ('Yamper', ['normal', 'electric']),
 ('Boltund', ['electric', 'dragon']),
 ('Gossifleur', ['grass', 'water']),
 ('Eldegoss', ['grass', 

In [17]:
alphanumeric = '[A-Za-z0-9]+(?=\\s+)'

dex_reader = Pipeline([('selector', get_text_data), 
                          ('vectorizer', CountVectorizer(token_pattern = alphanumeric,
                                                         stop_words = 'english')), 
                          ('transformer', TfidfTransformer()), 
                          ('classifier', LogisticRegression(C = 1291))])

dex_reader.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('selector',
                 FunctionTransformer(accept_sparse=False, check_inverse=True,
                                     func=<function <lambda> at 0x0000000007F26288>,
                                     inv_kw_args=None, inverse_func=None,
                                     kw_args=None, validate=False)),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', l...
                ('transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LogisticRegression(C=1291, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
          

In [18]:
y_pred = dex_reader.predict(X_test)
my_custom_scorer_func(y_test, y_pred)

0.9898648648648649

In [19]:
y_pred = dex_reader.predict(galar_test)

Pokedex8 = Pokedex8.rename(columns = {'Name':'name'})
score_frame = pd.merge(galar, Pokedex8, how = 'left') [['name', 'Type1', 'Type2']]
score_frame['Predicted Type'] = y_pred

print(((score_frame['Type1'] == score_frame['Predicted Type']) | (score_frame['Type2'] == score_frame['Predicted Type'])).sum() / len(score_frame))

score_frame.tail(81)

0.37037037037037035


Unnamed: 0,name,Type1,Type2,Predicted Type
0,Grookey,grass,,fighting
1,Thwackey,grass,,normal
2,Rillaboom,grass,,psychic
3,Scorbunny,fire,,flying
4,Raboot,fire,,normal
...,...,...,...,...
76,Drakloak,dragon,ghost,flying
77,Dragapult,dragon,ghost,normal
78,Zacian,fairy,,flying
79,Zamazenta,fighting,,grass


In [109]:
# galar.to_csv('Datacamp CSV/GalarDex.csv')
# poke.to_csv('Datacamp CSV/Pokemon_Machine_Learning.csv')

In [108]:
Pokedex8 = pd.read_csv('Datacamp CSV/PokeDex8.csv')

Pokedex8 = Pokedex8.rename(columns = {'Name':'name'})

galar_reset = pd.merge(galar, Pokedex8, how = 'left')[['name', 'Ability1', 'Ability2', 'Hidden_Ability', 'Type1', 'Type2']]

galar_reset['Pokedex Entry'] = galar['Pokedex Entry']

galar_reset = pd.melt(galar_reset, 
                      id_vars = ['name', 'Type1', 'Type2', 'Pokedex Entry'], 
                      value_vars = ['Ability1', 'Ability2', 'Hidden_Ability'], 
                      var_name = 'variable', 
                      value_name = 'Ability')

galar_reset = pd.melt(galar_reset, 
                      id_vars = ['name', 'Ability', 'Pokedex Entry'], 
                      value_vars = ['Type1', 'Type2'], 
                      var_name = 'variable', 
                      value_name = 'type')

galar_reset = galar_reset.drop('variable', axis = 1).dropna()

poke = poke.rename(columns = {'value':'type'})

poke = poke.append(galar_reset).reset_index()

poke.to_csv('')

ValueError: cannot insert level_0, already exists