# Hearthstone Project
***
***

# Goals
- Thoroughly prepare so that it is ready for exploration and modeling
    - I won't be creating any models for this project but I'd still like to prepare the data nonetheless
- Explore the data to gather insights about the characteristics of the game's different class types

# Setup
***

In [1]:
# establishing environment
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# Acquire
Acquiring data from local csv files
***

In [49]:
# reading in card data and saving as DF
cards = pd.read_csv('hearthstone_standard_cards.csv')

# reading in card classes and saving as DF
classes = pd.read_csv('classes.csv')

# reading in minion types data and saving as DF
mtypes = pd.read_csv('minionTypes.csv')

# reading in rarities data and saving as DF
rarities = pd.read_csv('rarities.csv')

# reading in set groups data and saving as DF
setgroups = pd.read_csv('setGroups.csv')

# reading in card sets data and saving as DF
sets = pd.read_csv('sets.csv')

# reading in types data and saving as DF
ctypes = pd.read_csv('types.csv')

# reading in keywords data and saving as DF
keywords = pd.read_csv('keywords.csv')

# Prepare
Preparing data for exploration
***

## Merging Data frames

### Merging 'classes' DF

In [50]:
# removing brackets and commas from multiclassids column
cards.multiClassIds = cards.multiClassIds.str.replace('\]|,|\[' , '')

# creating column to hold primary class id 
# if card is of one class, this will reflect its sole class
# if card is dual, this will reflect the 1st of the two classes in the multiClassIds column
# necessary since dual class cards erroneously hold the 'neutral' class value in their primary class id 
cards['primeclassId'] = np.where((cards.multiClassIds.str.contains(' ')), cards["multiClassIds"].str.split(" ", expand = True)[0], cards.classId)

# converting key columns to make all value data types match
cards.primeclassId = cards.primeclassId.astype(str)
classes.id = classes.id.astype(str)

# merging 'classes' df with card df
df = pd.merge(cards, classes[['id', 'slug']], 
              left_on = 'primeclassId', right_on = 'id', how="left", 
              suffixes = (None, '_prime_hero_class'))

# dropping columns I no longer need
df.drop(columns = ['primeclassId', 'classId'])

Unnamed: 0,id,collectible,slug,multiClassIds,cardTypeId,cardSetId,rarityId,artistName,manaCost,name,...,duels,minionTypeId,health,attack,keywordIds,childIds,durability,armor,id_prime_hero_class,slug_prime_hero_class
0,58607,1,58607-blur,,5,1463,1,Zoltan Boros,0,Blur,...,"{'relevant': True, 'constructed': True}",,,,,,,,14,demonhunter
1,56806,1,56806-shadowhoof-slayer,,4,2,2,A.J. Nazzaro,1,Shadowhoof Slayer,...,"{'relevant': True, 'constructed': True}",15.0,1.0,2.0,[8],,,,14,demonhunter
2,58170,1,58170-crimson-sigil-runner,,4,1414,1,Arthur Bozonnet,1,Crimson Sigil Runner,...,,,1.0,1.0,[86],,,,14,demonhunter
3,59394,1,59394-demon-companion,14 3,5,1443,3,Zoltan Boros,1,Demon Companion,...,"{'relevant': True, 'constructed': True}",,,,,"[59391, 59392, 59393]",,,14,demonhunter
4,59606,1,59606-double-jump,,5,1443,1,A.J. Nazzaro,1,Double Jump,...,"{'relevant': True, 'constructed': True}",,,,[86],,,,14,demonhunter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1284,61503,1,61503-cthun-the-shattered,,4,1466,5,Alex Horley Orlandelli,10,"C'Thun, the Shattered",...,"{'relevant': True, 'constructed': True}",,6.0,6.0,"[8, 64]","[61873, 61875, 61874, 61877]",,,12,neutral
1285,61629,1,61629-darkmoon-rabbit,,4,1466,4,Matt Dixon,10,Darkmoon Rabbit,...,"{'relevant': True, 'constructed': True}",20.0,1.0,1.0,"[32, 53]",,,,12,neutral
1286,60443,1,60443-nzoth-god-of-the-deep,,4,1466,5,Alex Horley Orlandelli,10,"N'Zoth, God of the Deep",...,"{'relevant': True, 'constructed': True}",,7.0,5.0,[8],,,,12,neutral
1287,61308,1,61308-yshaarj-the-defiler,,4,1466,5,Alex Horley Orlandelli,10,"Y'Shaarj, the Defiler",...,"{'relevant': True, 'constructed': True}",,10.0,10.0,"[8, 98]",[61296],,,12,neutral


### Merging 'mtypes' DF

In [51]:
# merging 'mtypes' df
df = pd.merge(df, mtypes[['id', 'slug']], 
              left_on = 'minionTypeId', right_on = 'id', how="left", 
              suffixes = (None, '_minion_type_id'))

### Merging 'rarities' DF

In [52]:
# merging 'rarities' df
df = pd.merge(df, rarities[['id', 'slug']], 
              left_on = 'rarityId', right_on = 'id', how="left", 
              suffixes = (None, '_rarity'))

### Merging 'setGroups' DF

In [53]:
# merging 'setgroups' df
df = pd.merge(df, sets[['id', 'slug', 'name']], 
              left_on = 'cardSetId', right_on = 'id', how="left", 
              suffixes = (None, '_set'))

### Merging 'ctypes' DF

In [54]:
# merging 'ctypes' df
df = pd.merge(df, ctypes[['id', 'slug']], 
              left_on = 'cardTypeId', right_on = 'id', how="left", 
              suffixes = (None, '_card_type'))

### Merging 'keywords' DF

In [55]:
# adding missing keyword data to 'keywords' df
keywords.loc[len(keywords.index)] = ['64', 'start-of-game', 'Start of Game', 
                                     'Does something at the start of the Game.', 
                                     'Does something at the start of the Game.']

# removing brackets and commas from keyword id column
df.keywordIds = df.keywordIds.str.replace('\]|,|\[' , '')

# splitting keyword ids into separate columns for each card
kwdf = df["keywordIds"].str.split(" ", expand = True) 

# renaming columns
kwdf.columns = ['keywordId1', 'keywordId2', 'keywordId3', 'keywordId4', 'keywordId5']

# concatenating split keyword id columns with main df
df = pd.concat([df, kwdf], axis=1)

# converting keywords id column to str type to enable merge
keywords.id = keywords.id.astype(str)

# creating loop to add a column for the text name of each keyword ability of each card
# via merging with keywords DF
for x in kwdf.columns:
    df = pd.merge(df, keywords[['id', 'slug']], 
              left_on = x, right_on = 'id', how = "left",
              suffixes = (None, x + '_name'))

## Checking for duplicate rows

In [56]:
# checking number of rows in current DF
df.shape

(1289, 51)

In [57]:
# Checking number of rows if duplicates were dropped
df.drop_duplicates().shape

(1289, 51)

- No duplicates found

### Checking for proper data types, categorical columns (based on domain knowledge), and null counts

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1289 entries, 0 to 1288
Data columns (total 51 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1289 non-null   int64  
 1   collectible            1289 non-null   int64  
 2   slug                   1289 non-null   object 
 3   classId                1289 non-null   int64  
 4   multiClassIds          1289 non-null   object 
 5   cardTypeId             1289 non-null   int64  
 6   cardSetId              1289 non-null   int64  
 7   rarityId               1289 non-null   int64  
 8   artistName             1288 non-null   object 
 9   manaCost               1289 non-null   int64  
 10  name                   1289 non-null   object 
 11  text                   1271 non-null   object 
 12  image                  1289 non-null   object 
 13  imageGold              805 non-null    object 
 14  flavorText             1289 non-null   object 
 15  crop

- The following columns will be dropped as they won't be needed for the expected operations of this project
    - id, slug
        - unique identifiers for cards, not needed since the 'name' column provides this while also being easier to reference
    - artistName, image, imageGold, cropImage
        - I won't be exploring images or artist names in this iteration of the project
    - all columns reflecting key words with the exception of the boolean columns and the 'slug_keyword#_name' columns
        - The exempted columns are sufficient for the project's expected operations
     
     
- Based on my domain knowledge of the game, I'm inferring that several of the columns are categorical
    - I need to create boolean columns for categorical columns (rarity, card set, etc.)


- Many null values that need to be addressed
    - text
    - duels
    - minion type id
    - health
    - attack
    - child ids
    - durability
    - armor

### Dropping columns that aren't needed for the planned operations of this project

In [59]:
# creating list of columns to drop
columns_to_drop = ['id', 'slug', 'artistName', 'image', 'imageGold', 'flavorText', 'cropImage']

# dropping columns
df.drop(columns = columns_to_drop, inplace = True)

# Addressing Null Values

### Addressing nulls in 'text' column

In [13]:
# checking values in text box
df.text.value_counts(dropna = False)

NaN                                                                            18
<b>Taunt</b>                                                                   15
<b>Charge</b>                                                                   7
<b>Spell Damage +1</b>                                                          6
<b>Stealth</b>                                                                  6
                                                                               ..
Whenever this minion takes damage, gain +3&nbsp;Attack.                         1
<b>Rush</b> Whenever this attacks a minion, <b>Silence</b> it.                  1
<b>Deathrattle:</b> Draw two 1-Health minions from your&nbsp;deck.              1
<b>Battlecry:</b> <b>Freeze</b> a&nbsp;character.                               1
At the end of your turn, deal 1 damage to this minion and summon a 1/1 Imp.     1
Name: text, Length: 1215, dtype: int64

In [74]:
# filling null text values with 'no effect'
df["text"].fillna("no effect", inplace = True) 

### Addressing nulls in 'duels' column

In [66]:
# checking duels values
df.duels.value_counts(dropna = False)

{'relevant': True, 'constructed': True}    708
NaN                                        581
Name: duels, dtype: int64

In [67]:
# updating duels column so that cards that were allowed in duels have value of 1 and 0 otherwise
df['duels'] = np.where((df.duels == "{'relevant': True, 'constructed': True}"), 1, 0)

### Addressing nulls in ' minionTypeId' column

In [68]:
# checking minontypeId values
df.minionTypeId.value_counts(dropna = False)

NaN     968
20.0     75
15.0     63
18.0     51
24.0     50
17.0     32
14.0     25
23.0     18
21.0      6
26.0      1
Name: minionTypeId, dtype: int64

In [69]:
# converting nulls to 'neutral' type
df['minionTypeId'] = np.where((df.minionTypeId.isnull() == True), 'Neutral', df.minionTypeId)

### Variable: childIds

In [70]:
# checking childIds values
df.childIds.value_counts(dropna = False)

no_childid                                                                994
[52897, 52900, 53160, 53161, 53162, 53163, 55378, 60588, 64652, 64653]     15
[59723]                                                                     8
[53921]                                                                     3
[56927]                                                                     3
                                                                         ... 
[57156]                                                                     1
[61701]                                                                     1
[56935]                                                                     1
[59919]                                                                     1
[58496]                                                                     1
Name: childIds, Length: 261, dtype: int64

In [71]:
# filling nulls with "no_childid"
df.childIds.fillna("no_childid", inplace = True) 

### Variables: health, attack, durability, armor
All of these variables respective columns have null values since none of these variables apply to every card (examples: only minions have health while only weapons have durability). For the time being I'll fill these nulls 
with a value that represents infinity. If this causes issues later I'll employ a different means of handling them.

In [72]:
hada = ['health', 'attack', 'durability', 'armor']

for att in hada:
    df[att].fillna(float('inf'), inplace = True)

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1289 entries, 0 to 1288
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   collectible            1289 non-null   int64  
 1   classId                1289 non-null   int64  
 2   multiClassIds          1289 non-null   object 
 3   cardTypeId             1289 non-null   int64  
 4   cardSetId              1289 non-null   int64  
 5   rarityId               1289 non-null   int64  
 6   manaCost               1289 non-null   int64  
 7   name                   1289 non-null   object 
 8   text                   1289 non-null   object 
 9   duels                  1289 non-null   int64  
 10  minionTypeId           1289 non-null   object 
 11  health                 1289 non-null   float64
 12  attack                 1289 non-null   float64
 13  keywordIds             877 non-null    object 
 14  childIds               1289 non-null   object 
 15  dura

# Creating boolean columns for categorical variables

### Creating boolean columns for 'keywords'

In [21]:
# loop iterates through each keyword and creates a boolean column for it
for kw in keywords.slug:
    df['has_' + kw] = np.where(
    (df.slugkeywordId1_name == kw) |
    (df.slugkeywordId2_name == kw) |
    (df.slugkeywordId3_name == kw) |
    (df.slugkeywordId4_name == kw) |
    (df.slugkeywordId5_name == kw), 1, 0)
    
# creating empty list
key_word_col_drop = []

# iterating through columns in df and creating list of columns to drop
for col in df.columns:
    if 'keywordId' in col:
        key_word_col_drop.append(col)
        
# dropping columns
df.drop(columns = key_word_col_drop, inplace = True)

### Creating boolean columns for 'hero classes'

In [22]:
# removing brackets and commas from multiclassids column
df.multiClassIds = df.multiClassIds.str.replace('\]|,|\[' , '')

# creating column that holds secondary class separate from primary class
df['classId2'] = df["multiClassIds"].str.split(" ", expand = True)[1]

# converting column to str type to enable merge with newly created column 'classId2'
classes.id = classes.id.astype(str)

# creating df containing columns for merge in order to rename before merge without altering original classes DF
classes2 = classes[['id', 'slug']]

# renaming columns
classes2.columns = ['classId2', 'slug_second_hero_class']

# merging 'classes' on secondary hero class id to get secondary class names
df = pd.merge(df, classes2[['classId2', 'slug_second_hero_class']], 
              on = 'classId2', how="left")

# creating boolean columns for each hero class
for c in classes.slug:
    df['is_' + c] = np.where(
    (df.slug_prime_hero_class == c) | (df.slug_second_hero_class == c), 1, 0)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1289 entries, 0 to 1288
Data columns (total 79 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   collectible             1289 non-null   int64  
 1   classId                 1289 non-null   int64  
 2   multiClassIds           1289 non-null   object 
 3   cardTypeId              1289 non-null   int64  
 4   cardSetId               1289 non-null   int64  
 5   rarityId                1289 non-null   int64  
 6   manaCost                1289 non-null   int64  
 7   name                    1289 non-null   object 
 8   text                    1289 non-null   object 
 9   duels                   1289 non-null   int64  
 10  minionTypeId            1289 non-null   object 
 11  health                  825 non-null    float64
 12  attack                  860 non-null    float64
 13  childIds                1289 non-null   object 
 14  durability              48 non-null     

In [28]:
df.slug_minionTypeId

AttributeError: 'DataFrame' object has no attribute 'slug_minionTypeId'