# Hearthstone Project
***
***

# Setup
***

In [186]:
# establishing environment
import pandas as pd
import numpy as np

# Acquire
Acquiring data from local csv files
***

In [187]:
# reading each local csv file into its own DF, in the order the names are listed
cards, classes, mtypes, rarities, setgroups, sets, ctypes, keywords = map(
    pd.read_csv,
    [
        'hearthstone_standard_cards.csv',  # card data
        'classes.csv',                     # card classes
        'minionTypes.csv',                 # minion types
        'rarities.csv',                    # rarities
        'setGroups.csv',                   # set groups
        'sets.csv',                        # card sets
        'types.csv',                       # card types
        'keywords.csv',                    # keywords
    ],
)

# Prepare
Preparing data for exploration
***

### Merging 'classes' DF

In [188]:
# left-joining the class id/slug lookup onto the cards via 'classId';
# the lookup's overlapping columns get the '_hero_class' suffix
df = cards.merge(
    classes[['id', 'slug']],
    how="left",
    left_on='classId',
    right_on='id',
    suffixes=(None, '_hero_class'),
)

### Merging 'mtypes' DF

In [189]:
# left-joining the minion type id/slug lookup via 'minionTypeId';
# the lookup's overlapping columns get the '_minion_type_id' suffix
df = df.merge(
    mtypes[['id', 'slug']],
    how="left",
    left_on='minionTypeId',
    right_on='id',
    suffixes=(None, '_minion_type_id'),
)

### Merging 'rarities' DF

In [190]:
# left-joining the rarity id/slug lookup via 'rarityId';
# the lookup's overlapping columns get the '_rarity' suffix
df = df.merge(
    rarities[['id', 'slug']],
    how="left",
    left_on='rarityId',
    right_on='id',
    suffixes=(None, '_rarity'),
)

### Merging 'sets' DF

In [191]:
# left-joining the 'sets' lookup (id, slug, name) via 'cardSetId';
# the lookup's overlapping columns get the '_set' suffix
df = df.merge(
    sets[['id', 'slug', 'name']],
    how="left",
    left_on='cardSetId',
    right_on='id',
    suffixes=(None, '_set'),
)

### Merging 'ctypes' DF

In [192]:
# left-joining the card type id/slug lookup via 'cardTypeId';
# the lookup's overlapping columns get the '_card_type' suffix
df = df.merge(
    ctypes[['id', 'slug']],
    how="left",
    left_on='cardTypeId',
    right_on='id',
    suffixes=(None, '_card_type'),
)

### Merging 'keywords' DF

In [193]:
# removing brackets and commas from keyword id column
# - regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
#   string by default, which would leave the brackets and commas in place
# - raw string so '\]' and '\[' are not parsed as (invalid) Python escapes
df['keywordIds'] = df['keywordIds'].str.replace(r'\]|,|\[', '', regex=True)

# splitting the space-separated ids into one column per keyword slot
kwdf = df['keywordIds'].str.split(' ', expand=True)

# the keyword name merges further down join on exactly these five slot columns
kwdf.columns = ['keywordId1', 'keywordId2', 'keywordId3', 'keywordId4', 'keywordId5']

# appending the slot columns to the main df (rows align on the shared index)
df = pd.concat([df, kwdf], axis=1)

In [141]:
# NOTE: disabled draft - every line in this cell is commented out, so nothing executes
# creating column in main df for each keyword in keywords df
#for kw in keywords.slug:
 #   df['has_' + kw] = 0
    
# NOTE(review): a substring test for '8 ' would also match ids that end in 8 (e.g. '18 ')
#df['has_taunt'] = np.where((df.keywordIds.str.contains('8 ', na = 0)), 1, 0)

In [200]:
# the keywordId1..keywordId5 columns hold strings (they come from a str.split),
# so the lookup key is cast to str to make the join keys comparable
keywords.id = keywords.id.astype(str)

# attaching the keyword id/slug for each of the five keyword slots;
# pass N joins on 'keywordId<N>' and adds 'id_keyword<N>_name' / 'slug_keyword<N>_name'
for slot in range(1, 6):
    df = pd.merge(df, keywords[['id', 'slug']],
                  left_on = f'keywordId{slot}', right_on = 'id', how = "left",
                  suffixes = (None, f'_keyword{slot}_name'))

df.head()

Unnamed: 0,id,collectible,slug,classId,multiClassIds,cardTypeId,cardSetId,rarityId,artistName,manaCost,...,id_keyword1_name,slug_keyword1_name,id_keyword2_name,slug_keyword2_name,id_keyword3_name,slug_keyword3_name,id_keyword4_name,slug_keyword4_name,id_keyword5_name,slug_keyword5_name
0,58607,1,58607-blur,14,[],5,1463,1,Zoltan Boros,0,...,,,,,,,,,,
1,56806,1,56806-shadowhoof-slayer,14,[],4,2,2,A.J. Nazzaro,1,...,8.0,battlecry,,,,,,,,
2,58170,1,58170-crimson-sigil-runner,14,[],4,1414,1,Arthur Bozonnet,1,...,86.0,outcast,,,,,,,,
3,59394,1,59394-demon-companion,14,"[14, 3]",5,1443,3,Zoltan Boros,1,...,,,,,,,,,,
4,59606,1,59606-double-jump,14,[],5,1443,1,A.J. Nazzaro,1,...,86.0,outcast,,,,,,,,


### Checking for proper data types and null counts

In [32]:
# listing each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1289 entries, 0 to 1288
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1289 non-null   int64  
 1   collectible          1289 non-null   int64  
 2   slug                 1289 non-null   object 
 3   classId              1289 non-null   int64  
 4   multiClassIds        1289 non-null   object 
 5   cardTypeId           1289 non-null   int64  
 6   cardSetId            1289 non-null   int64  
 7   rarityId             1289 non-null   int64  
 8   artistName           1288 non-null   object 
 9   manaCost             1289 non-null   int64  
 10  name                 1289 non-null   object 
 11  text                 1271 non-null   object 
 12  image                1289 non-null   object 
 13  imageGold            805 non-null    object 
 14  flavorText           1289 non-null   object 
 15  cropImage            1289 non-null   o

- The following columns will be dropped as they won't be needed for the expected operations of this project
    - id, slug
        - unique identifiers for cards, not needed since the 'name' column provides this while also being easier to reference
    - artistName, image, imageGold, flavorText, cropImage
        - I won't be exploring images, artist names, or flavor text in this iteration of the project
        
        
- Data types are acceptable for expected operations


- Many null values that need to be addressed
    - text
    - duels
    - minion type id
    - health
    - attack
    - keyword ids
    - child ids
    - durability
    - armor

### Dropping columns I won't be using (explained above)

In [5]:
# dropping columns that won't be used
# (reassigning the result instead of using inplace=True keeps the step explicit and chainable)
df = df.drop(columns = ['id', 'slug', 'artistName', 'image', 'imageGold', 'flavorText', 'cropImage'])

# Null Values

### Variable: text

In [6]:
# counting each distinct card text, nulls included
df['text'].value_counts(dropna=False)

NaN                                                                                     18
<b>Taunt</b>                                                                            15
<b>Charge</b>                                                                            7
<b>Spell Damage +1</b>                                                                   6
<b>Stealth</b>                                                                           6
                                                                                        ..
<b>Taunt</b> <b>Battlecry:</b> Summon three random 1-Cost minions for your opponent.     1
<b>Battlecry:</b> If your hero attacked this turn, deal 4 damage.                        1
Has +3 Attack while your hero has Armor.                                                 1
<b>Secret:</b> When a minion attacks your hero, destroy it.                              1
<b>Battlecry:</b> If your deck has no duplicates, summon King Krush.                     1

In [7]:
# filling null text values with 'no_effect'
# (assigning the result back: fillna(inplace=True) on a selected column is chained
#  assignment, which warns in pandas 2.x and does not update df under Copy-on-Write)
df['text'] = df['text'].fillna('no_effect')

### Variable: duels

In [8]:
# counting each distinct duels value, nulls included
df['duels'].value_counts(dropna=False)

{'relevant': True, 'constructed': True}    708
NaN                                        581
Name: duels, dtype: int64

In [9]:
# recoding duels as a flag: 1 where the entry equals the relevant/constructed record,
# 0 for everything else (nulls included)
df['duels'] = np.where(df['duels'].eq("{'relevant': True, 'constructed': True}"), 1, 0)

### Variable: minionTypeId

In [10]:
# counting each distinct minionTypeId value, nulls included
df['minionTypeId'].value_counts(dropna=False)

NaN     968
20.0     75
15.0     63
18.0     51
24.0     50
17.0     32
14.0     25
23.0     18
21.0      6
26.0      1
Name: minionTypeId, dtype: int64

In [11]:
# replacing null minion type ids with 'Neutral'
# NOTE(review): mixing a string fill value with numeric ids in np.where likely turns
# the remaining ids into strings (e.g. '20.0') - confirm that is intended
df['minionTypeId'] = np.where(df['minionTypeId'].isna(), 'Neutral', df['minionTypeId'])

### Variables: health, attack, durability, armor
All of these variables' respective columns have null values since none of these variables apply to every card (examples: only minions have health while only weapons have durability). I'll be leaving the nulls in these particular columns for the reasons listed below.

- I don't want to impute a string such as 'NA' as this will alter the numeric data type of the column
- I don't want to impute a false numeric value as the null state is more accurate in this instance
- I'll be isolating the appropriate card types when exploring this variable so the nulls won't impact exploration
- This project will not include modeling so leaving nulls in these columns won't be an issue for modeling either

However, I will be checking for instances of attributes being applied to the wrong card types and correcting them as needed. For example, a weapon having health or a minion having durability.

### Variable: keywordIds

In [12]:
# counting each distinct keywordIds value, nulls included
df['keywordIds'].value_counts(dropna=False)

NaN           412
[8]           216
[12]           57
[1]            52
[5]            35
             ... 
[2, 14]         1
[1, 3, 12]      1
[53, 78]        1
[12, 15]        1
[14, 53]        1
Name: keywordIds, Length: 150, dtype: int64

In [13]:
# filling nulls with "no_keyword"
# (assigning the result back: fillna(inplace=True) on a selected column is chained
#  assignment, which warns in pandas 2.x and does not update df under Copy-on-Write)
df['keywordIds'] = df['keywordIds'].fillna("no_keyword")

### Variable: childIds

In [14]:
# counting each distinct childIds value, nulls included
df['childIds'].value_counts(dropna=False)

NaN                                                                       994
[52897, 52900, 53160, 53161, 53162, 53163, 55378, 60588, 64652, 64653]     15
[59723]                                                                     8
[53921]                                                                     3
[56927]                                                                     3
                                                                         ... 
[57498]                                                                     1
[55808, 57432, 57434]                                                       1
[54488]                                                                     1
[488]                                                                       1
[56164, 56165, 56167, 56168, 56169, 56170, 56171, 56173, 56175]             1
Name: childIds, Length: 261, dtype: int64

In [15]:
# filling nulls with "no_childid"
# (assigning the result back: fillna(inplace=True) on a selected column is chained
#  assignment, which warns in pandas 2.x and does not update df under Copy-on-Write)
df['childIds'] = df['childIds'].fillna("no_childid")