# Hearthstone Project
***
***

# Goals
- Thoroughly prepare so that it is ready for exploration and modeling
    - I won't be creating any models for this project but I'd still like to prepare the data nonetheless
- Explore the data to gather insights about the characteristics of the game's different class types

# Setup
***

In [1]:
# establishing environment
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# Acquire
Acquiring data from local csv files
***

In [2]:
# reading in card data and saving as DF
cards = pd.read_csv('hearthstone_standard_cards.csv')

# reading in card classes and saving as DF
classes = pd.read_csv('classes.csv')

# reading in minion types data and saving as DF
mtypes = pd.read_csv('minionTypes.csv')

# reading in rarities data and saving as DF
rarities = pd.read_csv('rarities.csv')

# reading in set groups data and saving as DF
setgroups = pd.read_csv('setGroups.csv')

# reading in card sets data and saving as DF
sets = pd.read_csv('sets.csv')

# reading in types data and saving as DF
ctypes = pd.read_csv('types.csv')

# reading in keywords data and saving as DF
keywords = pd.read_csv('keywords.csv')

# Prepare
Preparing data for exploration
***

## Preparing DataFrames

### Lowercasing all DF columns

In [3]:
# lowercasing cards DF columns
cards.columns = cards.columns.str.lower()

# creating list of all DFs besides cards
df_list = [classes, mtypes, rarities, setgroups, sets, ctypes, keywords]

# iterating through DFs
# lowercasing all column names, dropping original name column, renaming slug to name column
for dtafrm in df_list:
        dtafrm.columns = dtafrm.columns.str.lower()
        dtafrm.drop(columns = 'name')
        dtafrm.rename(columns = {"slug": "name"}, inplace = True)       

## Merging DataFrames

### Merging 'classes' DF

In [4]:
# removing brackets and commas from multiclassids column
cards.multiclassids = cards.multiclassids.str.replace('\]|,|\[' , '')

# creating column to hold primary class id 
# if card is of one class, this will reflect its sole class
# if card is dual, this will reflect the 1st of the two classes in the multiClassIds column
# necessary since dual class cards erroneously hold the 'neutral' class value in their primary class id 
cards['primeclassid'] = np.where((cards.multiclassids.str.contains(' ')), cards["multiclassids"].str.split(" ", expand = True)[0], cards.classid)

# converting key columns to make all value data types match
cards.primeclassid = cards.primeclassid.astype(str)
classes.id = classes.id.astype(str)

# merging 'classes' df with card df
df = pd.merge(cards, classes[['id', 'name']], 
              left_on = 'primeclassid', right_on = 'id', how="left", 
              suffixes = (None, '_prime_hero_class'))

# dropping columns I no longer need
df.drop(columns = ['primeclassid', 'classid'], inplace = True)

### Merging 'mtypes' DF

In [5]:
# changing null values of minionTypeId for neutral minions to -1
df['miniontypeid'] = np.where((df.miniontypeid.isnull() == True) & (df.cardtypeid == 4), -1, df.miniontypeid)

# adding missing keyword data to 'keywords' df
# -1 is for minions with no tribe
mtypes.loc[len(mtypes.index)] = ['no tribe', -1, 'No Tribe']

# merging 'mtypes' df
df = pd.merge(df, mtypes[['id', 'name']], 
              left_on = 'miniontypeid', right_on = 'id', how="left", 
              suffixes = (None, '_minion_type'))

# dropping column I no longer need
df.drop(columns = ['miniontypeid'], inplace = True)

### Merging 'rarities' DF

In [6]:
# merging 'rarities' df
df = pd.merge(df, rarities[['id', 'name']], 
              left_on = 'rarityid', right_on = 'id', how="left", 
              suffixes = (None, '_rarity'))

# dropping column I no longer need
df.drop(columns = ['rarityid'], inplace = True)

### Merging 'setGroups' DF

In [7]:
# merging 'setgroups' df
df = pd.merge(df, sets[['id', 'name']], 
              left_on = 'cardsetid', right_on = 'id', how="left", 
              suffixes = (None, '_set'))

# dropping column I no longer need
df.drop(columns = ['cardsetid'], inplace = True)

### Merging 'ctypes' DF

In [8]:
# merging 'ctypes' df
df = pd.merge(df, ctypes[['id', 'name']], 
              left_on = 'cardtypeid', right_on = 'id', how="left", 
              suffixes = (None, '_card_type'))

# dropping column I no longer need
df.drop(columns = ['cardtypeid'], inplace = True)

### Merging 'keywords' DF

In [13]:
keywords

Unnamed: 0,id,name,name.1,reftext,text
0,1,taunt,Taunt,Enemies must attack minions that have Taunt.,Enemies must attack this minion.
1,2,spellpower,Spell Damage,Your spells deal extra damage.,Your spell cards deal extra damage.
2,3,divine-shield,Divine Shield,"The first time a Shielded minion takes damage,...","The first time this minion takes damage, ignor..."
3,4,charge,Charge,Can attack immediately.,Can attack immediately.
4,5,secret,Secret,Hidden until a specific action occurs on your ...,Hidden until a specific action occurs on your ...
5,6,stealth,Stealth,Can't be attacked or targeted until it attacks.,Can't be attacked or targeted until it attacks.
6,8,battlecry,Battlecry,Does something when you play it from your hand.,Does something when you play it from your hand.
7,10,freeze,Freeze,Frozen characters lose their next attack.,Frozen characters lose their next attack.
8,11,windfury,Windfury,Can attack twice each turn.,Can attack twice each turn.
9,12,deathrattle,Deathrattle,Does something when it dies.,Does something when it dies.


In [9]:
# adding missing keyword data to 'keywords' df
keywords.loc[len(keywords.index)] = ['64', 'start-of-game', 'start of game', 
                                     'does something at the start of the game.', 
                                     'does something at the start of the game.']

# removing brackets and commas from keyword id column
df.keywordids = df.keywordids.str.replace('\]|,|\[' , '')

# splitting keyword ids into separate columns for each card
kwdf = df["keywordids"].str.split(" ", expand = True) 

# renaming columns
kwdf.columns = ['keywordid1', 'keywordid2', 'keywordid3', 'keywordid4', 'keywordid5']

# concatenating split keyword id columns with main df
df = pd.concat([df, kwdf], axis=1)

# converting keywords id column to str type to enable merge
keywords.id = keywords.id.astype(str)

# creating loop to add a column for the text name of each keyword ability of each card
# via merging with keywords DF
for x in kwdf.columns:
    df = pd.merge(df, keywords[['id', 'name']], 
              left_on = x, right_on = 'id', how = "left",
              suffixes = (None, x + '_name'))

ValueError: Buffer has wrong number of dimensions (expected 1, got 0)

## Checking for duplicate rows

In [None]:
# checking number of rows in current DF
df.shape

In [None]:
# Checking number of rows if duplicates were dropped
df.drop_duplicates().shape

- No duplicates found

### Checking for proper data types, categorical columns (based on domain knowledge), and null counts

In [None]:
df.info()

- The following columns will be dropped as they won't be needed for the expected operations of this project
    - id, slug
        - unique identifiers for cards, not needed since the 'name' column provides this while also being easier to reference
    - artistname, image, imagegold, cropimage
        - I won't be exploring images or artist names in this iteration of the project
    - collectible
        - Only 1 value, no nulls, doesn't distinguish any cards
    - all columns reflecting key words with the exception of the boolean columns and the 'slug_keyword#_name' columns
        - The exempted columns are sufficient for the project's expected operations
     
     
- Based on my domain knowledge of the game, I'm inferring that several of the columns are categorical
    - I need to create boolean columns for categorical columns (rarity, card set, etc.)


- Many null values that need to be addressed
    - text
    - duels
    - minion type id
    - health
    - attack
    - child ids
    - durability
    - armor

### Dropping columns that aren't needed for the planned operations of this project

In [None]:
# creating list of columns to drop
columns_to_drop = ['id', 'slug', 'artistname', 'image', 'imagegold', 'flavortext', 'cropimage', 'collectible']

# dropping columns
df.drop(columns = columns_to_drop, inplace = True)

# Addressing Null Values

### Addressing nulls in 'text' column

In [None]:
# checking values in text box
df.text.value_counts(dropna = False)

In [None]:
# filling null text values with 'no effect'
df["text"].fillna("no effect", inplace = True) 

### Addressing nulls in 'duels' column

In [None]:
# checking duels values
df.duels.value_counts(dropna = False)

In [None]:
# updating duels column so that cards that were allowed in duels have value of 1 and 0 otherwise
df['duels'] = np.where((df.duels == "{'relevant': True, 'constructed': True}"), 1, 0)

df.rename(columns={'duels':'in_duels'}, inplace=True)

### Addressing nulls in 'id_minion_type', and 'slug_minion_type' columns

In [None]:
# checking minontypeId values
df.id_minion_type.value_counts(dropna = False)

In [None]:
# converting nulls, aka non-minion cards to 'not a minion' type
df['id_minion_tribe'] = np.where((df.id_minion_type.isnull() == True), 'not a minion', df.id_minion_type)
df['name_minion_tribe'] = np.where((df.name_minion_type.isnull() == True), 'not a minion', df.name_minion_type)

# dropping minionTypeId since id_minion_type suffices
df.drop(columns = ['id_minion_type', 'name_minion_type'], inplace = True)

### Addressing nulls in 'childIds' column

In [None]:
# checking childIds values
df.childids.value_counts(dropna = False)

In [None]:
# filling nulls with "no_childid"
df.childids.fillna("no_childid", inplace = True) 

### Addressing nulls in 'health', 'attack', 'durability', and 'armor' columns
All of these variables respective columns have null values since none of these variables apply to every card (examples: only minions have health while only weapons have durability). For the time being I'll fill these nulls 
with a value that represents infinity. The benefit of this method is that it allows me to fill the nulls while preserving the int64 data type of the column. Furthermore, no matter what value blizzard assigns to these variables in future cards, this value probably wouldn't be used. If this causes issues later I'll employ a different means of handling them. 

In [None]:
# creating list of column names
hada = ['health', 'attack', 'durability', 'armor']

# iterating through columns filling nulls within each
for att in hada:
    df[att].fillna(float('inf'), inplace = True)

# Creating boolean columns for categorical variables

### Creating boolean columns for 'keywords'

In [None]:
# loop iterates through each keyword and creates a boolean column for it
for kw in keywords.name:
    df['has_' + kw] = np.where(
    (df.namekeywordid1_name == kw) |
    (df.namekeywordid2_name == kw) |
    (df.namekeywordid3_name == kw) |
    (df.namekeywordid4_name == kw) |
    (df.namekeywordid5_name == kw), 1, 0)
    
# creating empty list
key_word_col_drop = []

# iterating through columns in df and creating list of columns to drop
for col in df.columns:
    if 'keywordid' in col:
        key_word_col_drop.append(col)
        
# dropping columns
df.drop(columns = key_word_col_drop, inplace = True)

### Creating boolean columns for 'hero classes'

In [None]:
# removing brackets and commas from multiclassids column
df.multiclassids = df.multiclassids.str.replace('\]|,|\[' , '')

# creating column that holds secondary class separate from primary class
df['id_second_hero_class'] = df["multiclassids"].str.split(" ", expand = True)[1]

# converting column to str type to enable merge with newly created column 'id_second_hero_class'
classes.id = classes.id.astype(str)

# creating df containing columns for merge in order to rename before merge without altering original classes DF
classes2 = classes[['id', 'name']]

# renaming columns
classes2.columns = ['id_second_hero_class', 'name_second_hero_class']

# merging 'classes' on secondary hero class id to get secondary class names
df = pd.merge(df, classes2[['id_second_hero_class', 'name_second_hero_class']], 
              on = 'id_second_hero_class', how = "left")

# creating boolean columns for each hero class
for c in classes.name:
    df['is_' + c] = np.where(
    (df.name_prime_hero_class == c) | (df.name_second_hero_class == c), 1, 0)

# filling nulls in new columns
df['name_second_hero_class'].fillna('mono class', inplace = True)
df['id_second_hero_class'].fillna('mono class', inplace = True)

In [None]:
df.info() 