# Hearthstone Project
***
***

# Goals
- Explore the data to gather insights about the characteristics of the game's different class types

# Setup
***

In [None]:
# establishing environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Acquire
Acquiring data from local csv files
***

In [None]:
# loading each local csv file into its own DataFrame
# the file order below lines up with the names on the left-hand side:
# card data, card classes, minion types, rarities, set groups, card sets, card types, keywords
cards, classes, mtypes, rarities, setgroups, sets, ctypes, keywords = [
    pd.read_csv(csv_file)
    for csv_file in (
        'hearthstone_standard_cards.csv',
        'classes.csv',
        'minionTypes.csv',
        'rarities.csv',
        'setGroups.csv',
        'sets.csv',
        'types.csv',
        'keywords.csv',
    )
]

# Prepare
Preparing data for exploration
***

## Preparing DataFrames

### Lowercasing all DF columns

In [None]:
# cards DF: column labels go to lowercase first so the columns below can be addressed in lowercase
cards.columns = cards.columns.str.lower()

# cards DF: the free-text columns (card text and card name) also get lowercase values
for text_col in ('text', 'name'):
    cards[text_col] = cards[text_col].str.lower()

# every lookup DF (everything except cards) receives the same three-step cleanup
df_list = [classes, mtypes, rarities, setgroups, sets, ctypes, keywords]

for lookup in df_list:
    # 1) lowercase the column labels
    lookup.columns = lookup.columns.str.lower()
    # 2) discard the original 'name' column
    lookup.drop(columns=['name'], inplace=True)
    # 3) the slug takes over as the 'name' column
    lookup.rename(columns={'slug': 'name'}, inplace=True)

## Merging DataFrames

### Merging 'classes' DF

In [None]:
# removing brackets and commas from multiclassids column
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently leave every bracket and comma in place and break the class merge below
cards.multiclassids = cards.multiclassids.str.replace(r'[\[\],]', '', regex=True)

# flagging dual class cards: once cleaned, only their multiclassids value still contains a space
# na=False sends cards with a missing multiclassids value down the single-class path
is_dual_class = cards.multiclassids.str.contains(' ', regex=False, na=False)

# creating column to hold primary class id
# if card is of one class, this will reflect its sole class
# if card is dual, this will reflect the 1st of the two classes in the multiClassIds column
# necessary since dual class cards erroneously hold the 'neutral' class value in their primary class id
cards['primeclassid'] = np.where(is_dual_class,
                                 cards["multiclassids"].str.split(" ", expand=True)[0],
                                 cards.classid)

# converting key columns to make all value data types match
cards.primeclassid = cards.primeclassid.astype(str)
classes.id = classes.id.astype(str)

# merging 'classes' df with card df
# cards already has 'id' and 'name' columns, so the class columns arrive as
# 'id_prime_hero_class' and 'name_prime_hero_class'
df = pd.merge(cards, classes[['id', 'name']],
              left_on='primeclassid', right_on='id', how="left",
              suffixes=(None, '_prime_hero_class'))

# dropping columns I no longer need
df.drop(columns=['primeclassid', 'classid'], inplace=True)

### Merging 'mtypes' DF

In [None]:
# cards with cardtypeid 4 (minions, per the original notes) that have no minionTypeId
# receive the placeholder id -1
tribeless_minion = df.miniontypeid.isnull() & (df.cardtypeid == 4)
df['miniontypeid'] = np.where(tribeless_minion, -1, df.miniontypeid)

# appending the matching placeholder row to the 'mtypes' lookup DF
# -1 is for minions with no tribe
# NOTE(review): values are assigned by position, so this assumes mtypes columns are ordered (name, id) - confirm
mtypes.loc[len(mtypes)] = ['no_tribe', -1]

# attaching the minion type id/name to each card, then discarding the key column that is no longer needed
df = df.merge(mtypes[['id', 'name']],
              left_on='miniontypeid', right_on='id', how='left',
              suffixes=(None, '_minion_type')).drop(columns=['miniontypeid'])

### Merging 'rarities' DF

In [None]:
# attaching rarity id/name to each card through rarityid (left join keeps every card row),
# then discarding the key column that is no longer needed
df = df.merge(rarities[['id', 'name']],
              left_on='rarityid', right_on='id', how='left',
              suffixes=(None, '_rarity')).drop(columns=['rarityid'])

### Merging 'sets' DF

In [None]:
# set names come from the slug column and use dashes; swapping them for underscores
sets['name'] = sets['name'].str.replace('-', '_')

# attaching set id/name to each card through cardsetid
# (the lookup used here is the 'sets' DF, not 'setgroups'),
# then discarding the key column that is no longer needed
df = df.merge(sets[['id', 'name']],
              left_on='cardsetid', right_on='id', how='left',
              suffixes=(None, '_set')).drop(columns=['cardsetid'])

### Merging 'ctypes' DF

In [None]:
# attaching card type id/name to each card through cardtypeid (left join keeps every card row),
# then discarding the key column that is no longer needed
df = df.merge(ctypes[['id', 'name']],
              left_on='cardtypeid', right_on='id', how='left',
              suffixes=(None, '_card_type')).drop(columns=['cardtypeid'])

### Merging 'keywords' DF

In [None]:
# replacing dashes with underscores in names (plain literal replacement, no regex needed)
keywords.name = keywords.name.str.replace('-', '_', regex=False)

# adding missing keyword data to 'keywords' df
# NOTE(review): values are assigned by position, so this assumes keywords has exactly four columns
# with id first and name second - confirm against keywords.csv
keywords.loc[len(keywords.index)] = ['64', 'start_of_game',
                                     'does something at the start of the game.',
                                     'does something at the start of the game.']

# removing brackets and commas from keyword id column
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the brackets/commas attached to the ids and make every keyword merge miss
df.keywordids = df.keywordids.str.replace(r'[\[\],]', '', regex=True)

# splitting keyword ids into separate columns for each card
kwdf = df["keywordids"].str.split(" ", expand=True)

# renaming columns (this expects the split to produce exactly five columns)
kwdf.columns = ['keywordid1', 'keywordid2', 'keywordid3', 'keywordid4', 'keywordid5']

# concatenating split keyword id columns with main df
df = pd.concat([df, kwdf], axis=1)

# converting keywords id column to str type to enable merge (the split ids are strings)
keywords.id = keywords.id.astype(str)

# creating loop to add a column for the text name of each keyword ability of each card
# via merging with keywords DF
# df already holds 'id' and 'name', so each pass adds 'id<x>_name' and 'name<x>_name'
# (e.g. 'namekeywordid1_name')
for x in kwdf.columns:
    df = pd.merge(df, keywords[['id', 'name']],
                  left_on=x, right_on='id', how="left",
                  suffixes=(None, x + '_name'))

## Checking for duplicate rows

In [None]:
# checking number of rows in current DF
# shape is a (rows, columns) tuple; the row count is the baseline for the duplicate check
df.shape

In [None]:
# Checking number of rows if duplicates were dropped
# drop_duplicates() returns a new DF here (df itself is not modified);
# a row count equal to df.shape's means there are no fully duplicated rows
df.drop_duplicates().shape

- No duplicates found

### Checking for proper data types, categorical columns (based on domain knowledge), and null counts

In [None]:
# printing each column's name, non-null count, and dtype to review data types and null counts
df.info()

- The following columns will be dropped as they won't be needed for the expected operations of this project
    - id, slug
        - unique identifiers for cards, not needed since the 'name' column provides this while also being easier to reference
    - artistname, image, imagegold, cropimage
        - I won't be exploring images or artist names in this iteration of the project
    - collectible
        - Only 1 value, no nulls, doesn't distinguish any cards
    - all columns reflecting key words with the exception of the boolean columns and the 'slug_keyword#_name' columns
        - The exempted columns are sufficient for the project's expected operations
     
     
- Based on my domain knowledge of the game, I'm inferring that several of the columns are categorical
    - I need to create boolean columns for categorical columns (rarity, card set, etc.)


- Many null values that need to be addressed
    - text
    - duels
    - minion type id
    - health
    - attack
    - child ids
    - durability
    - armor

### Dropping columns that aren't needed for the planned operations of this project

In [None]:
# columns that aren't needed for the planned operations of this project
columns_to_drop = [
    'id',
    'slug',
    'artistname',
    'image',
    'imagegold',
    'flavortext',
    'cropimage',
    'collectible',
]

# removing all of them from the main DF in one call
df = df.drop(columns=columns_to_drop)

# Addressing Null Values

### Addressing nulls in 'text' column

In [None]:
# checking values in text box
# dropna = False keeps the null count in the output so missing card text is visible
df.text.value_counts(dropna = False)

In [None]:
# filling null text values with 'no effect'
# assigning the result back instead of calling fillna(inplace=True) on the selected column:
# the chained inplace form is deprecated and does not update df under pandas Copy-on-Write
df["text"] = df["text"].fillna("no effect")

### Addressing nulls in 'duels' column

In [None]:
# checking duels values
# dropna = False keeps the null count in the output alongside the distinct values
df.duels.value_counts(dropna = False)

In [None]:
# a card counts as allowed in duels only when its duels value is exactly this string;
# every other value (nulls included) becomes 0
allowed_in_duels = df.duels == "{'relevant': True, 'constructed': True}"
df['duels'] = np.where(allowed_in_duels, 1, 0)

# the column now holds a 1/0 flag, so it gets a name that says so
df = df.rename(columns={'duels': 'in_duels'})

### Addressing nulls in 'id_minion_type' and 'name_minion_type' columns

In [None]:
# checking minion type id values
# dropna = False keeps the null count in the output alongside the distinct ids
df.id_minion_type.value_counts(dropna = False)

In [None]:
# building the '*_minion_tribe' columns from the '*_minion_type' columns:
# nulls (aka non-minion cards) become 'not a minion', every other value is carried over
tribe_columns = (
    ('id_minion_type', 'id_minion_tribe'),
    ('name_minion_type', 'name_minion_tribe'),
)
for type_col, tribe_col in tribe_columns:
    df[tribe_col] = np.where(df[type_col].isnull(), 'not a minion', df[type_col])

# the '*_minion_type' columns are now superseded by the '*_minion_tribe' columns
df = df.drop(columns=['id_minion_type', 'name_minion_type'])

### Addressing nulls in 'childIds' column

In [None]:
# checking childIds values
# dropna = False keeps the null count in the output alongside the distinct values
df.childids.value_counts(dropna = False)

In [None]:
# filling nulls with "no_childid"
# assigning the result back instead of calling fillna(inplace=True) on the selected column:
# the chained inplace form is deprecated and does not update df under pandas Copy-on-Write
df["childids"] = df["childids"].fillna("no_childid")

### Addressing nulls in 'health', 'attack', 'durability', and 'armor' columns
All of these variables' respective columns have null values since none of these variables apply to every card (examples: only minions have health while only weapons have durability). For the time being I'll fill these nulls 
with a value that represents infinity. The benefit of this method is that it allows me to fill the nulls while keeping the columns numeric (note that infinity is a float, so the columns are stored as float64 rather than int64). Furthermore, no matter what value Blizzard assigns to these variables in future cards, this value probably wouldn't be used. If this causes issues later I'll employ a different means of handling them. 

In [None]:
# creating list of column names
hada = ['health', 'attack', 'durability', 'armor']

# filling nulls in all four columns with infinity in a single assignment
# assigning the result back instead of calling fillna(inplace=True) on each selected column:
# the chained inplace form is deprecated and does not update df under pandas Copy-on-Write
# float('inf') is a float, so these columns hold floating point values afterwards
df[hada] = df[hada].fillna(float('inf'))

# Creating boolean columns for categorical variables

### Creating boolean columns for 'keywords'

In [None]:
# the five columns that hold the text name of each keyword slot of a card
keyword_name_cols = [
    'namekeywordid1_name',
    'namekeywordid2_name',
    'namekeywordid3_name',
    'namekeywordid4_name',
    'namekeywordid5_name',
]

# one 'has_<keyword>' column per keyword: 1 when any of the five slots equals the keyword, else 0
for keyword_name in keywords.name:
    in_any_slot = df[keyword_name_cols].eq(keyword_name).any(axis=1)
    df['has_' + keyword_name] = np.where(in_any_slot, 1, 0)

# the boolean columns replace every column whose label contains 'keywordid',
# so those are gathered up and dropped
key_word_col_drop = [label for label in df.columns if 'keywordid' in label]
df.drop(columns=key_word_col_drop, inplace=True)

### Creating boolean columns for 'hero classes'

In [None]:
# removing brackets and commas from multiclassids column
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently leave any bracket or comma in place
df.multiclassids = df.multiclassids.str.replace(r'[\[\],]', '', regex=True)

# creating column that holds secondary class separate from primary class
# (the second space-separated id; missing for cards that only list one class)
df['id_second_hero_class'] = df["multiclassids"].str.split(" ", expand=True)[1]

# converting column to str type to enable merge with newly created column 'id_second_hero_class'
classes.id = classes.id.astype(str)

# creating a renamed copy of the merge columns so the original classes DF is left untouched
# rename() returns a new DF, which avoids assigning .columns on a slice of classes
classes2 = classes[['id', 'name']].rename(
    columns={'id': 'id_second_hero_class', 'name': 'name_second_hero_class'})

# merging 'classes' on secondary hero class id to get secondary class names
df = pd.merge(df, classes2[['id_second_hero_class', 'name_second_hero_class']],
              on='id_second_hero_class', how="left")

# creating boolean columns for each hero class
# a card counts for a class when it is either its primary or its secondary class
for c in classes.name:
    df['is_' + c] = np.where(
        (df.name_prime_hero_class == c) | (df.name_second_hero_class == c), 1, 0)

# filling nulls in new columns
# assigning the result back instead of calling fillna(inplace=True) on the selected column:
# the chained inplace form is deprecated and does not update df under pandas Copy-on-Write
df['name_second_hero_class'] = df['name_second_hero_class'].fillna('monoclass')
df['id_second_hero_class'] = df['id_second_hero_class'].fillna('monoclass')

### Creating boolean column for multiclass cards

In [None]:
# 1 = multiclass, 0 = monoclass
# a space in multiclassids is enough to tell: only multiclass cards have one in this value
lists_two_classes = df.multiclassids.str.contains(' ')
df['is_multiclass'] = np.where(lists_two_classes, 1, 0)

# multiclassids has served its purpose and is removed
df = df.drop(columns=['multiclassids'])

### Creating boolean column for cards with child ids

In [None]:
# 1 = card has childids, 0 = card has no childids
# a comma in childids is used as the signal: per the original notes, only cards with
# a comma in this value have childids
lists_children = df.childids.str.contains(',')
df['has_child_ids'] = np.where(lists_children, 1, 0)

# childids has served its purpose and is removed
df = df.drop(columns=['childids'])

### Creating boolean columns for rarity levels

In [None]:
# one 'is_<rarity>' column per rarity level (common, rare, epic, etc):
# 1 when the card's name_rarity equals that level, otherwise 0
for rarity_name in rarities.name:
    matches_rarity = df.name_rarity.eq(rarity_name)
    df['is_' + rarity_name] = np.where(matches_rarity, 1, 0)

### Creating boolean columns for card sets

In [None]:
# one 'is_<set>' column per card set:
# 1 when the card's name_set equals that set name, otherwise 0
for card_set in sets.name:
    matches_set = df.name_set.eq(card_set)
    df['is_' + card_set] = np.where(matches_set, 1, 0)

### Creating boolean columns for card type

In [None]:
# one 'is_<card type>' column per card type:
# 1 when the card's name_card_type equals that type, otherwise 0
for card_type in ctypes.name:
    matches_type = df.name_card_type.eq(card_type)
    df['is_' + card_type] = np.where(matches_type, 1, 0)

# the card type id is no longer needed
df = df.drop(columns=['id_card_type'])

### Creating boolean columns for minion tribe

In [None]:
# one 'is_<tribe>' column per minion tribe:
# 1 when the card's name_minion_tribe equals that tribe, otherwise 0
for tribe in mtypes.name:
    matches_tribe = df.name_minion_tribe.eq(tribe)
    df['is_' + tribe] = np.where(matches_tribe, 1, 0)

# the minion tribe id is no longer needed
df = df.drop(columns=['id_minion_tribe'])

## Misc. Prep Updates

### Dropping boolean columns with all 0 values
I know that some of the sets and keywords that were turned into booleans are not currently in standard (the format the collection of the cards in the data are part of). These columns will be completely filled with 0s so I'm going to drop them.

In [None]:
# True for every column in which each single value is 0
only_zeros = df.isin([0]).all()

# positional index values of those columns (np.where returns them wrapped in a tuple)
all_0_cols = np.where(only_zeros)

# translating the positions back into column labels and dropping them
df.drop(columns=df.columns[all_0_cols[0]], inplace=True)

### Adding rows for dual class cards with prime and secondary class swapped
Adding these rows will make it easier to perform certain operations such as grouping and plotting.

In [None]:
# selecting all dual class cards
# notna() is checked as well so that a secondary class that was never filled with 'monoclass'
# is not mistaken for a dual class card
is_dual_class = df.name_second_hero_class.notna() & (df.name_second_hero_class != 'monoclass')
dcc = df[is_dual_class]

# explicit copy, so the swap below writes to an independent DF rather than to a slice of df
dcc2 = dcc.copy()

# swapping primary and secondary hero class values (names and ids), reading from the untouched dcc
dcc2['name_prime_hero_class'] = dcc['name_second_hero_class']
dcc2['name_second_hero_class'] = dcc['name_prime_hero_class']
dcc2['id_prime_hero_class'] = dcc['id_second_hero_class']
dcc2['id_second_hero_class'] = dcc['id_prime_hero_class']

# adding new rows to main df and resetting index in one step
df = pd.concat([df, dcc2], ignore_index=True)

### Adding column that holds count of words in card name

In [None]:
# number of space-separated words in each card name, stored as a new variable
# each name is passed through str() first, so non-string values are counted too
df['name_word_count'] = [len(str(card_name).split(' ')) for card_name in df.name]

### Adjusting column order

In [None]:
# adjusting order of columns
# the final order is assembled from smaller lists, grouped roughly by the prep step that produced them

# general card attributes
card_cols = ['manacost', 'name', 'name_word_count', 'text', 'in_duels', 'has_child_ids',
             'health', 'attack', 'durability', 'armor']

# id/name pairs and names that came from the merged lookup DFs
lookup_cols = ['id_prime_hero_class', 'name_prime_hero_class',
               'id_second_hero_class', 'name_second_hero_class',
               'id_rarity', 'name_rarity', 'id_set', 'name_set',
               'name_card_type', 'name_minion_tribe']

# keyword boolean columns
keyword_cols = ['has_taunt', 'has_spellpower', 'has_divine_shield', 'has_charge',
                'has_secret', 'has_stealth', 'has_battlecry', 'has_freeze',
                'has_windfury', 'has_deathrattle', 'has_combo', 'has_overload',
                'has_silence', 'has_counter', 'has_immune', 'has_discover',
                'has_quest', 'has_poisonous', 'has_lifesteal', 'has_rush',
                'has_evilzug', 'has_twinspell', 'has_mega_windfury', 'has_reborn',
                'has_empower', 'has_outcast', 'has_spellburst', 'has_sidequest',
                'has_corrupt', 'has_start_of_game']

# hero class boolean columns, followed by the multiclass flag
class_cols = ['is_demonhunter', 'is_druid', 'is_hunter', 'is_mage', 'is_paladin',
              'is_priest', 'is_rogue', 'is_shaman', 'is_warlock', 'is_warrior',
              'is_neutral', 'is_multiclass']

# rarity boolean columns
rarity_cols = ['is_common', 'is_free', 'is_rare', 'is_epic', 'is_legendary']

# card set boolean columns
set_cols = ['is_madness_at_the_darkmoon_faire', 'is_scholomance_academy',
            'is_demonhunter_initiate', 'is_ashes_of_outland', 'is_galakronds_awakening',
            'is_descent_of_dragons', 'is_saviors_of_uldum', 'is_rise_of_shadows',
            'is_classic', 'is_basic']

# card type boolean columns
card_type_cols = ['is_hero', 'is_minion', 'is_spell', 'is_weapon']

# minion tribe boolean columns
tribe_cols = ['is_murloc', 'is_demon', 'is_mech', 'is_elemental', 'is_beast',
              'is_totem', 'is_pirate', 'is_dragon', 'is_all', 'is_no_tribe']

final_column_order = (card_cols + lookup_cols + keyword_cols + class_cols
                      + rarity_cols + set_cols + card_type_cols + tribe_cols)

df = df[final_column_order]

# Preparation Phase Summary
- Lowercased all column names, plus the card name and text values

- Merged all dataFrames into single dataframe

- Added missing keyword value (start of game)

- Added rows for dual class cards with primary and secondary hero class values swapped

- Changed all dashes in column names to underscores

- Checked for duplicate rows, none found

- Filled nulls with different values based on column and intended operations

- Created boolean columns for categorical values

- Dropped columns that were unuseful for the operations of this project

- Reordered columns

# Explore
Exploring the data to identify the characteristics of each class, including neutral

## Mana Costs

In [None]:
# average mana cost per primary hero class, sorted from lowest to highest
# 'manacost' is selected before calling mean(): averaging every column would raise a TypeError
# on the text columns in pandas >= 2.0, and only manacost is needed here anyway
avgmana = (
    df.groupby('name_prime_hero_class')['manacost']
    .mean()
    .sort_values(ascending=True)
    .to_frame()
)

avgmana

In [None]:
# creating dictionary that holds each class as a key with a color for each
# can be used to specify the color of each classes' bar or representative figure in a plot
colord = {'rogue' : 'black', 'shaman' : 'darkblue', 'hunter' : 'lime', 'warrior' : 'red', 'priest' : 'white', 
          'paladin' : 'gold', 'mage' : 'deepskyblue', 'warlock' : 'purple', 'demonhunter' : 'darkgreen',
          'druid' : 'saddlebrown', 'neutral' : 'pink'}

# creating plot
plt.rcParams["figure.figsize"] = (10,7)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 14)
plt.title("Mana Costs Don't Vary Significantly Per Class", fontsize = 20)
# horizontal bars: one per hero class (the index of avgmana), bar length = average mana cost
# the y values come from avgmana.index; the previous 'mana.index' referred to a name that is never defined
ax = sns.barplot(x = "manacost", y = avgmana.index, data = avgmana, palette = colord, edgecolor = 'black')
ax.set_xlabel('Average Mana Cost', fontsize = 15)
ax.set_ylabel('Hero Class', fontsize = 17)

- Average mana costs range from ~3 to ~4 mana so there is very little variation 
- However, Rogue does have the lowest average mana cost, ~3
- Neutral has the highest average mana cost, ~4