In [1]:
# Data manipulation, wrangling and plotting packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

# Machine-learning model packages
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from xgboost import XGBClassifier as XGBC

# Data processing, model selection and evaluation packages
import sklearn.utils 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, confusion_matrix, classification_report

# Let pandas render up to 50 columns before it starts eliding wide frames.
pd.set_option('display.max_columns', 50)

In [2]:
# Read the three raw CSV tables (paths are relative to the notebook's working
# directory) and bind them, in order, to the names used by the rest of the notebook.
battles, char_deaths, char_pred = (
    pd.read_csv(csv_name)
    for csv_name in ('battles.csv', 'character-deaths.csv', 'character-predictions.csv')
)

In [4]:
# Culture aliases, taken from Shail Daliwala's kernel on Kaggle.
# Each key is the canonical culture name; its list holds the lowercase
# spellings that get_cult() folds into that name.
cult = {
    'Summer Islands': ['summer islands', 'summer islander', 'summer isles'],
    'Ghiscari': ['ghiscari', 'ghiscaricari',  'ghis'],
    'Asshai': ["asshai'i", 'asshai'],
    'Lysene': ['lysene', 'lyseni'],
    'Andal': ['andal', 'andals'],
    'Braavosi': ['braavosi', 'braavos'],
    'Dornish': ['dornishmen', 'dorne', 'dornish'],
    'Myrish': ['myr', 'myrish', 'myrmen'],
    'Westermen': ['westermen', 'westerman', 'westerlands'],
    'Westerosi': ['westeros', 'westerosi'],
    'Stormlander': ['stormlands', 'stormlander'],
    'Norvoshi': ['norvos', 'norvoshi'],
    'Northmen': ['the north', 'northmen'],
    'Free Folk': ['wildling', 'first men', 'free folk'],
    'Qartheen': ['qartheen', 'qarth'],
    'Reach': ['the reach', 'reach', 'reachmen'],
}

# Reverse index (alias -> canonical name), built once so every lookup is a
# single dict access instead of a scan over all of `cult`.
# setdefault keeps the first canonical name should an alias ever be listed
# under two cultures, which is what a first-match scan of `cult` would return.
# NOTE: this is a snapshot; re-run the cell after editing `cult`.
_CULT_BY_ALIAS = {}
for _canonical, _aliases in cult.items():
    for _alias in _aliases:
        _CULT_BY_ALIAS.setdefault(_alias, _canonical)


def get_cult(value):
    """Map a raw culture label to its canonical culture name.

    The label is matched case-insensitively against the aliases in `cult`.

    Parameters
    ----------
    value : str or missing value
        Raw culture label. ``None`` / NaN / ``pd.NA`` are treated as "no
        culture recorded".

    Returns
    -------
    str
        The canonical name when the lowercased label is a known alias;
        otherwise the lowercased label in title case. A missing value yields
        ``""``, the same result as passing an empty string.
    """
    # Missing values have no .lower(); answer as if the caller had already
    # replaced them with "" instead of raising AttributeError.
    if not isinstance(value, str) and pd.isna(value):
        return ""
    lowered = value.lower()
    return _CULT_BY_ALIAS.get(lowered, lowered.title())


In [5]:
# Names of all attackers: every distinct value in the four attacker columns.
# Missing entries (NaN) are still part of this set, as before.
attacker_cols = ['attacker_1', 'attacker_2', 'attacker_3', 'attacker_4']
all_attackers = set().union(*[battles[col] for col in attacker_cols])

# Names of all defenders: every distinct value in the two defender columns.
defender_cols = ['defender_1', 'defender_2']
all_defenders = set().union(*[battles[col] for col in defender_cols])

# Unique attackers and defenders combined, with missing values filtered out.
# Filtering with pd.notna (rather than list.remove(np.nan)) does not raise when
# no NaN is present and drops every NaN, not just the first one found.
all_warlead = [name for name in all_attackers.union(all_defenders) if pd.notna(name)]

# Same names with a 'House ' prefix; transform_data compares these against the
# `house` column of the character table.
renam_warlead = ['House ' + name for name in all_warlead]
    
def _split_commanders(column):
    """Return the set of individual names listed in a commander column.

    Parameters
    ----------
    column : pandas.Series
        Cells holding one or more names separated by ``', '``; missing cells
        are skipped.

    Returns
    -------
    set of str
        Every distinct name found across all non-missing cells.
    """
    names = set()
    for entry in column.dropna():
        names.update(entry.split(', '))
    return names


# The accumulator used to be called `all`, which replaced the builtin all()
# for every later cell; the helper keeps its working state local instead.

# All attacking commanders
all_att_commanders = _split_commanders(battles['attacker_commander'])

# All defending commanders
all_def_commanders = _split_commanders(battles['defender_commander'])

# All commanders
all_commanders = all_att_commanders.union(all_def_commanders)
# Characters in the death records whose allegiance is not the literal string
# 'None' (the value this data uses for "no allegiance").
char_with_allgnc = char_deaths[char_deaths['Allegiances'] != 'None']

# The same characters, looked up by name in the full character table.
all_char_with_allgnc = char_pred[char_pred['name'].isin(char_with_allgnc['Name'])]

# Houses (from the character table) of the characters that have an allegiance.
hus_char_with_allegnc = all_char_with_allgnc['house']

# Unique allegiance labels from the death records, without the 'None' sentinel.
# Filtering (instead of list.remove) does not raise when 'None' is absent, and
# dropna() keeps missing values from reaching the string concatenation below.
husOfAllgnc = [
    allegiance
    for allegiance in char_deaths['Allegiances'].dropna().unique()
    if allegiance != 'None'
]

# Allegiances spelled like the `house` column of the character table.
# Labels that already start with 'House ' are kept as they are; prefixing them
# again would produce 'House House ...'.
# NOTE(review): assumes the allegiance data mixes bare and 'House '-prefixed
# spellings - confirm against character-deaths.csv.
renam_husOfAllgnc = [
    allegiance if allegiance.startswith('House ') else 'House ' + allegiance
    for allegiance in husOfAllgnc
]


In [6]:

# Work on a copy: transform_data writes into the frame it is given, so this
# keeps char_pred itself unchanged.
character = char_pred.copy()
def transform_data(data, age_overrides=None):
    """Clean the character table and add battle/allegiance indicator columns.

    Relies on names defined in earlier cells: ``get_cult``, ``all_commanders``,
    ``renam_warlead``, ``hus_char_with_allegnc`` and ``renam_husOfAllgnc``.

    Side effect: the imputations and the four indicator columns are written
    into ``data`` itself. Only the final column drop produces a new frame, so
    pass a copy if the input must stay untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Character table, including the columns read and dropped below.
    age_overrides : dict, optional
        Mapping of row label -> age written before imputation. Defaults to
        ``{1684: 25.0, 1868: 0.0}``.
        NOTE(review): presumably these correct bad ages in the raw CSV -
        confirm against the data.

    Returns
    -------
    pandas.DataFrame
        A new frame without the identifier, prediction, date, title and house
        columns.

    Raises
    ------
    KeyError
        If a column that is read or dropped is absent from ``data``.
    """
    if age_overrides is None:
        age_overrides = {1684: 25.0, 1868: 0.0}

    # Replacing the age. Assigning through .loc to a label that is not in the
    # index would silently append a new row, so absent labels are skipped.
    for row_label, corrected_age in age_overrides.items():
        if row_label in data.index:
            data.loc[row_label, 'age'] = corrected_age

    # Replacing missing values. Columns are assigned back rather than filled
    # through `data[col].fillna(..., inplace=True)`: that chained in-place form
    # is deprecated and has no effect under pandas Copy-on-Write.
    data['age'] = data['age'].fillna(round(data['age'].mean()))
    for date_col in ('dateOfBirth', 'DateoFdeath'):
        data[date_col] = data[date_col].fillna(data[date_col].median())

    # Normalise culture spellings; a missing culture is passed on as "".
    data.loc[:, "culture"] = [get_cult(x) for x in data.culture.fillna("")]

    cat = ['title', 'mother', 'father', 'heir', 'house', 'spouse']
    data[cat] = data[cat].fillna('unknown')

    # Everything still missing gets the -1 sentinel. This is a direct (not
    # chained) in-place call, kept so the caller's frame is updated as before.
    data.fillna(-1, inplace=True)

    # 1/0 indicators built from the battle and allegiance lookups.
    data['isACommander'] = np.where(data['name'].isin(all_commanders), 1, 0)
    data['inHouseOfWarLead'] = np.where(data['house'].isin(renam_warlead), 1, 0)
    data['houseSworeAllegiance'] = np.where(data['house'].isin(hus_char_with_allegnc), 1, 0)
    data['inHouseOfAllegnc'] = np.where(data['house'].isin(renam_husOfAllgnc), 1, 0)

    # drop() returns a new frame; the frame passed in keeps these columns.
    data = data.drop(['S.No', 'actual', 'pred', 'alive', 'plod', 'name', 'title',
                      'dateOfBirth', 'DateoFdeath', 'house'], axis=1)

    return data

def select_features(data, target='isAlive', min_corr=0.1, max_categories=25):
    """Prune weak and high-cardinality features, then one-hot encode text columns.

    Steps, in order:
      1. drop numeric columns whose absolute correlation with ``target`` is
         below ``min_corr``;
      2. drop ``isAliveFather`` and ``isAliveMother`` (if still present);
      3. drop nominal columns with more than ``max_categories`` distinct values;
      4. replace the remaining text columns with dummy columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``target`` as a numeric column and the nominal columns
        ``culture``, ``mother``, ``father``, ``heir`` and ``spouse``.
    target : str, default 'isAlive'
        Column the correlations are measured against.
    min_corr : float, default 0.1
        Numeric columns correlating less than this (in absolute value) with
        ``target`` are dropped.
    max_categories : int, default 25
        Nominal columns with more distinct values than this are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.

    Raises
    ------
    KeyError
        If ``target`` is not numeric/present or a nominal column is missing.
    """
    # 'number' instead of ['int', 'float']: those aliases resolve to the
    # platform's default widths, so other integer/float widths could be skipped.
    numeric = data.select_dtypes(include='number')
    target_corr = numeric.corr()[target].abs()
    weak_cols = target_corr[target_corr < min_corr].index
    data = data.drop(columns=weak_cols)

    # Dropping columns with colinearity. errors='ignore' because either column
    # may already have been removed by the correlation filter above.
    data = data.drop(columns=['isAliveFather', 'isAliveMother'], errors='ignore')

    # List of categorical variable
    nominal_features = ['culture', 'mother', 'father', 'heir', 'spouse']
    # Distinct (non-missing) values per nominal column
    unique_count = data[nominal_features].nunique()
    too_many_levels = unique_count[unique_count > max_categories].index
    data = data.drop(columns=too_many_levels)

    # Converting the remaining text columns to categorical type
    text_cols = data.select_dtypes(include=['object']).columns
    data = data.astype({col: 'category' for col in text_cols})

    # Creating dummy columns from the categorical columns, appending them and
    # removing the original text columns (by explicit column labels).
    dummies = pd.get_dummies(data.select_dtypes(include=['category']))
    data = pd.concat([data, dummies], axis=1).drop(columns=text_cols)

    return data
    
# Stage 1: clean the working copy and add the indicator columns.
# Note that transform_data also writes its changes into `character` itself.
transformed_data = transform_data(character)
# Stage 2: prune weak / high-cardinality features and one-hot encode the rest.
selected_features = select_features(transformed_data)

# Bare last expression so the notebook renders the resulting frame.
selected_features

Unnamed: 0,male,book1,book4,isAliveHeir,age,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive,inHouseOfAllegnc,mother_Alicent Hightower,mother_Alyssa Targaryen,mother_Alyssa Velaryon,mother_Betha Blackwood,mother_Cassana Estermont,mother_Catelyn Stark,mother_Cersei Lannister,mother_Daenaera Velaryon,mother_Dyanna Dayne,mother_Lady of House Sunderly,mother_Mariah Martell,mother_Naerys Targaryen,mother_Rhaenyra Targaryen,mother_Rhaenys Targaryen,...,father_Viserys I Targaryen,father_unknown,heir_Aegon IV Targaryen,heir_Aegon Targaryen,heir_Aemon Targaryen,heir_Aenys Targaryen,heir_Aerys II Targaryen,heir_Arlan V Durrandon,heir_Baelor 'Breakspear' Targaryen,heir_Baelor I Targaryen,heir_Bran Stark,heir_Daeron I Targaryen,heir_Daeron II Targaryen,heir_Duncan Targaryen,heir_Durran the Devout,heir_Halleck Hoare,heir_Jaehaerys Targaryen,heir_Myrcella Baratheon,heir_Rhaegar Targaryen,heir_Rhaegel Targaryen,heir_Rhaenyra Targaryen,heir_Shireen Baratheon,heir_Theon Greyjoy,heir_Tommen Baratheon,heir_unknown
0,1,0,0,0.0,37.0,11,1,1,0.605351,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,-1.0,97.0,1,1,1,0.896321,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,1,0,1,-1.0,37.0,0,0,0,0.267559,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,-1.0,23.0,0,0,0,0.183946,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,1,-1.0,29.0,0,0,0,0.043478,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1941,1,1,1,-1.0,37.0,0,0,1,0.351171,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1942,1,0,1,-1.0,37.0,0,0,0,0.096990,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1943,1,1,1,-1.0,37.0,0,0,0,0.030100,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1944,1,0,1,-1.0,37.0,0,0,0,0.130435,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
