## Setting up

Importing the required packages and viewing components of the dataset

In [1]:
# Importing required packages

import pandas as pd
import numpy as np 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from numpy import isnan
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import gender_guesser.detector as gender
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

# setting pandas print options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

Installing a extra package to smoothen data distribution

!pip install imblearn

In [2]:
#specifying the file path
file = ('GOT_character_predictions.xlsx')

#reading the file into python
data = pd.read_excel(io = file)

#displaying the first rows of the dataframe
data.head()

Unnamed: 0,S.No,name,title,culture,dateOfBirth,mother,father,heir,house,spouse,book1_A_Game_Of_Thrones,book2_A_Clash_Of_Kings,book3_A_Storm_Of_Swords,book4_A_Feast_For_Crows,book5_A_Dance_with_Dragons,isAliveMother,isAliveFather,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,popularity,isAlive
0,1,Viserys II Targaryen,,,,Rhaenyra Targaryen,Daemon Targaryen,Aegon IV Targaryen,,,0,0,0,0,0,1.0,0.0,0.0,,0,0,,11,0.605351,0
1,2,Walder Frey,Lord of the Crossing,Rivermen,208.0,,,,House Frey,Perra Royce,1,1,1,1,1,,,,1.0,1,1,97.0,1,0.896321,1
2,3,Addison Hill,Ser,,,,,,House Swyft,,0,0,0,1,0,,,,,0,1,,0,0.267559,1
3,4,Aemma Arryn,Queen,,82.0,,,,House Arryn,Viserys I Targaryen,0,0,0,0,0,,,,0.0,1,1,23.0,0,0.183946,0
4,5,Sylva Santagar,Greenstone,Dornish,276.0,,,,House Santagar,Eldon Estermont,0,0,0,1,0,,,,1.0,1,1,29.0,0,0.043478,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   S.No                        1946 non-null   int64  
 1   name                        1946 non-null   object 
 2   title                       938 non-null    object 
 3   culture                     677 non-null    object 
 4   dateOfBirth                 433 non-null    float64
 5   mother                      21 non-null     object 
 6   father                      26 non-null     object 
 7   heir                        23 non-null     object 
 8   house                       1519 non-null   object 
 9   spouse                      276 non-null    object 
 10  book1_A_Game_Of_Thrones     1946 non-null   int64  
 11  book2_A_Clash_Of_Kings      1946 non-null   int64  
 12  book3_A_Storm_Of_Swords     1946 non-null   int64  
 13  book4_A_Feast_For_Crows     1946 

In [4]:
data.corr()

Unnamed: 0,S.No,dateOfBirth,book1_A_Game_Of_Thrones,book2_A_Clash_Of_Kings,book3_A_Storm_Of_Swords,book4_A_Feast_For_Crows,book5_A_Dance_with_Dragons,isAliveMother,isAliveFather,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,popularity,isAlive
S.No,1.0,0.068381,0.29013,0.312403,0.368665,0.284554,0.405907,0.289094,-0.096596,-0.138287,0.033193,0.150648,0.241642,-0.067925,0.092734,0.277761,-0.128712
dateOfBirth,0.068381,1.0,0.10159,0.055825,-0.022845,-0.03214,-0.008324,,,,0.275833,-0.043918,-0.067865,-0.999997,0.062941,-0.015658,-0.085863
book1_A_Game_Of_Thrones,0.29013,0.10159,1.0,0.392439,0.243843,0.079959,0.264818,,-0.09759,,0.082209,0.085895,0.095848,-0.101148,0.191853,0.324782,-0.147401
book2_A_Clash_Of_Kings,0.312403,0.055825,0.392439,1.0,0.449931,0.175937,0.225677,,-0.09759,,0.057321,0.157045,0.023366,-0.054744,0.075281,0.247918,-0.0672
book3_A_Storm_Of_Swords,0.368665,-0.022845,0.243843,0.449931,1.0,0.373553,0.285103,,,,0.067919,0.151504,0.024793,0.024,0.05287,0.214018,0.006693
book4_A_Feast_For_Crows,0.284554,-0.03214,0.079959,0.175937,0.373553,1.0,0.337415,,,,0.171175,0.127712,0.079689,0.033531,-0.007731,0.116085,0.268975
book5_A_Dance_with_Dragons,0.405907,-0.008324,0.264818,0.225677,0.285103,0.337415,1.0,,,,0.142594,0.09621,0.10234,0.009114,0.103986,0.215681,0.032846
isAliveMother,0.289094,,,,,,,1.0,0.258199,-0.564076,,,,,0.02434,-0.235275,-0.043033
isAliveFather,-0.096596,,-0.09759,-0.09759,,,,0.258199,1.0,0.3669,,,,,-0.2401,-0.050977,0.195992
isAliveHeir,-0.138287,,,,,,,-0.564076,0.3669,1.0,,,,,-0.424601,0.022362,0.3849


### Data Cleaning

Reviewing the data for missing information,seperating columns that requires spliting and introducing a new variable - 'gender' into the dataset

In [5]:
#checking for missing values
data.isnull().sum()

S.No                             0
name                             0
title                         1008
culture                       1269
dateOfBirth                   1513
mother                        1925
father                        1920
heir                          1923
house                          427
spouse                        1670
book1_A_Game_Of_Thrones          0
book2_A_Clash_Of_Kings           0
book3_A_Storm_Of_Swords          0
book4_A_Feast_For_Crows          0
book5_A_Dance_with_Dragons       0
isAliveMother                 1925
isAliveFather                 1920
isAliveHeir                   1923
isAliveSpouse                 1670
isMarried                        0
isNoble                          0
age                           1513
numDeadRelations                 0
popularity                       0
isAlive                          0
dtype: int64

In [6]:
#splitting names into first and last to enable gender guesser work properly
data[['first_name' , 'last_name']] = data['name'].loc[data['name'].str.split().str.len() == 2].str.split(expand=True)

### Gender Guesser
Creating a new variable - gender guesser and mapping it to the original dataset

In [7]:
#initiate gender guesser and split from first name
d = gender.Detector()

#creating placeholder list to contain the new gender guessed
placeholder_lst = []

#creating loop to guess gender
for name in data ["first_name"]:
    guess = d.get_gender(name)
    placeholder_lst.append(guess)

#print results
print(placeholder_lst)

['unknown', 'unknown', 'andy', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'mostly_male', 'mostly_male', 'mostly_male', 'mostly_male', 'mostly_male', 'unknown', 'male', 'unknown', 'unknown', 'male', 'male', 'female', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'andy', 'andy', 'unknown', 'andy', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', '

In [8]:
#map the gender to the original data
data['gender'] = data['first_name'].map(lambda x: d.get_gender(x))

### Treating missing data

Using simple imputter to handle missing values. The mode was used for categorical data,same measure was adopted for continous variables. Simple imputter implicilty removes outlier and normalises data, this makes mode not far from the median. Thus the choice of replacing all missing values with mode


In [9]:
#defining the imputer
imputer = SimpleImputer(strategy = 'most_frequent')

#fitting the imputter on the data
result_mode_imputer = imputer.fit_transform(data)

#Turning the result to a dataframe
data = pd.DataFrame (result_mode_imputer)

In [10]:
#Rename columns in the cleaned dataframe with original column names
data = data.rename(columns={0: "S.No", 1:'name', 2: 'title', 3: 'culture', 4: 'dateofbirth',
                           5: 'mother', 6: 'father', 7: 'heir', 8: 'house',
                           9: 'spouse', 10: 'book1_A_Game_Of_Thrones', 11: 'book2_A_Clash_Of_Kings', 
                            12: 'book3_A_Storm_Of_Swords', 13: 'book4_A_Feast_For_Crows', 
                            14: 'book5_A_Dance_with_Dragons',15: 'isAliveMother', 16: 'isAliveFather',
                           17:'isAliveHeir', 18:'isAliveSpouse', 19:'isMarried', 20:'isNoble', 21:'age',
                           22:'numDeadRelations', 23:'popularity', 24:'isAlive', 25:'first_name', 
                            26:'last_name', 27:'gender'})

## Feature Engineering

In [11]:
#Encode categorical columns and drop columns not needed
#converting categorical data to numbers to allow for gradient boosting later.
data['name'] = pd.factorize(data.name)[0]
data['title'] = pd.factorize(data.title)[0]
data['culture'] = pd.factorize(data.culture)[0]
data['mother'] = pd.factorize(data.mother)[0]
data['father'] = pd.factorize(data.father)[0]
data['heir'] = pd.factorize(data.heir)[0]
data['house'] = pd.factorize(data.house)[0]
data['spouse'] = pd.factorize(data.spouse)[0]
data['first_name'] = pd.factorize(data.first_name)[0]
data['last_name'] = pd.factorize(data.last_name)[0]
data['gender'] = pd.factorize(data.gender)[0]

data = data.drop(['S.No'], axis=1)

In [12]:
#checking the data after factorisation for any imbalance
data.isAlive.value_counts()

1    1451
0     495
Name: isAlive, dtype: int64

The response variable distribution is abit skewed, resampling will be done to balance the distribution

### Testing and training set

In [13]:
#Split dataset into training and testing dataset
X = data.drop (['isAlive'] , axis = 1)
y = data ['isAlive']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1, random_state = 219)

y_train = y_train.astype('int')
y_test = y_test.astype ('int')


In [14]:
#SMOTEENN is used to balance the classes in the response variable
sme = SMOTEENN(random_state=42)
X_res, y_res = sme.fit_resample(X_train, y_train)

#X_resampled, y_resampled = sme.fit_resample(X_train, y_train) # This was gotten online, modified a little though - 

### Data Exploration
Exploring the permissible models to see the options that yields the best AUC score and least training/testing gap

#### Logistic Regression Model

In [15]:
#Build a logistic regression model
model =  LogisticRegression(random_state=0)

#train the model
model.fit(X_res, y_res)

#Get required paramters for for both train and test sets
training_accuracy  = round(accuracy_score(y_train,model.predict(X_train)),2)
test_accuracy  = round(accuracy_score(y_test, model.predict(X_test)),2)
AUC_Score = round(roc_auc_score(y_test, model.predict(X_test)),2)
Confusion_Matrix =  confusion_matrix(y_test, model.predict(X_test))  

#Print the required output
print(f""" 
Model Type :    {model}

Training Accuracy: {training_accuracy}

Testing Accuracy:  {test_accuracy}

AUC Score: {AUC_Score}

Confusion Matrix: {Confusion_Matrix}
""")

 
Model Type :    LogisticRegression(random_state=0)

Training Accuracy: 0.63

Testing Accuracy:  0.65

AUC Score: 0.66

Confusion Matrix: [[38 19]
 [49 89]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Decision Tree

In [16]:
#Decision Tree modelling
model =  DecisionTreeClassifier(random_state=0)

#train the model
model.fit(X_res, y_res)

#Get required paramters for for both train and test sets
training_accuracy  = round(accuracy_score(y_train,model.predict(X_train)),2)
test_accuracy  = round(accuracy_score(y_test, model.predict(X_test)),2)
AUC_Score = round(roc_auc_score(y_test, model.predict(X_test)),2)
Confusion_Matrix =  confusion_matrix(y_test, model.predict(X_test))  

#Print the required output
print(f""" 
Model Type :    {model}

Training Accuracy: {training_accuracy}

Testing Accuracy:  {test_accuracy}

AUC Score: {AUC_Score}

Confusion Matrix: {Confusion_Matrix}
""")

 
Model Type :    DecisionTreeClassifier(random_state=0)

Training Accuracy: 0.8

Testing Accuracy:  0.68

AUC Score: 0.68

Confusion Matrix: [[38 19]
 [43 95]]



### Final Model

#### Gradient Boosted Model

In [17]:
#Build a Gradient Boosted Model
model =  GradientBoostingClassifier(random_state=0)

#train the model
model.fit(X_res, y_res)

#Get required paramters for for both train and test sets
training_accuracy  = round(accuracy_score(y_train,model.predict(X_train)),2)
test_accuracy  = round(accuracy_score(y_test, model.predict(X_test)),2)
AUC_Score = round(roc_auc_score(y_test, model.predict(X_test)),2)
Confusion_Matrix =  confusion_matrix(y_test, model.predict(X_test))

print('Confusion Matrix of the Best Model - GradientBoostingClassifier before tunning')
print(Confusion_Matrix)

#Print the required output
print(f""" 
Model Type :    {model}

Training Accuracy: {training_accuracy}

Testing Accuracy:  {test_accuracy}

AUC Score: {AUC_Score}

Confusion Matrix: {Confusion_Matrix}
""")

Confusion Matrix of the Best Model - GradientBoostingClassifier before tunning
[[ 39  18]
 [ 29 109]]
 
Model Type :    GradientBoostingClassifier(random_state=0)

Training Accuracy: 0.8

Testing Accuracy:  0.76

AUC Score: 0.74

Confusion Matrix: [[ 39  18]
 [ 29 109]]




The GBM model has the best performance (for AUC score,training and testing gap and confusion matrix as measuring metrics)
Further tuning is done to see if the score improves

#### Hyperparameter tuning 
Hyperparameter tuning is done in this cell but marked down to reduce run time

#declare search for hyperparameter tuning
param_distributions = {
  "n_estimators":[5,50,250,500],
  "max_depth":[1,3,5,7,9],
  "learning_rate":[0.01,0.1,1,10,100]}

#tune the model
model = GradientBoostingClassifier()
search = RandomizedSearchCV(model, param_distributions= param_distributions, n_iter=100, scoring='roc_auc', cv=10,refit = True,
                           return_train_score= True,random_state=219,verbose = 10)

result = search.fit(X_train, y_train)
#summarize result and get best parameters
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [18]:
#Initiate the algorithm instance and put the best parameters
Gradient_Boosting_Classifier = GradientBoostingClassifier(n_estimators =50 ,max_depth = 7, learning_rate = 0.1 )


#train the model
Gradient_Boosting_Classifier.fit(X_res, y_res)

GradientBoostingClassifier(max_depth=7, n_estimators=50)

#### GBM after tuning

In [19]:
#Get required parameters for train and test sets
training_accuracy  = round(accuracy_score(y_train, Gradient_Boosting_Classifier.predict(X_train)),2)
test_accuracy  = round(accuracy_score(y_test, Gradient_Boosting_Classifier.predict(X_test)),2)
AUC_Score = round(roc_auc_score(y_test, Gradient_Boosting_Classifier.predict(X_test)),2)
Confusion_Matrix =  confusion_matrix(y_test, Gradient_Boosting_Classifier.predict(X_test)) 
print('Confusion Matrix of the Best Model - GradientBoostingClassifier after tunning')
print(Confusion_Matrix)

#Print the required output
print(f""" 
Model Type :    {model}

Training Accuracy: {training_accuracy}

Testing Accuracy:  {test_accuracy}

AUC Score: {AUC_Score}

Confusion Matrix: {Confusion_Matrix}
""")
            

Confusion Matrix of the Best Model - GradientBoostingClassifier after tunning
[[ 42  15]
 [ 29 109]]
 
Model Type :    GradientBoostingClassifier(random_state=0)

Training Accuracy: 0.83

Testing Accuracy:  0.77

AUC Score: 0.76

Confusion Matrix: [[ 42  15]
 [ 29 109]]



Note: The tuned GBM model did not improve the AUC score,training and testing gap became wider
The untuned GBM model is adopted as final AUC score

In [20]:
#Using the best model - GBM
model =  GradientBoostingClassifier(random_state=0)

#train the model
model.fit(X_res, y_res)

#Get required paramters for for both train and test sets
training_accuracy  = round(accuracy_score(y_train,model.predict(X_train)),2)
test_accuracy  = round(accuracy_score(y_test, model.predict(X_test)),2)
AUC_Score = round(roc_auc_score(y_test, model.predict(X_test)),2)
Confusion_Matrix =  confusion_matrix(y_test, model.predict(X_test))


print('Confusion Matrix of the Best Model - GradientBoostingClassifier before tunning')
print(Confusion_Matrix)



Confusion Matrix of the Best Model - GradientBoostingClassifier before tunning
[[ 39  18]
 [ 29 109]]


#### Final Output

In [21]:
print('Confusion Matrix of the Best Model - GradientBoostingClassifier before tuning')
print(Confusion_Matrix)


Confusion Matrix of the Best Model - GradientBoostingClassifier before tuning
[[ 39  18]
 [ 29 109]]


In [22]:
#print required output
print (f"""
Model Type : {Gradient_Boosting_Classifier}

Training Accuracy: {training_accuracy}

Testing Accuracy: {test_accuracy}

AUC Score: {AUC_Score}

Confusion Matrix: {Confusion_Matrix}
""")


Model Type : GradientBoostingClassifier(max_depth=7, n_estimators=50)

Training Accuracy: 0.8

Testing Accuracy: 0.76

AUC Score: 0.74

Confusion Matrix: [[ 39  18]
 [ 29 109]]

