# Importing the data and libraries

In [432]:
# importing libraries
import pandas            as pd                          # data science essentials
import matplotlib.pyplot as plt                         # data visualization
import seaborn           as sns                         # enhanced data viz
from sklearn.model_selection import train_test_split    # train-test split
from sklearn.linear_model import LogisticRegression     # logistic regression
import statsmodels.formula.api as smf                   # logistic regression
from sklearn.metrics import confusion_matrix            # confusion matrix
from sklearn.metrics import roc_auc_score               # auc score
from sklearn.neighbors import KNeighborsClassifier      # KNN for classification
from sklearn.neighbors import KNeighborsRegressor       # KNN for regression
from sklearn.preprocessing import StandardScaler        # standard scaler
from sklearn.tree import DecisionTreeClassifier         # classification trees
from sklearn.tree import plot_tree                      # tree plots
import numpy as np                                      # mathematical essentials
from sklearn.model_selection import RandomizedSearchCV  # hyperparameter tuning
from sklearn.metrics import make_scorer                 # customizable scorer
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm


# loading data
file = './GOT_character_predictions.xlsx'

got_df = pd.read_excel(io         = file,
                      header     = 0,
                      sheet_name = 0)



# displaying the head of the dataset
got_df.head(n = 10)

Unnamed: 0,S.No,name,title,culture,dateOfBirth,mother,father,heir,house,spouse,...,isAliveMother,isAliveFather,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,popularity,isAlive
0,1,Viserys II Targaryen,,,,Rhaenyra Targaryen,Daemon Targaryen,Aegon IV Targaryen,,,...,1.0,0.0,0.0,,0,0,,11,0.605351,0
1,2,Walder Frey,Lord of the Crossing,Rivermen,208.0,,,,House Frey,Perra Royce,...,,,,1.0,1,1,97.0,1,0.896321,1
2,3,Addison Hill,Ser,,,,,,House Swyft,,...,,,,,0,1,,0,0.267559,1
3,4,Aemma Arryn,Queen,,82.0,,,,House Arryn,Viserys I Targaryen,...,,,,0.0,1,1,23.0,0,0.183946,0
4,5,Sylva Santagar,Greenstone,Dornish,276.0,,,,House Santagar,Eldon Estermont,...,,,,1.0,1,1,29.0,0,0.043478,1
5,6,Tommen Baratheon,,,,Cersei Lannister,Robert Baratheon,Myrcella Baratheon,,,...,1.0,1.0,1.0,,0,0,,5,1.0,1
6,7,Valarr Targaryen,Hand of the King,Valyrian,183.0,,,,House Targaryen,Kiera of Tyrosh,...,,,,1.0,1,1,26.0,0,0.431438,0
7,8,Viserys I Targaryen,,,,Alyssa Targaryen,Baelon Targaryen,Rhaenyra Targaryen,,,...,1.0,1.0,1.0,,0,0,,5,0.67893,0
8,9,Wilbert,Ser,,,,,,,,...,,,,,0,1,,0,0.006689,0
9,10,Wilbert Osgrey,Ser,,,,,,House Osgrey,,...,,,,,0,1,,0,0.020067,1


# Exploring the data

After looking at the features names, I noticed one of them would be an issue in the future regression because of the dot in the name. I renamed it with an underscore to avoid potential error related to the name

In [433]:
# renaming S.No feature 
got_df.rename(columns={'S.No':'S_No'}, inplace = True)

In [434]:
# getting dataset information
got_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   S_No                        1946 non-null   int64  
 1   name                        1946 non-null   object 
 2   title                       938 non-null    object 
 3   culture                     677 non-null    object 
 4   dateOfBirth                 433 non-null    float64
 5   mother                      21 non-null     object 
 6   father                      26 non-null     object 
 7   heir                        23 non-null     object 
 8   house                       1519 non-null   object 
 9   spouse                      276 non-null    object 
 10  book1_A_Game_Of_Thrones     1946 non-null   int64  
 11  book2_A_Clash_Of_Kings      1946 non-null   int64  
 12  book3_A_Storm_Of_Swords     1946 non-null   int64  
 13  book4_A_Feast_For_Crows     1946 

I loaded the user-defined functions in case I would need them in the following steps of the assignment. The code is taken from script 7 of Machine Learning (Chase Kusterer).

In [435]:
########################################
# optimal_neighbors
########################################
def optimal_neighbors(x_data,
                      y_data,
                      standardize = True,
                      pct_test=0.10,
                      seed=219,
                      response_type='reg',
                      max_neighbors=20,
                      show_viz=True):
    """
Exhaustively compute training and testing results for KNN across
[1, max_neighbors]. Outputs the maximum test score and (by default) a
visualization of the results.
PARAMETERS
----------
x_data        : explanatory variable data
y_data        : response variable
standardize   : whether or not to standardize the x data, default True
pct_test      : test size for training and validation from (0,1), default 0.25
seed          : random seed to be used in algorithm, default 219
response_type : type of neighbors algorithm to use, default 'reg'
    Use 'reg' for regression (KNeighborsRegressor)
    Use 'class' for classification (KNeighborsClassifier)
max_neighbors : maximum number of neighbors in exhaustive search, default 20
show_viz      : display or surpress k-neigbors visualization, default True
"""    
    
    
    if standardize == True:
        # optionally standardizing x_data
        scaler             = StandardScaler()
        scaler.fit(x_data)
        x_scaled           = scaler.transform(x_data)
        x_scaled_df        = pd.DataFrame(x_scaled)
        x_data             = x_scaled_df



    # train-test split
    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size = pct_test,
                                                        random_state = seed)


    # creating lists for training set accuracy and test set accuracy
    training_accuracy = []
    test_accuracy = []
    
    
    # setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)


    for n_neighbors in neighbors_settings:
        # building the model based on response variable type
        if response_type == 'reg':
            clf = KNeighborsRegressor(n_neighbors = n_neighbors)
            clf.fit(x_train, y_train)
            
        elif response_type == 'class':
            clf = KNeighborsClassifier(n_neighbors = n_neighbors)
            clf.fit(x_train, y_train)            
            
        else:
            print("Error: response_type must be 'reg' or 'class'")
        
        
        # recording the training set accuracy
        training_accuracy.append(clf.score(x_train, y_train))
    
        # recording the generalization accuracy
        test_accuracy.append(clf.score(x_test, y_test))


    # optionally displaying visualization
    if show_viz == True:
        # plotting the visualization
        fig, ax = plt.subplots(figsize=(12,8))
        plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
        plt.plot(neighbors_settings, test_accuracy, label = "test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()
    
    
    # returning optimal number of neighbors
    print(f"The optimal number of neighbors is: {test_accuracy.index(max(test_accuracy))+1}")
    return test_accuracy.index(max(test_accuracy))+1


########################################
# visual_cm
########################################
def visual_cm(true_y, pred_y, labels = None):
    """
Creates a visualization of a confusion matrix.

PARAMETERS
----------
true_y : true values for the response variable
pred_y : predicted values for the response variable
labels : , default None
    """
    # visualizing the confusion matrix

    # setting labels
    lbls = labels
    

    # declaring a confusion matrix object
    cm = confusion_matrix(y_true = true_y,
                          y_pred = pred_y)


    # heatmap
    sns.heatmap(cm,
                annot       = True,
                xticklabels = lbls,
                yticklabels = lbls,
                cmap        = 'Blues',
                fmt         = 'g')


    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of the Classifier')
    plt.show()

In [436]:
# descriptive statistics of age
got_df['age'].describe()

count       433.000000
mean      -1293.563510
std       19564.340993
min     -298001.000000
25%          18.000000
50%          27.000000
75%          50.000000
max         100.000000
Name: age, dtype: float64

After displaying the descriptive statistics of age, I noticed the minimum value did not make sense. Hence, I highlighted the uncommon values in the following code, but did not modify them due to the restrictions.

In [437]:
# displaying all observations equal or lower than zero for age
got_df.loc[:,'age'][got_df.loc[:,'age']<=0]

329          0.0
790          0.0
1684   -277980.0
1868   -298001.0
Name: age, dtype: float64

In [438]:
# descriptive statistics of popularity
got_df['popularity'].describe()

count    1946.000000
mean        0.089584
std         0.160568
min         0.000000
25%         0.013378
50%         0.033445
75%         0.086957
max         1.000000
Name: popularity, dtype: float64

In [439]:
# descriptive statistics of dateOfBirth
got_df['dateOfBirth'].describe()

count       433.000000
mean       1577.364896
std       19565.414460
min         -28.000000
25%         240.000000
50%         268.000000
75%         285.000000
max      298299.000000
Name: dateOfBirth, dtype: float64

## Missing Values Imputation

In [440]:
#Checking the number of missing value
got_df.isnull().sum()

S_No                             0
name                             0
title                         1008
culture                       1269
dateOfBirth                   1513
mother                        1925
father                        1920
heir                          1923
house                          427
spouse                        1670
book1_A_Game_Of_Thrones          0
book2_A_Clash_Of_Kings           0
book3_A_Storm_Of_Swords          0
book4_A_Feast_For_Crows          0
book5_A_Dance_with_Dragons       0
isAliveMother                 1925
isAliveFather                 1920
isAliveHeir                   1923
isAliveSpouse                 1670
isMarried                        0
isNoble                          0
age                           1513
numDeadRelations                 0
popularity                       0
isAlive                          0
dtype: int64

In [441]:
# displaying all dataset columns
got_df.columns

Index(['S_No', 'name', 'title', 'culture', 'dateOfBirth', 'mother', 'father',
       'heir', 'house', 'spouse', 'book1_A_Game_Of_Thrones',
       'book2_A_Clash_Of_Kings', 'book3_A_Storm_Of_Swords',
       'book4_A_Feast_For_Crows', 'book5_A_Dance_with_Dragons',
       'isAliveMother', 'isAliveFather', 'isAliveHeir', 'isAliveSpouse',
       'isMarried', 'isNoble', 'age', 'numDeadRelations', 'popularity',
       'isAlive'],
      dtype='object')

In [442]:
# looping to detect features with missing values
for col in got_df:

    # creating columns with 1s if missing and 0 if not
    if got_df[col].isnull().astype(int).sum() > 0:
        got_df['m_'+col] = got_df[col].isnull().astype(int)


# summing the missing value flags to check the results of the loop above
got_df[    ['m_title', 'm_culture', 'm_dateOfBirth', 'm_mother', 'm_father',
            'm_heir', 'm_house', 'm_spouse', 'm_isAliveMother', 'm_isAliveFather',
            'm_isAliveHeir', 'm_isAliveSpouse', 'm_age']    ].sum(axis = 0)

m_title            1008
m_culture          1269
m_dateOfBirth      1513
m_mother           1925
m_father           1920
m_heir             1923
m_house             427
m_spouse           1670
m_isAliveMother    1925
m_isAliveFather    1920
m_isAliveHeir      1923
m_isAliveSpouse    1670
m_age              1513
dtype: int64

## Flag Based Feature Engineering

One of the variable (other than dummy) seemed to have a lot of zeros in the observations. I flagged this value since it had more than 100 observations in both counts (number of zeros and non-zeros observations).

In [443]:
# counting the number of zeroes for 
numDeadRelations_zeroes   = len(got_df['numDeadRelations'][got_df['numDeadRelations']==0]) 

# printing a table of the results
print(f"""
                        No\t\tYes
                        ---------------------
numDeadRelations       | {numDeadRelations_zeroes}\t\t{len(got_df) - numDeadRelations_zeroes}
""")



                        No		Yes
                        ---------------------
numDeadRelations       | 1801		145



In [444]:
# placeholder variables
got_df['has_dead_relations'] = 0

for index, value in got_df.iterrows():
    

    # numDeadRelations
    if got_df.loc[index, 'numDeadRelations'] > 0:
        got_df.loc[index, 'has_dead_relations'] = 1


In [445]:
# checking results
got_df[  ['has_dead_relations']  ].head(n = 5)

Unnamed: 0,has_dead_relations
0,1
1,1
2,0
3,0
4,0


# Logistic Regressions

## Stratifying the Response Variable

The following code shows the balance between the characters that are alive, and those who are not.

In [446]:
got_df.loc[ : ,'isAlive'].value_counts(normalize = True).round(decimals = 2)

1    0.75
0    0.25
Name: isAlive, dtype: float64

## Preparing the Explanatory and Response Data

In [447]:
# declaring explanatory variables
got_data=got_df.drop('isAlive', axis=1)

# declaring response variable
got_target=got_df.loc[: , 'isAlive']

## Prepare train-test split for statsmodel

In [448]:
# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
            got_data,
            got_target,
            test_size    = 0.10,
            random_state = 219,
            stratify     = got_target) # preserving balance


# merging training data for statsmodels
got_train = pd.concat([x_train, y_train], axis = 1)

In [449]:
# printing candidate variable sets
print(f"""

Response Variable Proportions (Training Set)
--------------------------------------------
{y_train.value_counts(normalize = True).round(decimals = 2)}



Response Variable Proportions (Testing Set)
--------------------------------------------
{y_test.value_counts(normalize = True).round(decimals = 2)}
""")





Response Variable Proportions (Training Set)
--------------------------------------------
1    0.75
0    0.25
Name: isAlive, dtype: float64



Response Variable Proportions (Testing Set)
--------------------------------------------
1    0.74
0    0.26
Name: isAlive, dtype: float64



The following code is used to ease the selection and the imputation of variables in the logistic models

In [450]:
# creating a loop to avoid copy/paste errors
for val in got_data:
    print(f" {val} + ")

 S_No + 
 name + 
 title + 
 culture + 
 dateOfBirth + 
 mother + 
 father + 
 heir + 
 house + 
 spouse + 
 book1_A_Game_Of_Thrones + 
 book2_A_Clash_Of_Kings + 
 book3_A_Storm_Of_Swords + 
 book4_A_Feast_For_Crows + 
 book5_A_Dance_with_Dragons + 
 isAliveMother + 
 isAliveFather + 
 isAliveHeir + 
 isAliveSpouse + 
 isMarried + 
 isNoble + 
 age + 
 numDeadRelations + 
 popularity + 
 m_title + 
 m_culture + 
 m_dateOfBirth + 
 m_mother + 
 m_father + 
 m_heir + 
 m_house + 
 m_spouse + 
 m_isAliveMother + 
 m_isAliveFather + 
 m_isAliveHeir + 
 m_isAliveSpouse + 
 m_age + 
 has_dead_relations + 


## Logistic Models 

The logistic_sig_1 (in the following cell) contains variables that have been selected based on this set of criteria:
1. I removed all the variables that did not contain numbers.
2. I removed all those with NaN values.
3. At least one of the book variables was removed right away, to avoid a multi-collinearity scenario.
4. All the variable that were related were removed (e.g.: isAliveFather -> m_father).
5. From then , I removed one by one the variable with the highest p-value and observed the change.
6. Finally, I removed m_age, although it had a p-value of 0. Indeed, I reckoned it was impossible to interpret such variable in the model.

In [451]:
# instantiating a logistic regression model object
logistic_sig_1 = smf.logit(formula   = """isAlive ~ S_No + 
 book1_A_Game_Of_Thrones + 
 book2_A_Clash_Of_Kings + 
 book4_A_Feast_For_Crows +
 m_house + 
 m_isAliveMother +
 has_dead_relations""",
                           data = got_train)


# FITTING the model object
results_logistic = logistic_sig_1.fit()


# checking the results SUMMARY
results_logistic.summary2() # summary2() has AIC and BIC

Optimization terminated successfully.
         Current function value: 0.490704
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.134
Dependent Variable:,isAlive,AIC:,1734.4441
Date:,2021-12-05 17:22,BIC:,1778.1876
No. Observations:,1751,Log-Likelihood:,-859.22
Df Model:,7,LL-Null:,-992.53
Df Residuals:,1743,LLR p-value:,7.981700000000001e-54
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-0.9641,0.6810,-1.4157,0.1569,-2.2988,0.3707
S_No,-0.0007,0.0001,-6.1017,0.0000,-0.0010,-0.0005
book1_A_Game_Of_Thrones,-0.4379,0.1549,-2.8265,0.0047,-0.7415,-0.1342
book2_A_Clash_Of_Kings,-0.3270,0.1356,-2.4111,0.0159,-0.5928,-0.0612
book4_A_Feast_For_Crows,1.7022,0.1381,12.3258,0.0000,1.4315,1.9729
m_house,0.3377,0.1602,2.1083,0.0350,0.0238,0.6516
m_isAliveMother,2.1684,0.6768,3.2039,0.0014,0.8419,3.4950
has_dead_relations,-0.9741,0.2192,-4.4443,0.0000,-1.4036,-0.5445


The logistic_sig_2 model was built with the same criteria as the first one. However, I tried to change the variables and include the m_age to observe how it would affect the Pseudo R-squared.

In [452]:
# instantiating a logistic regression model object
logistic_sig_2 = smf.logit(formula   = """isAlive ~  S_No +  
 book1_A_Game_Of_Thrones + 
 book2_A_Clash_Of_Kings + 
 book4_A_Feast_For_Crows + 
 m_age + 
 m_mother +
 numDeadRelations""",
                           data = got_train)


# FITTING the model object
results_logistic = logistic_sig_2.fit()


# checking the results SUMMARY
results_logistic.summary2() # summary2() has AIC and BIC

Optimization terminated successfully.
         Current function value: 0.484654
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.145
Dependent Variable:,isAlive,AIC:,1713.2585
Date:,2021-12-05 17:22,BIC:,1757.002
No. Observations:,1751,Log-Likelihood:,-848.63
Df Model:,7,LL-Null:,-992.53
Df Residuals:,1743,LLR p-value:,2.4217e-58
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-1.6381,0.7008,-2.3376,0.0194,-3.0116,-0.2646
S_No,-0.0007,0.0001,-6.1780,0.0000,-0.0010,-0.0005
book1_A_Game_Of_Thrones,-0.4717,0.1553,-3.0375,0.0024,-0.7761,-0.1673
book2_A_Clash_Of_Kings,-0.2724,0.1374,-1.9827,0.0474,-0.5417,-0.0031
book4_A_Feast_For_Crows,1.7668,0.1416,12.4760,0.0000,1.4892,2.0444
m_age,0.7891,0.1498,5.2693,0.0000,0.4956,1.0826
m_mother,2.2374,0.6790,3.2953,0.0010,0.9066,3.5681
numDeadRelations,-0.1114,0.0437,-2.5503,0.0108,-0.1970,-0.0258


The logistic-full model was also built on the same criteria. Nevertheless, I left all variables that would make the model work, despite them being statistically not significant.

In [453]:
# instantiating a logistic regression model object
logistic_full = smf.logit(formula   = """isAlive ~  S_No +  
 book1_A_Game_Of_Thrones + 
 book2_A_Clash_Of_Kings + 
 book4_A_Feast_For_Crows + 
 book5_A_Dance_with_Dragons + 
 isNoble +  
 numDeadRelations + 
 popularity + 
 m_title + 
 m_culture + 
 m_house + 
 m_isAliveMother + 
 m_isAliveFather + 
 m_isAliveHeir + 
 m_isAliveSpouse + 
 m_age + 
 has_dead_relations """,
                           data = got_train)


# FITTING the model object
results_logistic = logistic_full.fit()


# checking the results SUMMARY
results_logistic.summary2() # summary2() has AIC and BIC

Optimization terminated successfully.
         Current function value: 0.482665
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.148
Dependent Variable:,isAlive,AIC:,1726.2921
Date:,2021-12-05 17:22,BIC:,1824.715
No. Observations:,1751,Log-Likelihood:,-845.15
Df Model:,17,LL-Null:,-992.53
Df Residuals:,1733,LLR p-value:,1.3459e-52
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-2.1119,0.9960,-2.1203,0.0340,-4.0641,-0.1597
S_No,-0.0007,0.0001,-5.6260,0.0000,-0.0010,-0.0005
book1_A_Game_Of_Thrones,-0.4877,0.1615,-3.0204,0.0025,-0.8042,-0.1712
book2_A_Clash_Of_Kings,-0.2698,0.1397,-1.9308,0.0535,-0.5437,0.0041
book4_A_Feast_For_Crows,1.7403,0.1494,11.6514,0.0000,1.4476,2.0331
book5_A_Dance_with_Dragons,0.1691,0.1507,1.1223,0.2617,-0.1262,0.4645
isNoble,-0.0713,0.4616,-0.1544,0.8773,-0.9759,0.8334
numDeadRelations,-0.0518,0.0642,-0.8060,0.4202,-0.1776,0.0741
popularity,0.0445,0.5241,0.0849,0.9324,-0.9827,1.0717


# Logistic Regression in Sci-Kit Learn

The following code is used to give a name to the different models and ease the process by imputing the name in the code instead of all the variables.

In [454]:
# creating a dictionary to store candidate models

candidate_dict = {

 # full model
 'logit_full'   : ['S_No', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 
                   'book4_A_Feast_For_Crows', 'book5_A_Dance_with_Dragons', 
                   'isNoble', 'numDeadRelations', 'popularity', 'm_title', 
                   'm_culture', 'm_house', 'm_isAliveMother', 'm_isAliveFather', 
                   'm_isAliveHeir', 'm_isAliveSpouse', 'm_age', 'has_dead_relations'],
 

 # significant variables only (set 1)
 'logit_sig_1'  : ['S_No', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings',
                   'm_house', 'book4_A_Feast_For_Crows', 'm_isAliveMother',
                   'has_dead_relations'],
    
    
 # significant variables only (set 2)
 'logit_sig_2'  : ['S_No', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 
                   'book4_A_Feast_For_Crows', 'm_mother', 'm_age', 
                   'numDeadRelations']

}

## Dynamically printing each explanatory variable set 

In [455]:
# printing candidate variable sets
print(f"""
/--------------------------\\
|Explanatory Variable Sets |
\\--------------------------/

Full Model:
-----------
{candidate_dict['logit_full']}


First Significant p-value Model:
--------------------------------
{candidate_dict['logit_sig_1']}


Second Significant p-value Model:
---------------------------------
{candidate_dict['logit_sig_2']}
""")


/--------------------------\
|Explanatory Variable Sets |
\--------------------------/

Full Model:
-----------
['S_No', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 'book4_A_Feast_For_Crows', 'book5_A_Dance_with_Dragons', 'isNoble', 'numDeadRelations', 'popularity', 'm_title', 'm_culture', 'm_house', 'm_isAliveMother', 'm_isAliveFather', 'm_isAliveHeir', 'm_isAliveSpouse', 'm_age', 'has_dead_relations']


First Significant p-value Model:
--------------------------------
['S_No', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 'm_house', 'book4_A_Feast_For_Crows', 'm_isAliveMother', 'has_dead_relations']


Second Significant p-value Model:
---------------------------------
['S_No', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 'book4_A_Feast_For_Crows', 'm_mother', 'm_age', 'numDeadRelations']



## Building a logistic regression model in scikit-learn 

In [456]:
# train/test split with the full model
got_df_data   =  got_df.loc[ : , candidate_dict['logit_sig_1']]
got_df_target =  got_df.loc[ : , 'isAlive']


# This is the exact code we were using before
x_train, x_test, y_train, y_test = train_test_split(
            got_df_data,
            got_df_target,
            test_size    = 0.10,
            random_state = 219,
            stratify     = got_df_target)


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            C = 1,
                            random_state = 219)


# FITTING the training data
logreg_fit = logreg.fit(x_train, y_train)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(x_test)


# SCORING the results
print('Training ACCURACY:', logreg_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', logreg_fit.score(x_test, y_test).round(4))
print('AUC Score:', roc_auc_score(y_true  = y_test,
                    y_score = logreg_pred).round(decimals = 4))


# saving scoring data for future use
logreg_train_score = logreg_fit.score(x_train, y_train).round(4) # accuracy
logreg_test_score  = logreg_fit.score(x_test, y_test).round(4) # accuracy
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = logreg_pred).round(decimals = 4)

Training ACCURACY: 0.767
Testing  ACCURACY: 0.8513
AUC Score: 0.7428


In [457]:
# unpacking the confusion matrix
logreg_tn, \
logreg_fp, \
logreg_fn, \
logreg_tp = confusion_matrix(y_true = y_test, y_pred = logreg_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")


True Negatives : 26
False Positives: 24
False Negatives: 5
True Positives : 140



# Classification Trees

## Full Classification Tree 

In [458]:
# INSTANTIATING a classification tree object
full_tree = DecisionTreeClassifier()


# FITTING the training data
full_tree_fit = full_tree.fit(x_train, y_train)


# PREDICTING on new data
full_tree_pred = full_tree_fit.predict(x_test)


# SCORING the model
print('Full Tree Training ACCURACY:', full_tree_fit.score(x_train,
                                                     y_train).round(4))

print('Full Tree Testing ACCURACY :', full_tree_fit.score(x_test,
                                                     y_test).round(4))

print('Full Tree AUC Score:', roc_auc_score(y_true  = y_test,
                                            y_score = full_tree_pred).round(4))


# saving scoring data for future use
full_tree_train_score = full_tree_fit.score(x_train, y_train).round(4) # accuracy
full_tree_test_score  = full_tree_fit.score(x_test, y_test).round(4)   # accuracy


# saving AUC
full_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                      y_score = full_tree_pred).round(4) # auc

Full Tree Training ACCURACY: 1.0
Full Tree Testing ACCURACY : 0.6769
Full Tree AUC Score: 0.619


In [459]:
# unpacking the confusion matrix
full_tree_tn, \
full_tree_fp, \
full_tree_fn, \
full_tree_tp = confusion_matrix(y_true = y_test, y_pred = full_tree_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {full_tree_tn}
False Positives: {full_tree_fp}
False Negatives: {full_tree_fn}
True Positives : {full_tree_tp}
""")


True Negatives : 25
False Positives: 25
False Negatives: 38
True Positives : 107



## Pruned Classification Tree 

In [460]:
# INSTANTIATING a classification tree object
tree_pruned = DecisionTreeClassifier(max_depth = 4,
                    min_samples_leaf = 25,
                    random_state = 219)


# FITTING the training data
tree_pruned_fit = tree_pruned.fit(x_train, y_train)


# PREDICTING on new data
tree_pred = tree_pruned_fit.predict(x_test)


# SCORING the model
print('Training ACCURACY:', tree_pruned_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', tree_pruned_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = tree_pred).round(4))


# saving scoring data for future use
tree_pruned_train_score = tree_pruned_fit.score(x_train, y_train).round(4) # accuracy
tree_pruned_test_score  = tree_pruned_fit.score(x_test, y_test).round(4) # accuracy


# saving auc score
tree_pruned_auc_score   = roc_auc_score(y_true  = y_test,
                                        y_score = tree_pred).round(4) # auc

Training ACCURACY: 0.7801
Testing  ACCURACY: 0.8205
AUC Score        : 0.7417


In [461]:
# unpacking the confusion matrix
tree_pruned_tn, \
tree_pruned_fp, \
tree_pruned_fn, \
tree_pruned_tp = confusion_matrix(y_true = y_test, y_pred = tree_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tree_pruned_tn}
False Positives: {tree_pruned_fp}
False Negatives: {tree_pruned_fn}
True Positives : {tree_pruned_tp}
""")


True Negatives : 29
False Positives: 21
False Negatives: 14
True Positives : 131



# Random Forests

## Importing Remaining Necessary Packages 

In [462]:
# new tools
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm

#machine learning
from sklearn.model_selection import RandomizedSearchCV # hyperparameter tuning
from sklearn.metrics import make_scorer                # customizable scorer

## Default Random Forest 

In [463]:
# INSTANTIATING a random forest model with default values
rf_default = RandomForestClassifier(n_estimators     = 100,
                                    criterion        = 'gini',
                                    max_depth        = 4,
                                    min_samples_leaf = 1,
                                    bootstrap        = True,
                                    warm_start       = False,
                                    random_state     = 219)

In [464]:
# FITTING the training data
rf_default_fit = rf_default.fit(x_train, y_train)


# PREDICTING based on the testing set
rf_default_fit_pred = rf_default_fit.predict(x_test)


# SCORING the results
print('Training ACCURACY:', rf_default_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', rf_default_fit.score(x_test, y_test).round(4))


# saving AUC score
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = rf_default_fit_pred).round(4))


# saving scoring data for future use
rf_train_score = rf_default_fit.score(x_train, y_train).round(4) # accuracy
rf_test_score  = rf_default_fit.score(x_test, y_test).round(4)   # accuracy
rf_auc_score   = roc_auc_score(y_true  = y_test,
                                          y_score = rf_default_fit_pred).round(4)

Training ACCURACY: 0.7744
Testing  ACCURACY: 0.841
AUC Score        : 0.69


In [465]:
# unpacking the confusion matrix
rf_tn, \
rf_fp, \
rf_fn, \
rf_tp = confusion_matrix(y_true = y_test, y_pred = rf_default_fit_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {rf_tn}
False Positives: {rf_fp}
False Negatives: {rf_fn}
True Positives : {rf_tp}
""")


True Negatives : 19
False Positives: 31
False Negatives: 0
True Positives : 145



## Tuning Model's Hyper-parameters 

In the following cell, the n_iter has been fixed to 96 after Python informed me it could no go over 96 iterations considering the data

In [466]:
# FITTING the training data
rf_default_fit = rf_default.fit(x_train, y_train)


# PREDICTING based on the testing set
rf_default_fit_pred = rf_default_fit.predict(x_test)


# declaring a hyperparameter space
estimator_range  = np.arange(100, 1100, 250)
leaf_range       = np.arange(1, 31, 10)
criterion_range  = ['gini', 'entropy']
bootstrap_range  = [True, False]
warm_start_range = [True, False]


# creating a hyperparameter grid
param_grid = {'n_estimators'     : estimator_range,
              'min_samples_leaf' : leaf_range,
              'criterion'        : criterion_range,
              'bootstrap'        : bootstrap_range,
              'warm_start'       : warm_start_range}


# INSTANTIATING the model object without hyperparameters
forest_grid = RandomForestClassifier(random_state = 219)


# GridSearchCV object
forest_cv = RandomizedSearchCV(estimator           = forest_grid,
                               param_distributions = param_grid,
                               cv         = 3,
                               n_iter     = 96,
                               scoring    = make_scorer(roc_auc_score,
                                            needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
forest_cv.fit(got_df_data, got_df_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", forest_cv.best_params_)
print("Tuned Training AUC:", forest_cv.best_score_.round(4))

Tuned Parameters  : {'warm_start': True, 'n_estimators': 100, 'min_samples_leaf': 21, 'criterion': 'gini', 'bootstrap': True}
Tuned Training AUC: 0.5051


In [467]:
# best estimators based on RandomizedSearchCV
forest_cv.best_estimator_

RandomForestClassifier(min_samples_leaf=21, random_state=219, warm_start=True)

## Tuned Forest  

In [468]:
# building a model based on hyperparameter tuning results

# INSTANTIATING with best_estimator
forest_tuned = RandomForestClassifier(criterion='gini', min_samples_leaf=21,
                       n_estimators=600, random_state=219, warm_start=True,
                                     bootstrap = True)


# FITTING to the FULL DATASET (due to cross-validation)
forest_tuned_fit = forest_tuned.fit(got_df_data, got_df_target)


# PREDICTING based on the testing set
forest_tuned_pred = forest_tuned_fit.predict(x_test)


# SCORING the results
print('Forest Tuned Training ACCURACY:', forest_tuned.score(x_train, y_train).round(4))
print('Forest Tuned Testing  ACCURACY:', forest_tuned.score(x_test, y_test).round(4))
print('Forest Tuned AUC Score        :', roc_auc_score(y_true  = y_test,
                                                       y_score = forest_tuned_pred).round(4))


# saving scoring data for future use
forest_tuned_train_score = forest_tuned.score(x_train, y_train).round(4) # accuracy
forest_tuned_test_score  = forest_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score
forest_tuned_auc = roc_auc_score(y_true  = y_test,
                                 y_score = forest_tuned_pred).round(4) # auc

Forest Tuned Training ACCURACY: 0.7773
Forest Tuned Testing  ACCURACY: 0.8615
Forest Tuned AUC Score        : 0.7366


In [469]:
# unpacking the confusion matrix
tuned_rf_tn, \
tuned_rf_fp, \
tuned_rf_fn, \
tuned_rf_tp = confusion_matrix(y_true = y_test, y_pred = forest_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tuned_rf_tn}
False Positives: {tuned_rf_fp}
False Negatives: {tuned_rf_fn}
True Positives : {tuned_rf_tp}
""")


True Negatives : 24
False Positives: 26
False Negatives: 1
True Positives : 144



# Gradient Boosted Machines

## Full Gradient Boosted Machine 

In [470]:
# INSTANTIATING the model object without hyperparameters
full_gbm_default = GradientBoostingClassifier(loss          = 'deviance',
                                              learning_rate = 0.1,
                                              n_estimators  = 100,
                                              criterion     = 'friedman_mse',
                                              max_depth     = 3,
                                              warm_start    = False,
                                              random_state  = 219)


# FIT step is needed as we are not using .best_estimator
full_gbm_default_fit = full_gbm_default.fit(x_train, y_train)


# PREDICTING based on the testing set
full_gbm_default_pred = full_gbm_default_fit.predict(x_test)


# SCORING the results
print('Training ACCURACY:', full_gbm_default_fit.score(x_train, y_train).round(4))
print('Testing ACCURACY :', full_gbm_default_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = full_gbm_default_pred).round(4))

#Saving the results for future use
full_gbm_training = full_gbm_default_fit.score(x_train, y_train).round(4)
full_gbm_test     = full_gbm_default_fit.score(x_test, y_test).round(4)
full_gbm_auc      =  roc_auc_score(y_true  = y_test,
                                          y_score = full_gbm_default_pred).round(4)

Training ACCURACY: 0.8172
Testing ACCURACY : 0.8256
AUC Score        : 0.7255


In [471]:
# unpacking the confusion matrix
gbm_default_tn, \
gbm_default_fp, \
gbm_default_fn, \
gbm_default_tp = confusion_matrix(y_true = y_test, y_pred = full_gbm_default_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {gbm_default_tn}
False Positives: {gbm_default_fp}
False Negatives: {gbm_default_fn}
True Positives : {gbm_default_tp}
""")


True Negatives : 26
False Positives: 24
False Negatives: 10
True Positives : 135



## Tuning Model's Hyper-parameters 

Running the following code ended up in not complying to the time restrictions. However, to in order to have a better understanding of the values in the tuned GBM model, I left the whole code but put every line as a comment so it would not affect the execution time. The parameters in the Tuned model have been filled after the two following cells ran the first time.

In [472]:
# declaring a hyperparameter space
#learn_range        = np.arange(0.1, 2.2, 0.5)
#estimator_range    = np.arange(100, 501, 25)
#depth_range        = np.arange(2, 11, 2)
#warm_start_range   = [True, False]

# creating a hyperparameter grid
#param_grid = {'learning_rate' : learn_range,
#              'max_depth'     : depth_range,
#              'n_estimators'  : estimator_range,
#              'warm_start'    : warm_start_range}


# INSTANTIATING the model object without hyperparameters
#full_gbm_grid = GradientBoostingClassifier(random_state = 219)


# GridSearchCV object
#full_gbm_cv = RandomizedSearchCV(estimator     = full_gbm_grid,
#                           param_distributions = param_grid,
#                          cv                  = 3,
#                           n_iter              = 500,
#                           random_state        = 219,
#                           scoring             = make_scorer(roc_auc_score,
#                                                 needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
#full_gbm_cv.fit(got_df_data, got_df_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
#print("Tuned Parameters  :", full_gbm_cv.best_params_)
#print("Tuned Training AUC:", full_gbm_cv.best_score_.round(4))

In [473]:
# checking the best estimator for the model
#full_gbm_cv.best_estimator_

## Tuned Gradient Boosted Machine 

In [474]:
# INSTANTIATING with best_estimator
gbm_tuned = GradientBoostingClassifier(learning_rate = 0.1,
                                       max_depth     = 2,
                                       n_estimators  = 125,
                                       warm_start    = False,
                                       random_state  = 219)


# FITTING to the FULL DATASET (due to cross-validation)
gbm_tuned_fit = gbm_tuned.fit(got_df_data, got_df_target)


# PREDICTING based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(x_test)


# SCORING the results
print('Training ACCURACY:', gbm_tuned_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', gbm_tuned_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = gbm_tuned_pred).round(4))

#Saving the results for future use
gbm_tuned_training = gbm_tuned_fit.score(x_train, y_train).round(4)
gbm_tuned_test     = gbm_tuned_fit.score(x_test, y_test).round(4)
gbm_tuned_auc      = roc_auc_score(y_true  = y_test,
                                          y_score = gbm_tuned_pred).round(4)

Training ACCURACY: 0.7984
Testing  ACCURACY: 0.8615
AUC Score        : 0.7824


In [475]:
# unpacking the confusion matrix
gbm_tuned_tn, \
gbm_tuned_fp, \
gbm_tuned_fn, \
gbm_tuned_tp = confusion_matrix(y_true = y_test, y_pred = gbm_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {gbm_tuned_tn}
False Positives: {gbm_tuned_fp}
False Negatives: {gbm_tuned_fn}
True Positives : {gbm_tuned_tp}
""")


True Negatives : 31
False Positives: 19
False Negatives: 8
True Positives : 137



# Classification Modeling with KNN

In [476]:
# INSTANTIATING StandardScaler()
scaler = StandardScaler()


# FITTING the data
scaler.fit(got_df_data)


# TRANSFORMING the data
x_scaled     = scaler.transform(got_df_data)


# converting to a DataFrame
x_scaled_df  = pd.DataFrame(x_scaled) 


# train-test split with the scaled data
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
            x_scaled_df,
            got_target,
            random_state = 219,
            test_size    = 0.10,
            stratify     = got_df_target)


# INSTANTIATING a KNN classification model with optimal neighbors
knn_opt = KNeighborsClassifier(n_neighbors = opt_neighbors)


# FITTING the training data
knn_fit = knn_opt.fit(x_train_scaled, y_train_scaled)


# PREDICTING based on the testing set
knn_pred = knn_fit.predict(x_test_scaled)


# SCORING the results
print('Training ACCURACY:', knn_fit.score(x_train_scaled, y_train_scaled).round(4))
print('Testing  ACCURACY:', knn_fit.score(x_test_scaled, y_test_scaled).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = knn_pred).round(4))


# saving scoring data
knn_train_score = knn_fit.score(x_train_scaled, y_train_scaled).round(4)
knn_test_score  = knn_fit.score(x_test_scaled, y_test_scaled).round(4)


# saving AUC score
knn_auc_score   = roc_auc_score(y_true  = y_test,
                                          y_score = knn_pred).round(4)

Training ACCURACY: 0.7796
Testing  ACCURACY: 0.8205
AUC Score        : 0.6828


In [477]:
# unpacking the confusion matrix
knn_tree_tn, \
knn_tree_fp, \
knn_tree_fn, \
knn_tree_tp = confusion_matrix(y_true = y_test, y_pred = knn_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {knn_tree_tn}
False Positives: {knn_tree_fp}
False Negatives: {knn_tree_fn}
True Positives : {knn_tree_tp}
""")


True Negatives : 20
False Positives: 30
False Negatives: 5
True Positives : 140



# Model Comparison and Selection

In [480]:
# comparing models
print(f"""
Model        Training Acc      Testing Acc      AUC Score      TN, FP, FN, TP
-----        ------------      -----------      ---------      --------------
Logistic       {logreg_train_score}              {logreg_test_score}         {logreg_auc_score}       {logreg_tn, logreg_fp, logreg_fn, logreg_tp}
Full Tree      {full_tree_train_score}                {full_tree_test_score}         {full_tree_auc_score}        {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}
Pruned Tree    {tree_pruned_train_score}             {tree_pruned_test_score}         {tree_pruned_auc_score}       {tree_pruned_tn, tree_pruned_fp, tree_pruned_fn, tree_pruned_tp}  
Random Forest  {rf_train_score}             {rf_test_score}          {rf_auc_score}         {rf_tn, rf_fp, rf_fn, rf_tp}
Forest Tuned   {forest_tuned_train_score}             {forest_tuned_test_score}         {forest_tuned_auc}       {tuned_rf_tn, tuned_rf_fp, tuned_rf_fn, tuned_rf_tp}
Full gbm       {full_gbm_training}             {full_gbm_test}         {full_gbm_auc}       {gbm_default_tn, gbm_default_fp, gbm_default_fn, gbm_default_tp}
Tuned gbm(*)   {gbm_tuned_training}             {gbm_tuned_test}         {gbm_tuned_auc}       {gbm_tuned_tn, gbm_tuned_fp, gbm_tuned_fn, gbm_tuned_tp}
KNN Tree       {knn_train_score}             {knn_test_score}         {knn_auc_score}       {knn_tree_tn, knn_tree_fp, knn_tree_fn, knn_tree_tp}


(*Final model)
""")



Model        Training Acc      Testing Acc      AUC Score      TN, FP, FN, TP
-----        ------------      -----------      ---------      --------------
Logistic       0.767              0.8513         0.7428       (26, 24, 5, 140)
Full Tree      1.0                0.6769         0.619        (25, 25, 38, 107)
Pruned Tree    0.7801             0.8205         0.7417       (29, 21, 14, 131)  
Random Forest  0.7744             0.841          0.69         (19, 31, 0, 145)
Forest Tuned   0.7773             0.8615         0.7366       (24, 26, 1, 144)
Full gbm       0.8172             0.8256         0.7255       (26, 24, 10, 135)
Tuned gbm(*)   0.7984             0.8615         0.7824       (31, 19, 8, 137)
KNN Tree       0.7796             0.8205         0.6828       (20, 30, 5, 140)


(*Final model)

