In [None]:
# Variable set-up

# --- Settings to vary between runs ---
rq = 'rq2'  # Research question. Options: 'rq1', 'rq2'
cv = 'ts'  # Cross-validation scheme. Options: 'ts' (time series split, ignore siblings), 'ss' (stratified shuffle, ignore siblings)
data_type = 'all'  # Options: 'str' (just structured data), 'all' (structured data and list of strings)
algorithm_names = ['decision_tree', 'logistic_regression', 'gradient_boosting']
rcv_n_iter = 50  # Number of randomised-search iterations: more iterations search more candidates
parameters = 2  # Label that is combined with rcv_n_iter to form `levers` (used in output names below)

# --- Derived names: don't change ---
# The y / siblings files always use the 'str' stub, regardless of data_type
file_stub_y_siblings = '_'.join([rq, cv, 'str'])
# File stub for saving, in the format e.g. rq1_ss_str
file_stub = '_'.join([rq, cv, data_type])
levers = '{}_{}'.format(rcv_n_iter, parameters)
print('_'.join([file_stub, levers]))

In [None]:
## File directories
# Path fragments shared by the directories below; replace [username] once here
documents_dir = '/Users/[username]/Documents' # Insert [username]
transfer_folder = 'Final transfer out data and code to WWC Jan 2020'

# Local folder that results, models and sample-size tables are written to
local_dir = documents_dir + '/' + transfer_folder
# Folder on the external drive that the model input data is read from
hard_drive_dir = '/Volumes/diskAshur2/' + transfer_folder + '/Data for model/Use'
# Folder holding the variable-label lookup used for the summary tables
summary_info = documents_dir + '/Summary information'

In [None]:
# Vary hyperparameters

## Starting hyperparameters
# Logistic regression (regularised, unregularised) - start with regularisation strength of 1
# Gradient boosting - start with 50 trees, max_depth = 10, min_leaf_node = 5, max_features = sqrt(no. of features) 
# Decision Tree

# https://stackoverflow.com/questions/44572109/what-are-the-arguments-for-scipy-stats-uniform
# For scipy.stats.uniform, the first argument is the lower bound, and the second argument is the range of the distribution

import numpy as np
import scipy.stats as stats
from sklearn.utils.fixes import loguniform

# Randomised-search space for the decision tree ('dtc' pipeline step).
# - max_features: 50 candidate fractions drawn from [0, 0.8). A seeded generator is used so the
#   candidate list is identical on every kernel run (the global np.random state is unseeded).
# - max_depth: integers 5..20 inclusive. scikit-learn documents max_depth as an int, so an integer
#   distribution replaces the continuous uniform over the same [5, 20] range.
# - min_samples_split: integers 20..39 (scipy's randint excludes the upper bound).
parameters_dtc = [{'dtc__max_features': np.random.RandomState(3005).uniform(low=0, high=0.8, size=50),
                'dtc__max_depth': stats.randint(5, 21),
                'dtc__min_samples_split': stats.randint(20, 40)
                  }]

# Randomised-search space for the logistic regression ('lr' pipeline step)
# Penalty: 'none' (no regularisation) wasn't accepted; l1 selects features, l2 pulls the weights down
lr_penalty_options = ['l1', 'l2']
# C is sampled log-uniformly between 1e-3 and 1e-1; smaller values => stronger regularisation
lr_C_distribution = loguniform(1e-3, 1e-1)
parameters_lr = [{'lr__penalty': lr_penalty_options,
                  'lr__C': lr_C_distribution}]

# Randomised-search space for gradient boosting ('gbc' pipeline step).
# - n_estimators: integers 50..149, max_depth: integers 5..9 (scipy's randint excludes the upper bound).
# - max_features: 50 candidate fractions drawn from [0, 0.8). A seeded generator is used so the
#   candidate list is identical on every kernel run (the global np.random state is unseeded).
parameters_gbc = [{'gbc__n_estimators': stats.randint(50, 150),
                'gbc__max_depth': stats.randint(5, 10),
                'gbc__max_features': np.random.RandomState(3005).uniform(low=0, high=0.8, size=50)
                }]





### Below here - generic set-up - don't vary. Change which models are run and output saved by varying the parameters above

In [None]:
# Load user-written functions 

%load_ext autoreload
%autoreload 2

import analysis_functions

In [None]:
# Set working directory
import os
import pickle
# Relative paths used by the read_csv / to_csv calls in later cells resolve against this directory
os.chdir(hard_drive_dir)
# Last expression of the cell: displays the working directory so the change can be confirmed
os.getcwd()

In [None]:
# Import structured data -train, test data
import pandas as pd
def _read_feature_frame(csv_name):
    """Read a feature CSV (first column as index), print its shape, and return it with a fresh 0..n-1 index."""
    frame = pd.read_csv(csv_name, index_col = 0)
    print(frame.shape)
    frame = frame.reset_index(drop = True)
    print(frame.index)
    return frame


def _read_headerless_column(csv_name):
    """Read a header-less CSV (first column as index) and return the column labelled 1 as a Series with a fresh 0..n-1 index."""
    frame = pd.read_csv(csv_name, index_col = 0, header = None)
    print(frame.shape)
    frame = frame.reset_index(drop = True)
    column = pd.Series(frame[1])
    print(column.index)
    return column


# Features use the stub for the chosen data_type
X_tr = _read_feature_frame("X_train_{}.csv".format(file_stub))

X_test = _read_feature_frame("X_test_{}.csv".format(file_stub))

# Outcomes and sibling groups are always read from the 'str' stub
y_tr = _read_headerless_column("y_train_{}.csv".format(file_stub_y_siblings))

y_test = _read_headerless_column("y_test_{}.csv".format(file_stub_y_siblings))

siblings_tr = _read_headerless_column("siblings_train_{}.csv".format(file_stub_y_siblings))

siblings_test = _read_headerless_column("siblings_test_{}.csv".format(file_stub_y_siblings))

In [None]:
import matplotlib.pyplot as plt
#plt.scatter(X_tr['ReferralDatetime'], y_tr)
# Class counts within the first two thirds of the training rows (displayed as the cell output)
two_thirds_of_rows = 2 * int(y_tr.shape[0] / 3)
y_tr[0:two_thirds_of_rows].value_counts()

In [None]:
# Columns present in the test data but absent from the training data (displayed as the cell output)
set(X_test.columns) - set(X_tr.columns)

In [None]:
# Training and test data must have exactly the same set of columns (checked in both directions)
assert not (set(X_test.columns) - set(X_tr.columns))
assert not (set(X_tr.columns) - set(X_test.columns))

In [None]:
# No column whose name contains 'ethnicity' may be present in either dataset
assert not any('ethnicity' in col for col in X_tr.columns)
assert not any('ethnicity' in col for col in X_test.columns)

In [None]:
import pandas as pd

# Overall sample size (training + test) and percentage of positive cases, saved as a small table
sample_size = y_tr.shape[0] + y_test.shape[0]
print(sample_size)
# Count positives in each split separately: adding the two value_counts() results and indexing [1]
# yields NaN when the positive class is absent from one of the splits
positive_cases = (y_tr == 1).sum() + (y_test == 1).sum()
percentage_positive_case = round(positive_cases / sample_size * 100, 2)
print(percentage_positive_case)

data_description = pd.Series({'Sample size': sample_size,
                    '% positive case': percentage_positive_case})


data_description.to_csv('{}/Sample Sizes/Data Description_{}.csv'.format(local_dir, file_stub))


In [None]:
#assert 1==2

In [None]:
# Check no overlapping Child_IDs or siblings
# At most one PSID value may appear in both the training and the test data
# NOTE(review): the checks tolerate a single shared value rather than none - confirm this is intended
shared_child_ids = set(X_tr['PSID']) & set(X_test['PSID'])
assert len(shared_child_ids) <= 1

# At most one sibling-group value may appear in both splits
# (presumably the sibling_na placeholder used further down - verify)
shared_sibling_groups = set(siblings_tr) & set(siblings_test)
assert len(shared_sibling_groups) <= 1

In [None]:
# Display the names of the training columns that are not numeric
X_tr.select_dtypes(exclude='number').columns

In [None]:
# Delete duplicate columns
def _without_duplicate_columns(frame):
    """Return `frame` keeping only the first column for each repeated column label; prints the shape before and after."""
    print(frame.shape)
    is_first_occurrence = ~frame.columns.duplicated()
    deduplicated = frame.loc[:, is_first_occurrence]
    print(deduplicated.shape)
    return deduplicated


X_tr = _without_duplicate_columns(X_tr)

X_test = _without_duplicate_columns(X_test)

In [None]:
## Check unique values
# Lots of unique values => personal ID
# Only one unique value => feature doesn't vary
import operator
# Number of distinct values per training column
dict_unique = {col: len(X_tr[col].unique()) for col in X_tr.columns}
# Columns holding a single distinct value carry no information for the model
non_varying_cols = [col for col, n_distinct in dict_unique.items() if n_distinct == 1]
# (column, count) pairs ordered from fewest to most distinct values
dict_unique_sorted = sorted(dict_unique.items(), key=lambda pair: pair[1])
print('Unique dictionary: ', dict_unique_sorted)

In [None]:
# Drop non varying colums (as they won't add anything to the model)
# Shapes before dropping
print(X_tr.shape)
print(X_test.shape)
# errors='ignore' so columns already absent from a frame are skipped rather than raising
X_tr = X_tr.drop(columns = non_varying_cols, errors = 'ignore')
X_test = X_test.drop(columns = non_varying_cols, errors = 'ignore')
# Shapes after dropping
print(X_tr.shape)
print(X_test.shape)

In [None]:
# Display the names of the training columns that are still not numeric after dropping non-varying columns
X_tr.select_dtypes(exclude='number').columns

In [None]:
# Drop columns that are entirely missing in the training data, then keep exactly the same columns in the test data
print(X_tr.shape)
X_tr = X_tr.dropna(axis=1, how = 'all')
print(X_tr.shape)
training_columns = X_tr.columns
X_test = X_test[training_columns]
print(X_test.shape)

In [None]:
# Display the names of the training columns that are still not numeric after dropping all-missing columns
X_tr.select_dtypes(exclude='number').columns

In [None]:
# Drop columns where 70% or more of the values are missing (i.e. fewer than about 30% are observed)
# Missingness is otherwise handled within each fold in the gridsearch cv pipeline
import pandas as pd
# Remove columns that are entirely missing
print(X_tr.shape)
X_tr = X_tr.dropna(axis=1, how='all')
print(X_tr.shape)

# Percentage of missing values per column (0-100 scale)
percent_missing = X_tr.isnull().sum() * 100 / len(X_tr)
missing_value_df = pd.DataFrame({'column_name': X_tr.columns,
                                 'percent_missing': percent_missing}).reset_index()
# Threshold is on the 0-100 scale: columns that are at least 70% missing are dropped
at_least_70_percent_missing = missing_value_df['percent_missing'] >= 70
cols_to_drop = missing_value_df.loc[at_least_70_percent_missing, 'column_name'].tolist()
X_tr = X_tr.drop(columns = cols_to_drop)
print(X_tr.shape)

# Test data should match the columns in the training data
print(X_test.shape)
X_test = X_test[list(X_tr.columns)]
print(X_test.shape)

assert (X_tr.shape[1] == X_test.shape[1])

In [None]:
# Display the names of the training columns that are still not numeric after dropping mostly-missing columns
X_tr.select_dtypes(exclude='number').columns

In [None]:
# Drop columns with possible information leakage

# Columns flagged as possible information leakage; removed from both datasets
possible_information_leakage_cols = [
    'previous_mean_gaptoOtherEscalationstart',
    'previous_mean_gaptocpplanstart',
    'previous_mean_gaptoLACstart',
    'previous_mean_opencaselength',
    'AssessmentType_ Child Social Work Assessment',
    'AssessmentType_ Child Social Work Assessment for Review Child Protection Conference',
    'AssessmentType_ Child Social Work Assessment to Initial Child Protection Conference',
]
# errors='ignore' so columns already removed by the earlier steps do not raise
for frame in (X_tr, X_test):
    frame.drop(columns = possible_information_leakage_cols, inplace = True, errors = 'ignore')
print(X_tr.shape)
print(X_test.shape)

In [None]:
# Display the names of the training columns that are still not numeric after dropping the leakage columns
X_tr.select_dtypes(exclude='number').columns

In [None]:
# Handle year (this was missed in data cleaning)

# Replace the month-year column with the integer calendar year, in both datasets
for frame in (X_tr, X_test):
    referral_month_year = pd.to_datetime(frame['ReferralDatetime_month_year'])
    frame['ReferralDatetime_month_year'] = referral_month_year.dt.year.map(int)

# Drop key cols for the purposes of handling missing and resampling (will add back into the saved dataset later)
candidate_key_cols = ['PSID','ReferralDatetime', 'ReferralDatetime_previous']
# Only the key columns actually present in the training data are set aside
key_cols = [col for col in candidate_key_cols if col in X_tr.columns]

# Copies of the key columns, kept so they can be merged back by row index later
X_tr_key_cols_df = X_tr[key_cols]
print(X_tr_key_cols_df.shape)
print(X_tr_key_cols_df.index)

X_test_key_cols_df = X_test[key_cols]
print(X_test_key_cols_df.shape)
print(X_test_key_cols_df.index)

for frame in (X_tr, X_test):
    frame.drop(columns = key_cols, inplace = True)

In [None]:
## Downsample the majority class (fine that it's outside the cross validation loop)
# 
from imblearn.pipeline import Pipeline # Use this Pipeline, sklearn doesn't like ADASYN in its Pipeline (not a transformer)
from imblearn.under_sampling import OneSidedSelection
from sklearn.impute import SimpleImputer

# --- Missing values: constant fill plus one '<col>_missing' indicator per column with gaps ---
handle_missing = SimpleImputer(strategy='constant', add_indicator=True)

# The imputer is fitted on the training data only. The indicator columns it appends follow the
# training columns that contain at least one missing value, in column order.
X_tr_no_na = handle_missing.fit_transform(X_tr)
cols_w_missing_data = [col for col in X_tr if X_tr[col].isna().sum() != 0]
missing_cols = [col+'_missing' for col in cols_w_missing_data]
X_tr_no_na_missing_cols = list(X_tr.columns) + list(missing_cols)
X_tr_no_na = pd.DataFrame(data = X_tr_no_na, columns = X_tr_no_na_missing_cols)
print(X_tr_no_na.shape)

# The test data is transformed with the imputer fitted on the training data (not re-fitted), so it
# gets exactly the same indicator columns in the same order as the training data. Re-fitting on the
# test data would make the set and position of the indicator columns depend on which test columns
# happen to contain missing values.
assert list(X_test.columns) == list(X_tr.columns)  # transform() relies on identical column order
X_test_no_na = handle_missing.transform(X_test)
X_test_no_na_missing_cols = list(X_test.columns) + list(missing_cols)
X_test_no_na = pd.DataFrame(data = X_test_no_na, columns = X_test_no_na_missing_cols)
print(X_test_no_na.shape)

# --- Downsample the majority class of the training data ---
resampling = OneSidedSelection(random_state=3005)
pipeline = Pipeline([('resampling',resampling)])

# The resampled arrays returned here are not used: they are not sorted correctly.
# Only the indices of the retained rows are taken from the fitted sampler.
X_tr_transformed, y_tr_transformed = pipeline.fit_resample(X_tr_no_na, y_tr)
sample_indices = pipeline.named_steps['resampling'].sample_indices_


def _select_resampled_rows(data):
    """Return the rows of `data` retained by the sampler, in their original order, with a fresh 0..n-1 index."""
    # The resampled output puts all the positive class at the end, so select by index, sort and reset
    return data.loc[sample_indices].sort_index().reset_index(drop=True)


y_tr_resampled = _select_resampled_rows(y_tr)
print(y_tr_resampled.index)

X_tr_resampled = _select_resampled_rows(X_tr_no_na)
print(X_tr_resampled.index)

siblings_tr_resampled = _select_resampled_rows(siblings_tr)
print(siblings_tr_resampled.index)

X_tr_key_cols_df_resampled = _select_resampled_rows(X_tr_key_cols_df)
print(X_tr_key_cols_df_resampled.index)

In [None]:
# Sometimes missing column doesn't appear in test so add manually
# Indicator columns present in the training data but absent from the test data are added as all-zero
# columns. They are collected in training-column order (not set order) so the result is deterministic.
test_columns = set(X_test_no_na.columns)
cols_not_in_test = [col for col in X_tr_no_na.columns if 'missing' in col and col not in test_columns]

for col in cols_not_in_test:
    X_test_no_na[col] = 0

print(X_tr_resampled.shape)
print(X_test_no_na.shape)
assert (X_tr_resampled.shape[1] == X_test_no_na.shape[1])
# Equal column counts are not enough: the same columns must be present ...
assert set(X_tr_resampled.columns) == set(X_test_no_na.columns)
# ... and in the same order as the training data, because the columns added above were appended at the end
X_test_no_na = X_test_no_na[list(X_tr_resampled.columns)]

In [None]:
# Check no missing after handling NA
# Neither dataset may contain any missing value after imputation
assert not X_tr_resampled.isna().sum().sum()
assert not X_test_no_na.isna().sum().sum()


In [None]:
## Save so above transformations on the data persists for fairness models too

# Adding back in key cols to uniquely identify the rows
# Add the key columns back (joined on the row index) so rows can be uniquely identified in the saved files
print(X_tr_resampled.shape)
X_tr_resampled = pd.merge(X_tr_resampled, X_tr_key_cols_df_resampled, left_index = True, right_index = True)
print(X_tr_resampled.shape)

print(X_test_no_na.shape)
X_test_no_na = pd.merge(X_test_no_na, X_test_key_cols_df, left_index = True, right_index = True)
print(X_test_no_na.shape)

# Objects to save, paired with their path template (relative to the working directory)
final_outputs = [
    (X_tr_resampled, "Final/X_train_{}_final.csv"),
    (y_tr_resampled, "Final/y_train_{}_final.csv"),
    (siblings_tr_resampled, "Final/siblings_train_{}_final.csv"),
    (X_test_no_na, "Final/X_test_{}_final.csv"),  # Remains unsampled so that it stays close to real world scenario
]
for data_to_save, path_template in final_outputs:
    data_to_save.to_csv(path_template.format(file_stub))

In [None]:
# Display the total number of missing values in the test data (includes the key columns merged back above)
X_test_no_na.isna().sum().sum()

In [None]:
# Print the non-numeric column names of the training data, then of the test data
for frame in (X_tr_resampled, X_test_no_na):
    print(frame.select_dtypes(exclude = 'number').columns)

In [None]:
# Drop key columns (after data is saved as still need the keys for merging in 7a_Combine_and_Split_Data and 8_Fairness)
print(X_tr_resampled.shape)
print(X_test_no_na.shape)

# The list is NOT filtered against X_tr.columns: X_tr had its key columns removed earlier, so that
# filter always produced an empty list and no key column was dropped here (a numeric PSID would then
# remain as a model feature). errors='ignore' already copes with key columns that are absent.
key_cols = ['PSID','ReferralDatetime', 'ReferralDatetime_previous']
X_tr_resampled = X_tr_resampled.drop(columns = key_cols, errors = 'ignore')
X_test_no_na = X_test_no_na.drop(columns = key_cols, errors = 'ignore')
print(X_tr_resampled.shape)
print(X_test_no_na.shape)

# Keep numeric columns only
X_tr_resampled = X_tr_resampled.select_dtypes(include = 'number')
X_test_no_na = X_test_no_na.select_dtypes(include = 'number')
print(X_tr_resampled.shape)
print(X_test_no_na.shape)

In [None]:
# Check no string columns except those defined as text
# Both datasets must now consist of numeric columns only
for frame in (X_tr_resampled, X_test_no_na):
    non_numeric_columns = frame.select_dtypes(exclude = 'number').columns
    assert len(non_numeric_columns) == 0

In [None]:
# Create dataframe for saving model results
# But mode='x' so doesn't overwrite saved results

import pandas as pd

# Header of the per-algorithm results files
cols_for_model_results = [
    'mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
    'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score',
    'std_test_score', 'rank_test_score',
    'dtc__max_depth', 'dtc__max_features', 'dtc__min_samples_split',
    'lr__C', 'lr__penalty',
    'gbc__max_depth', 'gbc__max_features', 'gbc__n_estimators',
    'algorithm', 'rcv_n_iter', 'parameter_combination',
]

# Empty frame: only the header row is written
df_model_outputs = pd.DataFrame(columns = cols_for_model_results)

# One results file per algorithm. mode='x' creates the file only if it does not exist yet,
# so results saved by an earlier run are never overwritten.
for results_algorithm in ('decision_tree', 'logistic_regression', 'gradient_boosting'):
    results_path = '{}/Models/model_output_{}_{}_{}.csv'.format(local_dir, file_stub, results_algorithm, levers)
    try:
        df_model_outputs.to_csv(results_path, mode='x')
    except FileExistsError:
        # Already created by a previous run: leave it untouched
        pass

In [None]:
# FeatureUnion concatenates, whilst ColumnTransformer allows transformations on selected columns followed by concatenation
import pandas as pd
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from text_functions import vulnerabilities_phrases_dict

# No transformation is applied to the numeric columns at this stage
numerical_transformer = None

# Structured data only: the pipeline's preprocessing step is None
if data_type == 'str':
    preprocessor = numerical_transformer

# If text columns included: name stems of the text sources, which depend on the research question
if rq == 'rq1':
    string_features = (['Contact and Referral Form_text',
       'Child Social Work Assessment for Review Child Protection Conference_text_prev',
       'Child Social Work Assessment to Initial Child Protection Conference_text_prev',
       'Child Social Work Assessment_text_prev'])
else:
    string_features = (['Child Social Work Assessment for Review Child Protection Conference_text',
           'Child Social Work Assessment to Initial Child Protection Conference_text',
           'Child Social Work Assessment_text', 'Contact and Referral Form_text',
           'Child Social Work Assessment for Review Child Protection Conference_text_prev',
           'Child Social Work Assessment to Initial Child Protection Conference_text_prev',
           'Child Social Work Assessment_text_prev',
           'Contact and Referral Form_text_prev'])


# LDA: number of topics depends on the dataset being modelled
if file_stub == 'rq2_ss_all':
    n_components=6
else:
    n_components=4


lda_model = LatentDirichletAllocation(n_components=n_components,
                                        learning_method='online',
                                     random_state=3005,
                                     evaluate_every=-1,
                                     learning_decay=.7,
                                     batch_size=64)

if data_type == 'all':
    transformers = []

    text_transformers = FeatureUnion(
                            transformer_list = [
                                ('lda', lda_model)])

    # Hoisted out of the loop: the phrase keys do not change between text sources
    vulnerability_phrases = list(vulnerabilities_phrases_dict.keys())

    for f in string_features:
        try:
            # Columns whose name contains this text source's stem ...
            f_all = [col for col in X_tr_resampled.columns if f in col]
            # ... excluding those whose name contains one of the vulnerability phrases
            f_tfidf = [s for s in f_all if not any(xs in s for xs in vulnerability_phrases)]
            transformers.append((f+'_lda', text_transformers, f_tfidf)) # LDA
        except Exception as err:
            # Still best-effort (the text source is skipped), but the failure is now reported and
            # KeyboardInterrupt / SystemExit are no longer swallowed as they were by a bare except
            print('Skipping LDA transformer for {!r}: {!r}'.format(f, err))
            continue
    # Columns not claimed by any transformer are passed through unchanged
    preprocessor = ColumnTransformer(transformers = transformers,
                                        remainder = 'passthrough')

In [None]:
from analysis_functions import StratifiedShuffleSplitGroups, TimeSeriesSplitIgnoreSiblings, grid_search_save_output

#from imblearn.over_sampling import ADASYN, SMOTE
#from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline # Use this Pipeline, sklearn doesn't like ADASYN in its Pipeline (not a transformer)
import numpy as np
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFpr, f_classif, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

    
# Cross validation
# Cross validation
# The splitters are given the sibling groups of the DOWNSAMPLED training rows: the search below is
# fitted on X_tr_resampled / y_tr_resampled, so the fold indices refer to rows of the resampled data.
# siblings_tr (before downsampling) is longer and its row positions do not line up with those rows.
# NOTE(review): the splitter classes live in analysis_functions and are not visible here - confirm
# that they index sibling_group by the row positions passed to split().
if cv == 'ts':
    tss_sib = TimeSeriesSplitIgnoreSiblings(n_splits=3, sibling_group = siblings_tr_resampled, sibling_na = 99999.0)
    cross_val = tss_sib
else:
    sssg = StratifiedShuffleSplitGroups(n_splits=5, test_size=0.2, random_state=3005, sibling_group = siblings_tr_resampled, sibling_na = 99999.0)
    cross_val = sssg

## Algorithm options
# Each algorithm gets its own pipeline; all of them start with the shared preprocessor

# Decision tree
dtc = DecisionTreeClassifier(random_state=3005)
pipeline_dtc = Pipeline([('preprocessor', preprocessor),
                        ('dtc', dtc)])

# Logistic Regression
scaler = StandardScaler()
lr = LogisticRegression(random_state=3005, fit_intercept=True, solver = 'liblinear')
pipeline_lr = (Pipeline([('preprocessor', preprocessor),
                    ('scale',scaler), # regularisation requires features in same scale
                    ('lr', lr)]))

# Gradient Boosting
gbc = GradientBoostingClassifier(random_state = 3005)
pipeline_gbc = Pipeline([('preprocessor', preprocessor),
                            ('gbc', gbc)])


# Look-ups from algorithm name to its search space and its pipeline
parameter_dict = ({'decision_tree': parameters_dtc,
                    'logistic_regression': parameters_lr,
                    'gradient_boosting': parameters_gbc})

pipeline_dict = ({'decision_tree': pipeline_dtc,
                 'logistic_regression': pipeline_lr,
                 'gradient_boosting': pipeline_gbc})

# Only the algorithms selected in algorithm_names are run, in that order
parameter_list, pipeline_list = [], []
for algorithm in algorithm_names:
    parameter_list.append(parameter_dict[algorithm])
    pipeline_list.append(pipeline_dict[algorithm])




In [None]:
# Always fails: presumably a deliberate stop so that "Run All" halts here, before the training cell below
assert 1==2

In [None]:
# Show which dataset (research question, cross-validation scheme, data type) is about to be used
print(file_stub)

In [None]:
#Training
# Runtime warnings were because when selecting features, all the feature importances were
# below the alpha value set
predictive_model_results, best_parameters_list = [], []
best_estimator_dict = {}
# The three lists are aligned: pipeline_list and parameter_list were built in algorithm_names order
for algorithm_name, search_pipeline, search_space in zip(algorithm_names, pipeline_list, parameter_list):
    print('Current algorithm: ', algorithm_name)
    # Results file for this algorithm and this combination of levers
    results_csv = '{}/Models/model_output_{}_{}_{}.csv'.format(local_dir, file_stub, algorithm_name, levers)
    # Refit refits on whole dataset using the best parameters and allows you to call predict on the gridsearchcv instance
    rcv = RandomizedSearchCV(
        search_pipeline,
        search_space,
        n_iter = rcv_n_iter,
        cv = cross_val,
        scoring = 'average_precision',
        verbose = 5,
        error_score = 0.0,  # a candidate whose fit fails is scored 0.0
        refit = True,
        random_state = 3005,
    )
    results, best_parameters, best_estimator = grid_search_save_output(
        rcv, algorithm_name, rcv_n_iter, parameters, X_tr_resampled, y_tr_resampled, results_csv)
    predictive_model_results.append(results)
    best_parameters_list.append(best_parameters)
    best_estimator_dict[algorithm_name] = best_estimator

# Persist the refitted best estimator of every algorithm in a single pickle
best_estimators_path = "{}/Models/best_estimator_{}_{}_dict.pkl".format(local_dir, file_stub, levers)
with open(best_estimators_path, "wb") as handle:
    pickle.dump(best_estimator_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)

## Restart here after training

In [None]:
import glob
import pandas as pd

# Results files of every algorithm for the current dataset and the current levers.
# `levers` replaces the previously hard-coded '50_2', so changing rcv_n_iter / parameters in the
# set-up cell loads the matching files. The directory part is escaped so that glob
# metacharacters in local_dir (e.g. square brackets) are matched literally, and the
# list is sorted so the load order does not depend on the file system.
model_output_prefix = "{}/Models/model_output_".format(local_dir)
file_list = sorted(glob.glob(glob.escape(model_output_prefix) + "{}_*_{}.csv".format(file_stub, levers)))

# Results keyed by file name, with the directory prefix and the '.csv' extension removed
file_dict = {}
for file_name in file_list:
    results_frame = pd.read_csv(file_name)
    file_name = file_name.replace(model_output_prefix, "")
    file_name = file_name.replace(".csv", "")
    file_dict[file_name] = results_frame

print(file_dict.keys())

In [None]:
import math
import numpy as np
import pandas as pd

# All algorithms' search results stacked into one frame with a fresh 0..n-1 index
model_output = pd.concat(file_dict.values(), axis = 0, ignore_index = True)

# Number of cross-validation splits used by the chosen scheme
n_splits = 3 if cv == 'ts' else 5

# Algorithm name -> name of its step in the pipeline (used as the prefix of the parameter columns)
model_dict = {'decision_tree': 'dtc',
        'logistic_regression': 'lr',
        'gradient_boosting': 'gbc'}


def _lower_bound(scores):
    """Lower bound per candidate: mean test score minus 1.96 standard errors (std / sqrt(n_splits))."""
    return scores['mean_test_score'] - 1.96 * scores['std_test_score'] / math.sqrt(n_splits)


# Identify best algorithm: the candidate with the highest lower bound across all algorithms
max_min_test_score = np.argmax(_lower_bound(model_output))
best_algorithm = model_output.loc[max_min_test_score,'algorithm']
print("Best algorithm: ", best_algorithm)
# Note: the value printed here is the mean test score of the selected candidate
print("Max min score", model_output.loc[max_min_test_score, 'mean_test_score'])

# For each algorithm find the candidate with the highest lower bound, the highest mean and its SD,
# then extract the parameters of the highest-lower-bound candidate
max_min_params_dict_all, max_min_mean_AP_dict, mean_AP_dict, sd_AP_dict = {}, {}, {}, {}
for a in model_output['algorithm'].unique():
    # Metrics
    model_output_algorithm = model_output.loc[model_output['algorithm'] == a].reset_index(drop = True)
    max_min_mean_idx = np.argmax(_lower_bound(model_output_algorithm))
    max_min_mean = model_output_algorithm.loc[max_min_mean_idx, 'mean_test_score']
    best_mean = model_output_algorithm['mean_test_score'].max()
    has_best_mean = model_output_algorithm['mean_test_score'] == best_mean
    max_min_mean_AP_dict[a] = round(max_min_mean, 4)
    mean_AP_dict[a] = round(best_mean, 4)
    # SD of the first candidate that attains the best mean
    sd_AP_dict[a] = round(model_output_algorithm.loc[has_best_mean, 'std_test_score'].values[0], 4)

    # Extract the parameters: columns carrying this algorithm's step prefix, with the prefix stripped
    column_prefix = 'param_{}'.format(model_dict[a])
    param_cols = [col for col in model_output_algorithm.columns if column_prefix in col]
    selected_params = dict(model_output_algorithm.loc[max_min_mean_idx,param_cols])
    max_min_params_dict = {name.replace(column_prefix + '__', ''): value
                           for name, value in selected_params.items()}
    max_min_params_dict_all[a] = max_min_params_dict

In [None]:
import pickle

# Load saved models
# Load saved models
# The file is opened in a `with` block so the handle is closed after loading (it was left open before).
# NOTE: pickle.load can execute arbitrary code - only load pickles written by this notebook.
with open("{}/Models/best_estimator_{}_{}_dict.pkl".format(local_dir, file_stub, levers), "rb") as handle:
    best_estimator_dict = pickle.load(handle)

In [None]:
### Retraining model with max_min

# Check whether it's the same as the maximum
    # Set the parameters and re-fit
    # Label as fitted_model

# For each algorithm: if the candidate with the highest lower bound is not the one with the highest
# mean (compared on their rounded mean scores), set its parameters and re-fit on the whole resampled
# training data; otherwise reuse the estimator already refitted by the randomised search.
#
# Integer hyperparameters are cast back to int before use: the parameter values are read from the
# results frame, where integer-valued parameters come back as floats. max_depth is now cast as well
# as n_estimators / min_samples_split, since scikit-learn documents it as an int.
best_estimator_dict_new = {}

# Gradient boosting
if max_min_mean_AP_dict['gradient_boosting'] != mean_AP_dict['gradient_boosting']:
    gbc_params = max_min_params_dict_all['gradient_boosting']
    gbc_params['n_estimators'] = int(gbc_params['n_estimators']) # n_estimators has to be integer
    if 'max_depth' in gbc_params:
        gbc_params['max_depth'] = int(gbc_params['max_depth']) # max_depth has to be integer
    gbc.set_params(**gbc_params)
    pipeline_gbc = Pipeline([('preprocessor', preprocessor),
                                ('gbc', gbc)])
    print("Fitting GBC")
    pipeline_gbc.fit(X_tr_resampled, y_tr_resampled)
    best_estimator_dict_new['gradient_boosting'] = pipeline_gbc

else:
    best_estimator_dict_new['gradient_boosting'] = best_estimator_dict['gradient_boosting']

# Decision tree
if max_min_mean_AP_dict['decision_tree'] != mean_AP_dict['decision_tree']:
    dtc_params = max_min_params_dict_all['decision_tree']
    dtc_params['min_samples_split'] = int(dtc_params['min_samples_split']) # min_samples_split has to be integer
    if 'max_depth' in dtc_params:
        dtc_params['max_depth'] = int(dtc_params['max_depth']) # max_depth has to be integer
    dtc.set_params(**dtc_params)
    pipeline_dtc = Pipeline([('preprocessor', preprocessor),
                        ('dtc', dtc)])
    print("Fitting DTC")
    pipeline_dtc.fit(X_tr_resampled, y_tr_resampled)
    best_estimator_dict_new['decision_tree'] = pipeline_dtc

else:
    best_estimator_dict_new['decision_tree'] = best_estimator_dict['decision_tree']

# Logistic regression
if max_min_mean_AP_dict['logistic_regression'] != mean_AP_dict['logistic_regression']:
    lr.set_params(**max_min_params_dict_all['logistic_regression'])
    pipeline_lr = (Pipeline([('preprocessor', preprocessor),
                    ('scale',scaler), # regularisation requires features in same scale
                    ('lr', lr)]))
    print("Fitting LR")
    pipeline_lr.fit(X_tr_resampled, y_tr_resampled)
    best_estimator_dict_new['logistic_regression'] = pipeline_lr

else:
    best_estimator_dict_new['logistic_regression'] = best_estimator_dict['logistic_regression']


In [None]:
# Always fails: presumably a deliberate stop so that "Run All" halts here, before the max-min model is saved below
assert 1==2

In [None]:
# Save fitted max min model

# Only the fitted pipeline of the best algorithm is saved.
# NOTE(review): the file name contains file_stub but not `levers` - confirm that is intended.
max_min_model_path = "{}/Models/best_estimator_{}_maxmin.pkl".format(local_dir, file_stub)
max_min_model = best_estimator_dict_new[best_algorithm]
with open(max_min_model_path, "wb") as handle:
    pickle.dump(max_min_model, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
# Import final data again (because encrypted hard drive locks)
import os
hard_drive_dir = '/Volumes/diskAshur2/Final transfer out data and code to WWC Jan 2020/Data for model/Use'
os.chdir(hard_drive_dir)


import pandas as pd


def _reload_final_frame(csv_name):
    """Read a saved final dataset (first column as index), print its shape, and return it with a fresh 0..n-1 index."""
    frame = pd.read_csv(csv_name, index_col = 0)
    print(frame.shape)
    frame = frame.reset_index(drop = True)
    print(frame.index)
    return frame


X_tr_resampled = _reload_final_frame("Final/X_train_{}_final.csv".format(file_stub))

X_test_no_na = _reload_final_frame("Final/X_test_{}_final.csv".format(file_stub))


# Drop key columns (after data is saved as still need the keys for merging in 7a_Combine_and_Split_Data and 8_Fairness)
print(X_tr_resampled.shape)
print(X_test_no_na.shape)

# Same three key columns as in the cells that set the keys aside before imputation;
# 'ReferralDatetime_previous' was missing from this list. errors='ignore' covers absent columns.
key_cols = ['PSID', 'ReferralDatetime', 'ReferralDatetime_previous']
X_tr_resampled = X_tr_resampled.drop(columns = key_cols, errors = 'ignore')
X_test_no_na = X_test_no_na.drop(columns = key_cols, errors = 'ignore')
print(X_tr_resampled.shape)
print(X_test_no_na.shape)

# Keep numeric columns only
X_tr_resampled = X_tr_resampled.select_dtypes(include = 'number')
X_test_no_na = X_test_no_na.select_dtypes(include = 'number')
print(X_tr_resampled.shape)
print(X_test_no_na.shape)

In [None]:
import numpy as np
from scipy import stats

# Summary statistics for each dataset (after it's been transformed)
# Just needs to be one algorithm - could be any because we're just using the preprocessing step
# Variable-label lookup, restricted to the rows where 'LA' equals 2
all_feature_labels = pd.read_csv('{}/Graphs Analysis - All variables.csv'.format(summary_info))
feature_labels = all_feature_labels.loc[all_feature_labels['LA'] == 2]

# Descriptive statistics of the training features, rounded to 2 decimal places
X_tr_resampled_summary = X_tr_resampled.describe().round(2)

# Variable name -> name used in the report
rename_cols_dict = dict(zip(feature_labels['Variable Name'], feature_labels['Name of variable in the report']))
# Features without a report name (printed for information only; the strict check is disabled)
print(set(X_tr_resampled_summary.columns) - set(rename_cols_dict.keys()))
#assert (len(set(X_tr_resampled_summary.columns).difference(set(rename_cols_dict.keys()))) == 0)

X_tr_resampled_summary = X_tr_resampled_summary.rename(columns = rename_cols_dict)
X_tr_resampled_summary.to_csv('{}/Sample Sizes/Summary statistics for training data {}.csv'.format(local_dir, file_stub))

# Also report the number of features
number_features_no_na = X_tr_resampled.shape[1]

# Balance before and after sampling: class shares rounded to 4 decimal places
balance_before = y_tr.value_counts(normalize=True).round(4)
balance_after = y_tr_resampled.value_counts(normalize=True).round(4)
class_balance_after_resampling = pd.DataFrame(data = {'Class balance before resampling - training data': balance_before,
                     'Class balance after resampling - training data': balance_after})
class_balance_after_resampling.to_csv('{}/Sample Sizes/Class imbalance after resampling {}.csv'.format(local_dir, file_stub))

In [None]:
## What's happening inside the cv folds?
from statsmodels.api import OLS
import analysis_functions

# For every cross-validation split of the resampled training data, record:
#   * how many times each sibling group occurs in the train and test folds,
#   * the class counts in each fold,
#   * the p-value of a t-test comparing the outcome in the two folds,
#   * the p-value of the F-test from regressing a train/test indicator on the
#     features (all features, and again without the 'count'/'mean' columns).
#
# cross_val.split() yields *positional* row numbers, so rows are always
# selected by position below, never by index label.
def _take_rows(data, positions):
    """Select rows by position: ``.iloc`` for pandas objects, plain indexing otherwise."""
    return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]


sibling_fold_tr, sibling_fold_test, class_imb_tr, class_imb_test, ttest_pvalue_dict, ftest_pvalue_dict, ftest_pvalue_dict_no_prev = {}, {}, {}, {}, {}, {}, {}

# Columns whose name contains 'count' or 'mean' (the "previous" features).
# They do not depend on the split, so they are identified once.
count_cols = [col for col in X_tr_resampled.columns if 'count' in col]
mean_cols = [col for col in X_tr_resampled.columns if 'mean' in col]
previous_cols = count_cols + mean_cols

split_no = 0
for train_idx, test_idx in cross_val.split(X_tr_resampled, y_tr_resampled):

    split_no += 1
    split = 'split ' + str(split_no)

    # Either the same child or the same sibling group
    # (siblings_tr because we're cross validating within the training dataset)
    sibling_fold_tr[split] = _take_rows(siblings_tr_resampled, train_idx).value_counts().value_counts()
    sibling_fold_test[split] = _take_rows(siblings_tr_resampled, test_idx).value_counts().value_counts()

    # Class balance (y_tr because we're cross validating within the training dataset)
    y_fold_tr = _take_rows(y_tr_resampled, train_idx)
    y_fold_test = _take_rows(y_tr_resampled, test_idx)
    class_imb_tr[split] = y_fold_tr.value_counts()
    class_imb_test[split] = y_fold_test.value_counts()

    # Does test and training class balance look similar in cv?
    ttest = stats.ttest_ind(y_fold_tr, y_fold_test)
    pvalue = round(ttest[1], 2)
    ttest_pvalue_dict[split] = pvalue

    print(pvalue)

    # Do test and training datasets look similar in cv?
    # Only rows that belong to this split are compared: a splitter need not use
    # every row in every split (e.g. a time-series split leaves the later rows
    # out), and those unused rows must not be counted as test rows.
    fold_positions = np.concatenate([train_idx, test_idx])
    joint_orth_test_data = X_tr_resampled.iloc[fold_positions].copy()
    joint_orth_test_data['Train'] = np.concatenate([np.ones(len(train_idx), dtype=int),
                                                    np.zeros(len(test_idx), dtype=int)])
    results = OLS(joint_orth_test_data['Train'], joint_orth_test_data.drop(columns = 'Train')).fit()
    ftest_pvalue_dict[split] = round(results.f_pvalue,2)

    print(results.f_pvalue)

    # Do the test and training datasets look similar in cv excluding the previous columns?
    joint_orth_test_data_no_previous = joint_orth_test_data.drop(columns = previous_cols)
    results_no_prev = OLS(joint_orth_test_data_no_previous['Train'], joint_orth_test_data_no_previous.drop(columns = 'Train')).fit()
    ftest_pvalue_dict_no_prev[split] = round(results_no_prev.f_pvalue,2)

    print(results_no_prev.f_pvalue)
    


In [None]:
# Turn the per-split dictionaries into dataframes (one column per split,
# prefixed with the fold they describe) and write each table to csv.

# Sibling-group occurrences
sibling_fold_tr_df = pd.DataFrame(sibling_fold_tr).add_prefix('Train ')
sibling_fold_test_df = pd.DataFrame(sibling_fold_test).add_prefix('Test ')
sibling_fold_df = pd.concat([sibling_fold_tr_df, sibling_fold_test_df], axis=1)
sibling_fold_df.to_csv(
    '{}/Sample Sizes/Number of occurences of the same sibling group in cv folds {}.csv'.format(local_dir, file_stub)
)

# Class counts
class_imb_tr_df = pd.DataFrame(class_imb_tr).add_prefix('Train ')
class_imb_test_df = pd.DataFrame(class_imb_test).add_prefix('Test ')
class_imb_df = pd.concat([class_imb_tr_df, class_imb_test_df], axis=1)
class_imb_df.to_csv(
    '{}/Sample Sizes/Class imbalance in cv folds {}.csv'.format(local_dir, file_stub)
)

# p-values: a single row, one column per split and test
ttest_pvalue_dict_df = pd.DataFrame([ttest_pvalue_dict]).add_prefix('T-test p-value class imbalance ')
ftest_pvalue_dict_df = pd.DataFrame([ftest_pvalue_dict]).add_prefix('F-test p-value for joint orthonality of features ')
tests_pvalue_dict_df = pd.concat([ttest_pvalue_dict_df, ftest_pvalue_dict_df], axis=1)
tests_pvalue_dict_df.to_csv(
    '{}/Sample Sizes/T-test and F-test for similarity of cv folds {}.csv'.format(local_dir, file_stub)
)

In [None]:
# Do the folds still look different once the 'count'/'mean' (previous) columns are excluded?
# ftest_pvalue_dict_no_prev holds one F-test p-value per split; it becomes a one-row dataframe.
ftest_pvalue_dict_no_prev_df = pd.DataFrame([ftest_pvalue_dict_no_prev])
ftest_pvalue_dict_no_prev_df.to_csv('{}/Sample Sizes/F-test for similarity of cv folds on previous {}.csv'.format(local_dir, file_stub))
# Last expression of the cell, so the table is displayed.
ftest_pvalue_dict_no_prev_df

In [None]:
# Deliberate stop: this assertion always fails, so "Run All" halts at this cell.
# NOTE(review): presumably the cells below are meant to be run by hand - confirm.
assert 1==2

In [None]:
# Sanity check: print the model id, then the columns whose name contains
# 'AssessmentType' in X_tr and in X_test_no_na so the two lists can be compared.
print(file_stub)
print([col for col in X_tr.columns if 'AssessmentType' in col])
print([col for col in X_test_no_na.columns if 'AssessmentType' in col])

In [None]:
# Feature importance to check for data leakage
#https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html

# Works fine for the _all data too for LAs where we take the data offsite before the analysis
# because the columns already have names rather than being created in the randomisedsearchcv process
import eli5
from eli5.sklearn import PermutationImportance
import operator

# Name of the classifier step inside each fitted estimator
model_dict = {'decision_tree': 'dtc',
            'logistic_regression': 'lr',
            'gradient_boosting': 'gbc'}

# Step names of the algorithms chosen in the set-up cell
model_list = [model_dict[algorithm] for algorithm in algorithm_names]


# {algorithm name: [(column name, permutation importance), ...] sorted ascending}
feature_importance_dict_dict = {}

# Only the algorithms that were requested are analysed; looping over every
# entry of model_dict would fail for an algorithm that was never fitted.
for model_name in algorithm_names:
    if model_name not in best_estimator_dict_new:
        print('Skipping {}: no fitted estimator available'.format(model_name))
        continue

    model_short_name = model_dict[model_name]
    # The classifier step is pulled out of the fitted estimator
    fitted_model = best_estimator_dict_new[model_name][model_short_name]
    perm = PermutationImportance(fitted_model, cv = cross_val, scoring = 'average_precision').fit(X_tr_resampled, y_tr_resampled)
    importance_by_column = dict(zip(X_tr_resampled.columns, perm.feature_importances_))
    feature_importance_dict = sorted(importance_by_column.items(), key=operator.itemgetter(1))
    feature_importance_dict_dict[model_name] = feature_importance_dict


# Features whose absolute importance exceeds 0.01 are printed for inspection
for key, value in feature_importance_dict_dict.items():
    print('Algorithm: ', key)
    print('High feature importance: ', [v for v in value if abs(v[1]) > 0.01])

In [None]:
import pandas as pd
# One long table of permutation importances: a frame per algorithm (column name,
# importance, algorithm), stacked row-wise and written to csv.
df_feature_imp_list = [
    pd.DataFrame(importances, columns=['column_name', 'feature_importance']).assign(algorithm=algorithm)
    for algorithm, importances in feature_importance_dict_dict.items()
]

df_feature_imp = pd.concat(df_feature_imp_list, axis=0)
df_feature_imp.to_csv('{}/Models/feature_importance_{}.csv'.format(local_dir, file_stub), index=False)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Reload the saved permutation importances
df_feature_imp = pd.read_csv('{}/Models/feature_importance_{}.csv'.format(local_dir, file_stub))

# Wording used for the cross-validation scheme (left untouched for any other cv value)
_cv_wording = {'ts': 'predicting the future',
               'ss': 'predicting contemporaneously'}
if cv in _cv_wording:
    cv_for_graph = _cv_wording[cv]

# Rank every feature by the absolute size of its importance, largest first
df_feature_imp = (
    df_feature_imp
    .assign(feature_importance=df_feature_imp['feature_importance'].abs())
    .sort_values(by='feature_importance', ascending=False)
)

# Keep the best algorithm's features that have a non-zero importance
df_feature_imp_best_algorithm = (
    df_feature_imp[df_feature_imp['algorithm'] == best_algorithm]
    .reset_index(drop=True)
)
df_feature_imp_best_algorithm = df_feature_imp_best_algorithm[
    df_feature_imp_best_algorithm['feature_importance'] > 0
]

# The 20 most important features (rows are already in descending order)
_top_features = df_feature_imp_best_algorithm.head(20)
feature_names = _top_features['column_name']
feature_importances = _top_features['feature_importance'].round(4)


In [None]:
# Rename features
import numpy as np
import re

# Attach the report label to each of the top features.
# NOTE(review): the lookup table is not restricted to one 'LA' here, so a column
# listed more than once keeps whichever label comes first - confirm that is intended.
feature_labels = pd.read_csv('{}/Graphs Analysis - All variables.csv'.format(summary_info))
_label_lookup = feature_labels[['Variable Name', 'Name of variable in the report']]
feature_names_df = (
    pd.DataFrame(feature_names)
    .merge(_label_lookup, how='left', left_on='column_name', right_on='Variable Name')
    .drop_duplicates(subset='column_name')
)

# List the features without a label, then stop if there are any
_label_missing = feature_names_df['Name of variable in the report'].isna()
print(feature_names_df.loc[_label_missing, 'column_name'].unique())
assert _label_missing.sum() == 0



In [None]:
# Build the display labels. When every feature already has a report label those
# labels are used as they are; otherwise the gaps are filled with a label derived
# from the raw column name.
_report_labels = feature_names_df['Name of variable in the report']

if _report_labels.isna().sum() == 0:
    feature_names_df['Name of variable in the report_all'] = _report_labels
    feature_names_list = pd.Series(_report_labels)
else:
    # Lower-case fragments that get a space put in front of them
    word_list = ['date', 'time', 'source', 'code', 'start', 'social', 'work',
                 'assessment', 'completion', 'days', 'referral', 'care', 'reason',
                'legal', 'status', 'need', 'abuse', 'category', 'of', 'cp', 'length',
                'close', 'contact']

    def _auto_label(name):
        """Derive a readable label from a raw column name."""
        label = name.replace('previous_exc_current_sum', 'Total Number of Previous')
        label = label.replace('previous_exc_current_mean', 'Average Number of Previous')
        # Title-case first, then swap the separators for readable punctuation
        label = label.title().replace('_', ': ').replace('.', ' ')
        for w in word_list:
            label = re.sub('{}'.format(w), ' {}'.format(w), label)
        return label

    feature_names_df['Name of variable in the report_all'] = _report_labels.fillna(feature_names_df['column_name'])
    feature_names_list = pd.Series(
        [_auto_label(n) for n in feature_names_df['Name of variable in the report_all']]
    )

## Restart here

In [None]:
### Bring in source and base
years_sample_sizes = pd.read_csv("{}/Years and Sample Sizes.csv".format(summary_info))

# Wording for the cross-validation scheme and the data type
dict_names = {'ss': 'learning from all cases',
             'ts': 'learning just from earlier cases',
             'str': 'structured data only',
             'all': 'structured and text data'}

# Rows for this local authority, research question and cross-validation scheme;
# the first matching row supplies all three values.
_this_model = (
    (years_sample_sizes['Local authority'] == 'LA2')
    & (years_sample_sizes['Research question'] == rq)
    & (years_sample_sizes['Cross-Validation'] == cv)
)
outcome, years, sample_sizes = (
    years_sample_sizes.loc[_this_model, column].values[0]
    for column in ('Shortened outcome', 'Years', 'Sample size')
)

# Footer lines shown underneath the graphs
txt_source = 'Prediction: {}'.format(outcome)
txt_model_desc = 'Model: {}, {}'.format(dict_names[cv], dict_names[data_type])
txt_base = 'Data: {}, {}, N = {}'.format('Local authority 2', years, sample_sizes)
for _footer_line in (txt_source, txt_model_desc, txt_base):
    print(_footer_line)

In [None]:
from ale import ale_plot
# ALE plot for each top feature of the best algorithm that takes more than 5 distinct values
fitted_model = best_estimator_dict_new[best_algorithm]

_distinct_values = X_tr_resampled.nunique()
columns_with_mt_5_values = _distinct_values[_distinct_values > 5].index

columns_for_ale = list(set(columns_with_mt_5_values).intersection(set(feature_names)))

# Footer text passed on to every plot
txt = [txt_source, txt_model_desc, txt_base]

for col in columns_for_ale:
    _is_this_column = feature_names_df['column_name'] == col
    col_label = feature_names_df.loc[_is_this_column, 'Name of variable in the report_all'].values[0]
    print(col)
    # '/' is stripped from the column name before it is used in the file name
    col_name = col.replace('/', '')
    _plot_path = '{}/Graphs/ALE_{}_{}.png'.format(local_dir, file_stub, col_name)
    ale_plot(fitted_model, X_tr_resampled, features = col, feature_name = col_label,
             outcome = outcome, txt = txt, file_name = _plot_path, monte_carlo=True)


In [None]:
#assert 1==2

In [None]:
# Checking other metrics - AUC p/r on test data
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score, confusion_matrix, precision_score, recall_score, fbeta_score

# Score every requested algorithm on the held-out test data.
# An algorithm is skipped (with a message naming the missing key) when one of the
# dictionaries looked up below has no entry for it.
scores_list = []
for a in algorithm_names:
    try:
        # A loop-local name is used so the notebook-wide `fitted_model`
        # (the best algorithm's estimator) is not overwritten by this loop.
        estimator = best_estimator_dict_new[a]
        if data_type == 'str':
            y_pred = estimator.predict(X_test_no_na)
            y_pred_proba = estimator.predict_proba(X_test_no_na)
        else:
            X_test_no_na = X_test_no_na[X_tr_resampled.columns] # columns have to be in the same order
            # Preprocess first, then predict with the classifier step
            X_test_no_na_transformed = estimator['preprocessor'].transform(X_test_no_na)
            classifier = estimator[model_dict[a]]
            y_pred = classifier.predict(X_test_no_na_transformed)
            y_pred_proba = classifier.predict_proba(X_test_no_na_transformed)

        # Column 1 of predict_proba is the probability of the positive class
        scores = pd.Series({'Algorithm': a,
                            'Proportion negative class value': round(y_test.value_counts(normalize=True)[0], 2),
                            'Accuracy': round(accuracy_score(y_test, y_pred), 2),
                            'Maximum lower bound (training)': round(max_min_mean_AP_dict[a],2), # ADDED_AUGUST
                            'Mean average precision (training)': round(mean_AP_dict[a],2),
                            'Std average precision (training)': round(sd_AP_dict[a],2),
                            'Average precision': round(average_precision_score(y_test, y_pred_proba[:,1]), 2), # CORRECTED
                            'AUC': round(roc_auc_score(y_test, y_pred_proba[:,1]), 2),
                            'F score (beta = 0.1)': round(fbeta_score(y_test, y_pred, beta = 0.1), 2),
                            'Precision': round(precision_score(y_test, y_pred), 2), # CORRECTED
                            'Recall': round(recall_score(y_test, y_pred),2)})
        scores_list.append(pd.DataFrame(scores))
    except KeyError as missing_key:
        # Still best-effort, but no longer silent
        print('Skipping {}: missing key {}'.format(a, missing_key))

if not scores_list:
    raise ValueError('No scores were computed: none of {} could be scored'.format(algorithm_names))

# One column per algorithm
scores_multiple = pd.concat(scores_list, axis = 1)
print("Scores: ", scores_multiple)
scores_multiple.to_csv('{}/Models/Scores/scores_{}.csv'.format(local_dir, file_stub))

In [None]:
#assert 1==2

In [None]:
## 
from inspect import signature
import matplotlib.pyplot as plt
import numpy as np
import random 
from sklearn.metrics import precision_recall_curve
from textwrap import wrap

# Display names of the algorithms
algorithm_names_full = {'decision_tree': 'Decision tree',
                        'logistic_regression': 'Logistic regression',
                        'gradient_boosting': 'Gradient boosting'}

# Wording for the cross-validation scheme (also used in the file name of the graph)
if cv == 'ts':
    cv_for_graph = 'learning just from earlier cases'
elif cv == 'ss':
    cv_for_graph = 'learning from all cases'

# Wording for the data type
if data_type == 'str':
    data_desc = 'structured data only'
elif data_type == 'all':
    data_desc = 'structured and text data'

# Letter identifying the combination of cross-validation scheme and data type
if cv == 'ss' and data_type == 'str':
    model_letter = 'a'
elif cv == 'ts' and data_type == 'str':
    model_letter = 'b'
elif cv == 'ss' and data_type == 'all':
    model_letter = 'c'
elif cv == 'ts' and data_type == 'all':
    model_letter = 'd'

# Best estimator - test
# The estimator is selected here explicitly: relying on whatever `fitted_model`
# an earlier cell left behind can plot a different algorithm than the best one.
fitted_model = best_estimator_dict_new[best_algorithm]
y_pred_proba = fitted_model.predict_proba(X_test_no_na)
# Column 1 of predict_proba is the probability of the positive class
average_precision_test = round(average_precision_score(y_test, y_pred_proba[:,1]), 2)
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba[:,1])
# Only pass step='post' when this matplotlib's fill_between accepts it
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})

fig, ax = plt.subplots()

ax.step(recall, precision, color='#ff7057', alpha=0.2,
         where='post')
ax.fill_between(recall, precision, alpha=0.2, color='#ff7057', **step_kwargs)
ax.tick_params(axis='both', which='both', length=0, colors='#4d4d51')

plt.xlabel('Recall', fontname="Arial", color='#4d4d51', fontsize=12)
plt.ylabel('Precision', fontname="Arial", color='#4d4d51', fontsize=12)
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
ax.xaxis.grid(False)
ax.yaxis.grid(color='#d0dde1')
title = '\n'.join(wrap('Curve plotting the decrease in precision and the corresponding increase in recall as the threshold for "at risk" cases decreases', 60))
ax.set_title(title, fontname="Arial", color = '#4d4d51', fontsize=12, loc='left')
# No frame around the plot
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
plt.suptitle('PRECISION RECALL CURVE', fontname="Arial", color = '#ff7057', fontsize=12, x=0.31, y=1.05)

# Footer: prediction, model description and data base, placed below the axes
plt.figtext(0, -0.05, txt_source, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)
plt.figtext(0, -0.1, txt_model_desc, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)
plt.figtext(0, -0.15, txt_base, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)

plt.savefig('{}/Graphs/Precision Recall {} ({}).png'.format(local_dir, file_stub, cv_for_graph), transparent=False, dpi=80, bbox_inches="tight")
plt.show()

In [None]:
# % of risky cases in top 10%
fitted_model = best_estimator_dict_new[best_algorithm]
y_pred_proba = fitted_model.predict_proba(X_test_no_na)
# prob[1] is probability of the positive class
y_pred_prob_1 = y_pred_proba[:,1]
y_pred_prob_1 = pd.Series(y_pred_prob_1, index = y_test.index) # get index
y_pred_prob_1 = y_pred_prob_1.sort_values(ascending = False) # from highest to lowest
tenpc = int(len(y_pred_prob_1)*0.1) # 10% through the sample
top10pc_pred_index = y_pred_prob_1[0:tenpc].index # indices of top 10%
test_postive_class_index = y_test[y_test == 1].index # indices of test data = positive class
# Of the positive class cases in the test data, how many are in the top 10% of prediction probabilities?
pc_positive_class_top10_pred = len(set(top10pc_pred_index).intersection(test_postive_class_index)) / len(test_postive_class_index) * 100

# % of safe cases in bottom 10%
bottom10pc_pred_index = y_pred_prob_1[len(y_pred_prob_1) - tenpc:].index # indices of bottom 10%
test_negative_class_index = y_test[y_test == 0].index # indices of test data = negative class
# Of the negative class cases in the test data, how many are in the bottom 10% of prediction probabilities?
# (the variable name says "positive", but the value is the share of NEGATIVE class cases)
pc_positive_class_bottom10_pred = len(set(bottom10pc_pred_index).intersection(test_negative_class_index)) / len(test_negative_class_index) * 100

intuitive_metrics = pd.Series({"% of risky cases in top 10%": round(pc_positive_class_top10_pred, 0),
                              "% of safe cases in bottom 10%": round(pc_positive_class_bottom10_pred, 0)})

# Number of false negatives and false positives in 1000 cases
from sklearn.metrics import confusion_matrix
y_pred = fitted_model.predict(X_test_no_na)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Confusion-matrix cells rescaled to counts per 1000 test cases
confusion_per_1000 = pd.Series({"Number of true positives in 1000 cases": round(tp/len(y_test)*1000, 0),
                            "Number of true negatives in 1000 cases": round(tn/len(y_test)*1000, 0),
                          "Number of false positives in 1000 cases": round(fp/len(y_test)*1000, 0),
                            "Number of false negatives in 1000 cases": round(fn/len(y_test)*1000, 0)})
# pd.concat instead of Series.append, which no longer exists in pandas 2.0+
intuitive_metrics = pd.concat([intuitive_metrics, confusion_per_1000])
print(intuitive_metrics)
intuitive_metrics.to_csv('{}/Models/Scores/Intuitive metrics {}.csv'.format(local_dir, file_stub))

In [None]:
# Density plots of predictions
import seaborn as sns

# Kernel density of the predicted positive-class probability, one curve per true class
_positive_class_proba = y_pred_proba[:, 1]
ax = sns.kdeplot(_positive_class_proba[y_test == 0], label='Not at risk', color='#ff7057')
sns.kdeplot(_positive_class_proba[y_test == 1], label='At risk', color='#4d4d51')

# Horizontal grid lines only
ax.xaxis.grid(False)
ax.yaxis.grid(color='#d0dde1')

title = '\n'.join(wrap('Distribution of predicted probability of {}'.format(outcome), 60))
ax.set_title(title, fontname="Arial", color = '#4d4d51', fontsize=12, loc='left')
# No frame around the plot
for _side in ('top', 'right', 'bottom', 'left'):
    ax.spines[_side].set_visible(False)
ax.tick_params(axis='both', which='both', length=0, colors='#4d4d51')

_label_style = dict(fontname="Arial", color='#4d4d51', fontsize=12)
plt.xlabel('Probability', **_label_style)
plt.ylabel('Density', **_label_style)
plt.xlim([0.0, 1.05])
plt.suptitle('KERNEL DENSITY PLOT', fontname="Arial", color = '#ff7057', fontsize=12, x=0.28, y=1.01)
legend = plt.legend()
plt.setp(legend.get_texts(), color='#4d4d51')

# Footer: prediction, model description and data base, placed below the axes
for _y_offset, _footer_line in zip((-0.05, -0.1, -0.15), (txt_source, txt_model_desc, txt_base)):
    plt.figtext(0, _y_offset, _footer_line, wrap=True, **_label_style)

plt.savefig('{}/Graphs/Kernel Density Plot {}.png'.format(local_dir, file_stub), transparent=False, dpi=80, bbox_inches="tight")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from textwrap import wrap

def plot_learning_curve(estimator, title, subtitle, X, y, axes=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5), scoring = None):
    """
    Plot the training and cross-validation learning curve and save it as a PDF.

    The mean score over the cross-validation splits is drawn for each training
    set size, with a band of +/- one standard deviation around it.

    Besides its arguments the function reads these notebook-level names:
    ``txt_source``, ``txt_model_desc`` and ``txt_base`` (footer lines under the
    plot) and ``local_dir``, ``file_stub`` and ``levers`` (output file name).

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        Passed to :func:`sklearn.model_selection.learning_curve`.

    title : string
        Title for the chart (left-aligned, above the axes).

    subtitle : string
        Heading placed above the title via the figure's suptitle.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    axes : matplotlib Axes or array of Axes, optional (default=None)
        Axes to draw on. When an array is given its first element is used.
        When None, a new figure with a single Axes is created.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy; passed to
        :func:`sklearn.model_selection.learning_curve` unchanged.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel; passed to
        :func:`sklearn.model_selection.learning_curve` unchanged.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve; passed to
        :func:`sklearn.model_selection.learning_curve` unchanged.
        (default: np.linspace(0.1, 1.0, 5))

    scoring : string, callable or None, optional (default=None)
        Scoring passed to :func:`sklearn.model_selection.learning_curve`.
        Note that the y axis is always labelled "Average Precision".

    Returns
    -------
    train_sizes : array
        Numbers of training examples actually used for each point of the curve.

    train_scores_mean : array
        Mean training score over the splits for each training set size.

    test_scores_mean : array
        Mean cross-validation score over the splits for each training set size.
    """
    # Imported locally under an alias so the function keeps working when the
    # notebook-level name ``learning_curve`` has been rebound to something else
    # (the cell calling this function stores its result under that name).
    from sklearn.model_selection import learning_curve as _sk_learning_curve

    # Resolve the Axes to draw on; previously `ax` was only defined when axes was None.
    if axes is None:
        _, ax = plt.subplots()
    elif hasattr(axes, 'plot'):
        ax = axes
    else:
        ax = np.ravel(axes)[0]
    # All figure-level calls below go to the figure that owns `ax`,
    # not to whichever figure happens to be current.
    fig = ax.figure

    ax.set_title(title, fontname="Arial", color = '#4d4d51', fontsize=12, loc='left')
    ax.set_xlabel("Number of observations", fontname="Arial", color = '#4d4d51', fontsize=12)
    ax.set_ylabel("Average Precision", fontname="Arial", color = '#4d4d51', fontsize=12)

    # Scores have one row per training set size and one column per split
    train_sizes, train_scores, test_scores = \
        _sk_learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                           train_sizes=train_sizes, scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Plot learning curve
    ax.yaxis.grid(color="#d0dde1")
    ax.xaxis.grid(False)
    ax.set_ylim(0, 1.1)
    ax.set_yticks(np.arange(0,1.2,0.2))
    ax.tick_params(axis='both', which='both', length=0, colors='#4d4d51')
    # Bands of +/- one standard deviation around each mean curve
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color='#ff7057')
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color='#4d4d51')
    ax.plot(train_sizes, train_scores_mean, '^-', color='#ff7057',
                 label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color='#4d4d51',
                 label="Cross-validation score")

    # No frame around the plot
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    fig.suptitle(subtitle, fontname="Arial", color = '#ff7057', fontsize=12, x=0.25, y=1.03)

    legend = ax.legend(loc="best")
    plt.setp(legend.get_texts(), color='#4d4d51')

    # Footer: prediction, model description and data base, placed below the axes
    fig.text(0, -0.05, txt_source, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)
    fig.text(0, -0.1, txt_model_desc, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)
    fig.text(0, -0.15, txt_base, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)

    fig.savefig('{}/Graphs/Learning Curves {} {}.pdf'.format(local_dir, file_stub, levers), transparent=False, dpi=80, bbox_inches="tight")
    return train_sizes, train_scores_mean, test_scores_mean


# Chart title, wrapped at 60 characters, and the heading shown above it
title = '\n'.join(wrap('The effect of increasing the number of observations on model performance', 60))
subtitle = 'LEARNING CURVE'

# Learning curve of the best algorithm's estimator on the resampled training
# data, using the notebook's cross_val splitter and average precision as score.
# NOTE: the returned tuple (train sizes, mean train scores, mean test scores) is
# stored under the name `learning_curve`, which rebinds the sklearn function of
# the same name imported at the top of this cell.
fitted_model = best_estimator_dict_new[best_algorithm] 
learning_curve = plot_learning_curve(fitted_model, title, subtitle, X_tr_resampled, y_tr_resampled,
                    cv=cross_val, n_jobs=1, scoring = 'average_precision')

In [None]:
learning_curve

In [None]:
# Export learning curve data: one row per training set size, scores rounded
# to 2 decimal places, tagged with the model id and the local authority.
_sizes, _train_ap, _test_ap = learning_curve[0], learning_curve[1], learning_curve[2]
learning_curve_data = (
    pd.DataFrame({'Number of observations': _sizes,
                  'Average precision - train': _train_ap,
                  'Average precision - test': _test_ap})
    .round({'Average precision - train': 2, 'Average precision - test': 2})
    .assign(model_id=file_stub, LA='LA2')
)
learning_curve_data.to_csv('{}/Sample Sizes/Learning curve data {}.csv'.format(local_dir, file_stub), index = False)
learning_curve_data