## This covers:

- Hyperparameters and training time
- Feature importance plots

In [None]:
import os

# Show the current working directory: the relative paths used in later cells
# (e.g. "../Summary information FINAL/...") are resolved against it.
os.getcwd()

In [None]:
# Variable set-up
# These settings select which local authority (LA) and which model run the
# cells below summarise.
LA = 'LA2' # Options: 'LA1', 'LA2', 'LA3', 'LA4'
rq = 'rq1' # Options: 'rq1', 'rq2'
cv = 'ss' # Options: 'ts' (time series split, ignore siblings), 'ss' (stratified shuffle, ignore siblings)
data_type = 'str' # Options: 'str' (just structured data), 'all' (structured data and list of strings)


file_stub = rq + '_' + cv + '_' + data_type # Creates file stub for saving in the format e.g. rq1_ss_str

# LA -> (folder the model outputs are read from, folder this notebook writes to)
la_folders = {
    'LA1': ('../LA1 Results July 2020', 'LA1'),
    'LA2': ('../Final transfer out data and code to WWC Jan 2020', 'LA2'),
    'LA3': ('../Anonymised structured data', 'LA3'),
    'LA4': ('../LA4 August 2020', 'LA4'),
}

# Fail loudly on an unknown LA: otherwise the folders would stay undefined or,
# worse, keep the values left behind by a previously run cell.
if LA not in la_folders:
    raise ValueError("Unknown LA {!r}; expected one of {}".format(LA, sorted(la_folders)))

input_folder, output_folder = la_folders[LA]

In [None]:
# Read in years and sample sizes
# Later cells filter this table on 'Local authority', 'Research question' and
# 'Cross-Validation' and read its 'Shortened outcome', 'Years' and 'Sample size' columns.
import pandas as pd
years_sample_sizes = pd.read_csv("../Summary information FINAL/Years and Sample Sizes.csv")

In [None]:
# Which is the best algorithm
# Sets best_algorithm and, when a pickled estimator is available, also
# best_estimator_maxmin (the fitted pipeline) and best_params (the names of the
# pipeline parameters belonging to the winning estimator step).
import pickle

# (pipeline step name, algorithm name as used in the model_output file names),
# checked in this order
step_to_algorithm = (
    ('dtc', 'decision_tree'),
    ('lr', 'logistic_regression'),
    ('gbc', 'gradient_boosting'),
)

if LA == 'LA1':
    # NOTE: no estimator is loaded for LA1, so best_estimator_maxmin and
    # best_params are not defined by this cell in that case.
    best_algorithm = 'gradient_boosting'

else:
    model_path = "{}/Models/best_estimator_{}_maxmin.pkl".format(input_folder, file_stub)
    # NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
    # The context manager closes the file handle once the estimator is loaded.
    with open(model_path, 'rb') as model_file:
        best_estimator_maxmin = pickle.load(model_file)

    for step_name, algorithm_name in step_to_algorithm:
        if step_name in best_estimator_maxmin.named_steps.keys():
            best_algorithm = algorithm_name
            best_params = [p for p in best_estimator_maxmin.get_params() if step_name + '__' in p]
            break
    else:
        # Without this, best_algorithm would be undefined or left over from an earlier run
        raise ValueError("No known estimator step (dtc, lr, gbc) found in {}".format(model_path))

    print(best_algorithm)
    print(LA)
    print(file_stub)

In [None]:
# Always fails on purpose - presumably a guard so that "Run All" stops here and the
# sections below are run by hand (TODO confirm).
assert 1==2

## Hyperparameters

In [None]:
#%%script false --no-raise-error
## Mean and SD average precision in training
# Use model_output csvs
# Already available for LA1, LA2, LA3 and available for LA4 after final extract

import glob
import os
import pandas as pd

file_list = glob.glob("{}/Models/model_output_{}_*.csv".format(input_folder, file_stub))
file_prefix = "model_output_{}_".format(file_stub)

# Results table per algorithm, keyed by the algorithm name taken from the file name
file_dict = {}
for file_name in file_list:
    file = pd.read_csv(file_name, index_col=0)
    # Work on the base name only: glob can return OS-specific path separators,
    # so removing "<input_folder>/Models/" by string replacement is not portable.
    file_name = os.path.basename(file_name).replace(file_prefix, "")
    if LA == 'LA2':
        # LA2 file names carry an extra '_50_2' after the algorithm name
        file_name = file_name.replace('_50_2', '')
    file_name = file_name.replace(".csv", "")
    file_dict[file_name] = file

print(file_dict.keys())

# All three algorithms are needed for the combined table; report clearly which are missing
expected_algorithms = ['decision_tree', 'gradient_boosting', 'logistic_regression']
missing_algorithms = [name for name in expected_algorithms if name not in file_dict]
if missing_algorithms:
    raise KeyError("No model_output file found for: {}".format(", ".join(missing_algorithms)))

model_output = pd.concat([file_dict[name] for name in expected_algorithms], axis = 0, ignore_index = True)
print(model_output.shape)

In [None]:
#%%script false --no-raise-error
# Range of hyperparameters and the best ones
# One column per algorithm/hyperparameter pair; rows hold the smallest and largest
# value found in the results table and the value reported as best.
hyperparameters_df = pd.DataFrame(data = {}, index = ['Minimum', 'Maximum', 'Best'])
for file_name, results in file_dict.items():
    for col in [c for c in results if 'param_' in c]:
        param_name = str(col).replace('param_', '')
        col_name = str(file_name) + '_' + param_name
        print(col_name)

        tried_values = results[col]
        hyperparameters_df.at['Minimum', col_name] = tried_values.min()
        hyperparameters_df.at['Maximum', col_name] = tried_values.max()

        if file_name != best_algorithm:
            # Other two algorithms: value from the row ranked first on test score
            is_top_ranked = results['rank_test_score'] == 1
            best_value = results.loc[is_top_ranked, col].values[0]
        else:
            # Best algorithm: value actually used by the final fitted model
            best_value = best_estimator_maxmin.get_params()[param_name]
        hyperparameters_df.at['Best', col_name] = best_value


In [None]:
#%%script false --no-raise-error
# Check whether there's any columns not covered (perhaps where we keep the text columns separate)
# Every column name the renaming step knows about, grouped by algorithm.
parameters_all_text = {
    # Decision tree
    'decision_tree_dtc__max_depth',
    'decision_tree_dtc__max_features',
    'decision_tree_dtc__min_samples_split',
    'decision_tree_preprocessor__All_text__tfidf__max_features',
    'decision_tree_preprocessor__All_text__tf_lda__lda__max_iter',
    'decision_tree_preprocessor__All_text__tf_lda__lda__n_components',
    'decision_tree_preprocessor__All_text__tfidf_lda__tfidf__max_features',
    'decision_tree_preprocessor__All_text__tf_lda__tf__max_df',
    'decision_tree_preprocessor__All_text__tfidf__max_df',
    # Logistic regression
    'logistic_regression_lr__C',
    'logistic_regression_lr__penalty',
    'logistic_regression_preprocessor__All_text__tfidf__max_features',
    'logistic_regression_preprocessor__All_text__tf_lda__lda__max_iter',
    'logistic_regression_preprocessor__All_text__tf_lda__lda__n_components',
    'logistic_regression_preprocessor__All_text__tfidf_lda__tfidf__max_features',
    'logistic_regression_preprocessor__All_text__tf_lda__tf__max_df',
    'logistic_regression_preprocessor__All_text__tfidf__max_df',
    # Gradient boosting
    'gradient_boosting_gbc__max_depth',
    'gradient_boosting_gbc__max_features',
    'gradient_boosting_gbc__n_estimators',
    'gradient_boosting_preprocessor__All_text__tfidf__max_features',
    'gradient_boosting_preprocessor__All_text__tf_lda__lda__max_iter',
    'gradient_boosting_preprocessor__All_text__tf_lda__lda__n_components',
    'gradient_boosting_preprocessor__All_text__tfidf_lda__tfidf__max_features',
    'gradient_boosting_preprocessor__All_text__tf_lda__tf__max_df',
    'gradient_boosting_preprocessor__All_text__tfidf__max_df',
}

# Fails if the table contains a column that is not listed above
assert set(hyperparameters_df.columns).issubset(parameters_all_text)

In [None]:
#%%script false --no-raise-error
# Replace the technical "<algorithm>_<parameter>" column names with report wording.
column_labels = {}

# Decision tree
column_labels.update({
    'decision_tree_dtc__max_depth': 'Decision tree: maximum depth',
    'decision_tree_dtc__max_features': 'Decision tree: maximum features',
    'decision_tree_dtc__min_samples_split': 'Decision tree: minimum sample split',
    'decision_tree_preprocessor__All_text__tfidf__max_features': 'Decision tree: text - term frequency inverse document frequency maximum features',
    'decision_tree_preprocessor__All_text__tf_lda__lda__max_iter': 'Decision tree: text - latent dirichlet allocation - maximum iterations',
    'decision_tree_preprocessor__All_text__tf_lda__lda__n_components': 'Decision tree: text - latent dirichlet allocation - number of components',
    'decision_tree_preprocessor__All_text__tfidf_lda__tfidf__max_features': 'Decision tree: text - term frequency inverse document frequency maximum features to feed into latent dirichlet allocation',
    'decision_tree_preprocessor__All_text__tf_lda__tf__max_df': 'Decision tree: text - maximum term frequency for terms in topic modelling',
    'decision_tree_preprocessor__All_text__tfidf__max_df': 'Decision tree: text - maximum term frequency for term frequency inverse document frequency matrix',
})

# Logistic regression
column_labels.update({
    'logistic_regression_lr__C': 'Logistic regression: inverse of the regularisation strength',
    'logistic_regression_lr__penalty': 'Logistic regression: penalisation norm',
    'logistic_regression_preprocessor__All_text__tfidf__max_features': 'Logistic regression: text - term frequency inverse document frequency maximum features',
    'logistic_regression_preprocessor__All_text__tf_lda__lda__max_iter': 'Logistic regression: text - latent dirichlet allocation - maximum iterations',
    'logistic_regression_preprocessor__All_text__tf_lda__lda__n_components': 'Logistic regression: text - latent dirichlet allocation - number of components',
    'logistic_regression_preprocessor__All_text__tfidf_lda__tfidf__max_features': 'Logistic regression: text - term frequency inverse document frequency maximum features to feed into latent dirichlet allocation',
    'logistic_regression_preprocessor__All_text__tf_lda__tf__max_df': 'Logistic regression: text - maximum term frequency for terms in topic modelling',
    'logistic_regression_preprocessor__All_text__tfidf__max_df': 'Logistic regression: text - maximum term frequency for term frequency inverse document frequency matrix',
})

# Gradient boosting
column_labels.update({
    'gradient_boosting_gbc__max_depth': 'Gradient boosting: maximum depth',
    'gradient_boosting_gbc__max_features': 'Gradient boosting: maximum features',
    'gradient_boosting_gbc__n_estimators': 'Gradient boosting: number of estimators',
    'gradient_boosting_preprocessor__All_text__tfidf__max_features': 'Gradient boosting: text - term frequency inverse document frequency maximum features',
    'gradient_boosting_preprocessor__All_text__tf_lda__lda__max_iter': 'Gradient boosting: text - latent dirichlet allocation - maximum iterations',
    'gradient_boosting_preprocessor__All_text__tf_lda__lda__n_components': 'Gradient boosting: text - latent dirichlet allocation - number of components',
    'gradient_boosting_preprocessor__All_text__tfidf_lda__tfidf__max_features': 'Gradient boosting: text - term frequency inverse document frequency maximum features to feed into latent dirichlet allocation',
    'gradient_boosting_preprocessor__All_text__tf_lda__tf__max_df': 'Gradient boosting: text - maximum term frequency for terms in topic modelling',
    'gradient_boosting_preprocessor__All_text__tfidf__max_df': 'Gradient boosting: text - maximum term frequency for term frequency inverse document frequency matrix',
})

# Columns that are not present in the table are simply skipped
hyperparameters_df.rename(columns = column_labels, inplace = True, errors='ignore')

In [None]:
# Round
# The logistic regression C column is rounded to 4 decimal places; every other
# numeric column is rounded to 1.
lr_c_col = 'Logistic regression: inverse of the regularisation strength'

numeric_cols = hyperparameters_df.select_dtypes(include='number').columns.tolist()
numeric_cols.remove(lr_c_col)

hyperparameters_df[numeric_cols] = hyperparameters_df[numeric_cols].round(1)
hyperparameters_df[lr_c_col] = hyperparameters_df[lr_c_col].round(4)
hyperparameters_df

In [None]:
# Split to allow the tables to fit to a page
# One sub-table per algorithm, selected by the algorithm name in the column label.
algorithm_labels = ('Decision tree', 'Logistic regression', 'Gradient boosting')
hyperparameters_df_dtc, hyperparameters_df_lr, hyperparameters_df_gb = (
    hyperparameters_df[[col for col in hyperparameters_df.columns if label in col]]
    for label in algorithm_labels
)

In [None]:
#%%script false --no-raise-error
# Structured-only runs, and the LA2 / LA3 runs that include text, are written as one
# combined table; any other run is written as one file per algorithm.
save_as_single_table = (data_type == 'str') or (data_type == 'all' and LA in ('LA2', 'LA3'))

if save_as_single_table:
    print(LA)
    print(data_type)
    # Get the order right
    ordered_tables = [hyperparameters_df_dtc, hyperparameters_df_lr, hyperparameters_df_gb]
    hyperparameters_df = pd.concat(ordered_tables, axis=1)
    hyperparameters_df.to_csv("{}/Hyperparameters {}.csv".format(output_folder, file_stub))

else:
    hyperparameters_df_dtc.to_csv("{}/Hyperparameters dtc {}.csv".format(output_folder, file_stub))
    hyperparameters_df_lr.to_csv("{}/Hyperparameters lr {}.csv".format(output_folder, file_stub))
    hyperparameters_df_gb.to_csv("{}/Hyperparameters gb {}.csv".format(output_folder, file_stub))

In [None]:
# Display the hyperparameter table (last expression in the cell, so the notebook renders it)
hyperparameters_df

In [None]:
# Always fails on purpose - presumably a guard so that "Run All" stops here and the
# sections below are run by hand (TODO confirm).
assert 1==2

## Training time

In [None]:
# Mean training time
# For each algorithm, take the mean fit time of its top-ranked parameter setting.
import math
import pandas as pd
df = pd.DataFrame()

# corrected
# Number of cross-validation folds for the current set-up (not used further in this cell)
if cv == 'ts' or (cv == 'ss' and LA == 'LA4'):
    num_cv = 3
elif cv == 'ss':
    num_cv = 5

rank_1_iteration_dict = {}
for file_name, results in file_dict.items():
    print(file_name)
    rank_1_iteration = results[results['rank_test_score'] == 1]
    rank_1_iteration_dict[file_name] = rank_1_iteration
    # If several settings tie for rank 1, the first one is reported
    fit_seconds = rank_1_iteration['mean_fit_time'].values[0]
    df.at[file_name, 'Average training time (seconds)'] = round(fit_seconds, 2)


In [None]:
# Report-friendly algorithm names for the row labels
algorithm_names_dict = {
    'gradient_boosting': 'Gradient Boosting',
    'logistic_regression': 'Logistic Regression',
    'decision_tree': 'Decision Tree',
}

df.rename(index=algorithm_names_dict, inplace = True)
print(df)

training_time_path = "{}/Training time {}.csv".format(output_folder, file_stub)
df.to_csv(training_time_path)

In [None]:
# Always fails on purpose - presumably a guard so that "Run All" stops here and the
# sections below are run by hand (TODO confirm).
assert 1==2

## Feature importance plot

In [None]:
# Variable set-up
# These settings select which local authority (LA) and which model run the
# feature importance cells below use.
# NOTE(review): best_algorithm is not recomputed here; it keeps whatever value the
# "Which is the best algorithm" cell produced for the settings it was last run with.
LA = 'LA4' # Options: 'LA1', 'LA2', 'LA3', 'LA4'
rq = 'rq2' # Options: 'rq1', 'rq2'
cv = 'ts' # Options: 'ts' (time series split, ignore siblings), 'ss' (stratified shuffle, ignore siblings)
data_type = 'str' # Options: 'str' (just structured data), 'all' (structured data and list of strings)


file_stub = rq + '_' + cv + '_' + data_type # Creates file stub for saving in the format e.g. rq1_ss_str

# LA -> (folder the model outputs are read from, folder this notebook writes to)
la_folders = {
    'LA1': ('../LA1 Results July 2020', 'LA1'),
    'LA2': ('../Final transfer out data and code to WWC Jan 2020', 'LA2'),
    'LA3': ('../Anonymised structured data', 'LA3'),
    'LA4': ('../LA4 August 2020', 'LA4'),
}

# Fail loudly on an unknown LA: otherwise the folders set by an earlier cell
# (for a different LA) would silently be reused.
if LA not in la_folders:
    raise ValueError("Unknown LA {!r}; expected one of {}".format(LA, sorted(la_folders)))

input_folder, output_folder = la_folders[LA]

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


# Pick the (up to) 20 most important features of the best algorithm.
# Sets feature_names and feature_importances; nothing is defined when data_type == 'all'.
if data_type != 'all':
    df_feature_imp = pd.read_csv('{}/Models/feature_importance_{}.csv'.format(input_folder, file_stub))

    # Rank on magnitude: the sign of the importance value is dropped before sorting
    df_feature_imp['feature_importance'] = df_feature_imp['feature_importance'].abs()
    df_feature_imp = df_feature_imp.sort_values(by = 'feature_importance', ascending = False)

    # Build the filtered table with non-mutating calls: resetting the index in place
    # on a filtered slice can trigger pandas' chained-assignment warning.
    df_feature_imp_best_algorithm = (
        df_feature_imp[df_feature_imp['algorithm'] == best_algorithm]
        .reset_index(drop = True)
    )
    has_importance = df_feature_imp_best_algorithm['feature_importance'] > 0
    df_feature_imp_best_algorithm = df_feature_imp_best_algorithm.loc[has_importance]

    # Positional top 20 (rows are already in descending order of importance)
    top_features = df_feature_imp_best_algorithm.head(20)
    feature_names = top_features['column_name']
    feature_importances = round(top_features['feature_importance'], 4)



In [None]:
# Rename features
# Attach the wording used in the report to each of the selected features.
import numpy as np
import re

feature_labels = pd.read_csv('Graphs Analysis - All variables.csv')
label_lookup = feature_labels[['Variable Name', 'Name of variable in the report']]

feature_names_df = pd.merge(
    pd.DataFrame(feature_names),
    label_lookup,
    how = 'left',
    left_on = 'column_name',
    right_on = 'Variable Name',
)
# Keep only the first label found for each feature
feature_names_df = feature_names_df.drop_duplicates(subset = 'column_name')

# List any features without a report label, then stop if there are any
is_unlabelled = feature_names_df['Name of variable in the report'].isna()
print(feature_names_df.loc[is_unlabelled, 'column_name'].unique())
assert is_unlabelled.sum() == 0


In [None]:
# Save the selected features together with the report labels merged in from
# 'Graphs Analysis - All variables.csv'
feature_names_df.to_csv('{}/Feature names for summary stats {}.csv'.format(output_folder, file_stub))

In [None]:
# Always fails on purpose - presumably a guard so that "Run All" stops here and the
# cells below are run by hand (TODO confirm).
assert 1==2

In [None]:
# Try some automatic renaming
# Features without a report label get one derived from their column name;
# when every feature already has a label, the labels are used as they are.
report_names = feature_names_df['Name of variable in the report']

if report_names.isna().sum() == 0:
    feature_names_list = pd.Series(report_names)

else:
    # Fall back to the raw column name wherever the report label is missing
    feature_names_df['Name of variable in the report_all'] = report_names.fillna(feature_names_df['column_name'])

    # Lower-case words that get a space inserted in front of them
    word_list = ['date', 'time', 'source', 'code', 'start', 'social', 'work',
                 'assessment', 'completion', 'days', 'referral', 'care', 'reason',
                 'legal', 'status', 'need', 'abuse', 'category', 'of', 'cp', 'length',
                 'close', 'contact']

    tidied_names = []
    for raw_name in feature_names_df['Name of variable in the report_all']:
        label = raw_name.replace('previous_exc_current_sum', 'Total Number of Previous')
        label = label.replace('previous_exc_current_mean', 'Average Number of Previous')
        label = label.title()
        label = label.replace('_', ': ')
        label = label.replace('.', ' ')
        # Applied after title-casing, so only text that is still lower case is matched
        for w in word_list:
            label = re.sub('{}'.format(w), ' {}'.format(w), label)
        tidied_names.append(label)

    feature_names_list = pd.Series(tidied_names)
    feature_names_list.shape
    
    

In [None]:
# Prediction question
# Number of the prediction question for each (local authority, research question) pair
prediction_questions = {
    ('LA1', 'rq1'): '1',
    ('LA1', 'rq2'): '2',
    ('LA2', 'rq1'): '3',
    ('LA2', 'rq2'): '4',
    ('LA3', 'rq1'): '5',
    ('LA3', 'rq2'): '6',
    ('LA4', 'rq1'): '7',
    ('LA4', 'rq2'): '8',
}

# Left unset for any other combination
if (LA, rq) in prediction_questions:
    prediction_qu = prediction_questions[(LA, rq)]

In [None]:
# Identify source and base
# Builds the three caption lines shown underneath the feature importance plot.
dict_names = {
    'ss': 'learning from all cases',
    'ts': 'learning just from earlier cases',
    'str': 'structured data only',
    'all': 'structured and text data',
}

# Summary rows for the current local authority / research question / cross-validation
is_current_model = (
    (years_sample_sizes['Local authority'] == LA)
    & (years_sample_sizes['Research question'] == rq)
    & (years_sample_sizes['Cross-Validation'] == cv)
)
current_model_info = years_sample_sizes.loc[is_current_model]

# The first matching row is used for each field
outcome = current_model_info['Shortened outcome'].values[0]
years = current_model_info['Years'].values[0]
sample_sizes = current_model_info['Sample size'].values[0]

txt_source = 'Prediction: {}'.format(outcome)
print(txt_source)

txt_model_desc = 'Model: {}, {}'.format(dict_names[cv], dict_names[data_type])
print(txt_model_desc)

LA_num = LA.replace('LA', '')
txt_base = 'Data: {}, {}, N = {}'.format('Local authority {}'.format(LA_num), years, sample_sizes)
print(txt_base)

In [None]:
# Horizontal bar chart of the most important features, with the caption lines
# (prediction, model, data) printed underneath, saved as a PNG.
import numpy as np
from textwrap import wrap

# Every bar needs exactly one label
print(feature_names_list.shape[0])
print(feature_importances.shape[0])
assert (feature_names_list.shape[0] == feature_importances.shape[0])

# Wrap long feature names
feature_names_list = pd.Series([ '\n'.join(wrap(f, 100)) for f in feature_names_list])

if data_type != 'all':
    fig, ax = plt.subplots()

    # Place the bars at numeric positions and label them afterwards. Using the label
    # strings themselves as positions would draw features that share a label on the same bar.
    bar_positions = np.arange(len(feature_names_list))
    ax.barh(bar_positions, feature_importances, color='#ff7057')
    ax.tick_params(axis='both', which='both', length=0)
    ax.set_yticks(bar_positions)

    # Five evenly spaced x ticks from 0 to 10% beyond the largest importance
    largest_importance = max(feature_importances)
    x_ticks = [round(n,4) for n in np.linspace(0,largest_importance+0.1*largest_importance,5)]
    ax.set_xticks(x_ticks)
    ax.set_yticklabels(feature_names_list, fontname="Arial", color='#4d4d51', fontsize=12)
    ax.set_xticklabels(x_ticks, fontname="Arial", color='#4d4d51', fontsize=12)
    ax.xaxis.grid(color='#d0dde1')
    ax.yaxis.grid(False)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_title('The relative importance of each feature when making a prediction', fontname="Arial", color = '#4d4d51', fontsize=12, loc='left')
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    plt.suptitle('MOST IMPORTANT FEATURES (UP TO TOP 20)', fontname="Arial", color = '#ff7057', fontsize=12, x=0.43)

    # Caption lines below the axes (negative y is below the figure area)
    plt.figtext(0, -0.05, txt_source, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)
    plt.figtext(0, -0.1, txt_model_desc, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)
    plt.figtext(0, -0.15, txt_base, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)

    # NOTE(review): the graph is saved under input_folder, whereas the CSV outputs of this
    # notebook are written to output_folder - confirm this is intended.
    plt.savefig('{}/Graphs/Feature importances {} ({}).png'.format(input_folder, file_stub, dict_names[cv]), transparent=False, dpi=80, bbox_inches="tight")
    plt.show()