In [None]:
# Run configuration: the values in this first group are the ones to edit.

rq = 'rq2'  # Research question. Options: 'rq1', 'rq2'
cv = 'ss'  # Cross-validation. Options: 'ts' (time series split, ignore siblings), 'ss' (stratified shuffle, ignore siblings)
data_type = 'all'  # Input data. Options: 'str' (just structured data), 'all' (structured data and list of strings)
algorithm_names = ['decision_tree', 'logistic_regression', 'gradient_boosting']
#resampling_name = 'oss' # anything other than 'ada' does 'smote' 
#select_features_alpha = 0.001 # 0 to keep all features. Should be highest 0.001 as otherwise all dropped
rcv_n_iter = 50  # The more iterations, the more the randomised search searches for an optimal solution
parameters = 2  # In the cells shown here this only feeds the `levers` label below

# Derived labels -- don't change
file_stub_y_siblings = '_'.join([rq, cv, 'str'])  # always 'str', whatever data_type is
file_stub = '_'.join([rq, cv, data_type])  # file stub used when loading/saving, e.g. rq1_ss_str
levers = '{}_{}'.format(rcv_n_iter, parameters)
print('{}_{}'.format(file_stub, levers))

In [None]:
## File directories
# Replace [username] with the local account name before running.
_documents_dir = '/Users/[username]/Documents'
_transfer_folder = 'Final transfer out data and code to WWC Jan 2020'

# Local copy of the transfer folder (models and graphs are read from / written to it)
local_dir = '{}/{}'.format(_documents_dir, _transfer_folder)
# Model input data on the external drive
hard_drive_dir = '/Volumes/diskAshur2/{}/Data for model/Use'.format(_transfer_folder)
# Folder holding the summary tables (e.g. years and sample sizes)
summary_info = '{}/Summary information'.format(_documents_dir)

In [None]:
# Load user-written functions 

%load_ext autoreload
%autoreload 2

import analysis_functions

In [None]:
# Set working directory
# Moves the kernel's working directory to hard_drive_dir, so relative paths
# used after this cell resolve against that folder.
import os
# NOTE(review): pickle is not used in the cells visible here -- confirm it is
# needed before keeping the import.
import pickle
os.chdir(hard_drive_dir)
# Last expression of the cell: the notebook displays the new working directory.
os.getcwd()

In [None]:
# Topic results
import glob
import pandas as pd

# Load every topic-modelling results CSV for this run into a dict keyed by the
# part of the file name that follows "topic_modelling_results_" (minus ".csv").

# Prefix stripped from each matched path to build the dictionary key.
results_prefix = "{}/Models/topic_modelling_results_".format(local_dir)

# glob.escape makes any glob metacharacters in local_dir (e.g. '[' and ']')
# match literally instead of acting as wildcards. sorted() fixes the order,
# because glob returns matches in arbitrary order and later cells iterate
# over file_dict to draw plots and stack rows.
file_list = sorted(glob.glob(
    "{}/Models/topic_modelling_results_{}_*.csv".format(glob.escape(local_dir), file_stub)))

file_dict = {}
for file_path in file_list:
    key = file_path.replace(results_prefix, "").replace(".csv", "")
    file_dict[key] = pd.read_csv(file_path)

print(file_dict.keys())

In [None]:
### Bring in source and base
# Builds the three caption strings (prediction, model description, data base)
# that are printed here and drawn under each graph.
years_sample_sizes = pd.read_csv("{}/Years and Sample Sizes.csv".format(summary_info))

# Identify source and base
dict_names = {'ss': 'learning from all cases',
             'ts': 'learning just from earlier cases',
             'str': 'structured data only',
             'all': 'structured and text data'}

# The same row selection feeds outcome, years and sample size, so build the
# filter once instead of repeating it for every column.
run_mask = ((years_sample_sizes['Local authority'] == 'LA2') &
            (years_sample_sizes['Research question'] == rq) &
            (years_sample_sizes['Cross-Validation'] == cv))
run_rows = years_sample_sizes.loc[run_mask]

# Fail with a readable message rather than the bare "index 0 is out of bounds"
# that .values[0] gives on an empty selection (exception type is unchanged).
if run_rows.empty:
    raise IndexError(
        "No row in 'Years and Sample Sizes.csv' for Local authority 'LA2', "
        "Research question {!r}, Cross-Validation {!r}".format(rq, cv))

# If several rows match, the first one is used.
outcome = run_rows['Shortened outcome'].values[0]
years = run_rows['Years'].values[0]
sample_sizes = run_rows['Sample size'].values[0]

txt_source = 'Prediction: {}'.format(outcome)
print(txt_source)
txt_model_desc = 'Model: {}, {}'.format(dict_names[cv], dict_names[data_type])
print(txt_model_desc)
txt_base = 'Data: {}, {}, N = {}'.format('Local authority 2', years, sample_sizes)
print(txt_base)

# Human-readable cross-validation label; only set for the two valid cv options.
if cv in ('ts', 'ss'):
    cv_for_graph = dict_names[cv]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from textwrap import wrap


# Draw and save one elbow plot per text column: mean test score against the
# number of topics, with one line per tf-idf maximum-features setting.
for col, topic_modelling_results in file_dict.items():

    # Only the runs with 100 LDA iterations are plotted.
    at_100_iterations = topic_modelling_results['param_lda__max_iter'] == 100

    # Sort by number of topics so each line is drawn left to right even if the
    # rows in the results file are not in that order.
    topics_1 = topic_modelling_results.loc[
        at_100_iterations & (topic_modelling_results['tfidf__max_features'] == 30.0)
    ].sort_values('param_lda__n_components')

    topics_2 = topic_modelling_results.loc[
        at_100_iterations & (topic_modelling_results['tfidf__max_features'] == 25.0)
    ].sort_values('param_lda__n_components')

    fig, ax = plt.subplots()

    # Legend labels now match the filter used for each series (they were
    # previously swapped: the 30-feature line was labelled 25 and vice versa).
    ax.plot(topics_1['param_lda__n_components'], topics_1['mean_test_score'], '^-', color='#ff7057', label = 'Maximum features = 30')
    ax.plot(topics_2['param_lda__n_components'], topics_2['mean_test_score'], 'o-', color='#ff7057', label = 'Maximum features = 25')


    ax.tick_params(axis='both', which='both', length=0, colors='#4d4d51')

    plt.xlabel('Number of topics', fontname="Arial", color='#4d4d51', fontsize=12)
    plt.ylabel('Log likelihood', fontname="Arial", color='#4d4d51', fontsize=12)
    ax.set_xticks(np.arange(2,8,2))
    ax.xaxis.grid(False)
    ax.yaxis.grid(color='#d0dde1')
    title = '\n'.join(wrap('How well the topic model explains the data (as measured by log likelihood) for increasing numbers of topics', 60))
    ax.set_title(title, fontname="Arial", color = '#4d4d51', fontsize=12, loc='left')
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    plt.suptitle('LIKELIHOOD OF TOPIC MODEL BY NUMBER OF TOPICS', fontname="Arial", color = '#ff7057', fontsize=12, x=0.5, y=1.05)
    legend = plt.legend()
    plt.setp(legend.get_texts(), color='#4d4d51')

    # Caption lines sit below the axes (negative y in figure coordinates).
    plt.figtext(0, -0.05, txt_source, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)
    plt.figtext(0, -0.1, txt_model_desc, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)
    plt.figtext(0, -0.15, txt_base, wrap=True, fontname="Arial", color='#4d4d51', fontsize=12)

    plt.savefig('{}/Graphs/Elbow Plots {} ({}).png'.format(local_dir, file_stub, col), transparent=False, dpi=80, bbox_inches="tight")
    plt.show()
    # Release the figure so open figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# Concatenate all the dataframes
# Stacks the per-document topic-modelling results into one table with
# readable column names, ready to be saved.
kept_columns = ['mean_test_score', 'std_test_score',
                'param_lda__max_iter', 'param_lda__n_components',
                'tfidf__max_features']

df_list = []
for col, topic_modelling_results in file_dict.items():
    # .assign returns a new frame, so the rounded scores and the Document label
    # are never written onto a slice of the loaded results (this avoids
    # SettingWithCopyWarning and leaves file_dict untouched).
    df = topic_modelling_results[kept_columns].assign(
        mean_test_score=lambda frame: frame['mean_test_score'].round(2),
        std_test_score=lambda frame: frame['std_test_score'].round(2),
        Document=col)
    df_list.append(df)

# Create dataset to save
readable_names = {'mean_test_score': 'Mean test score', 'std_test_score': 'Standard deviation test score',
                  'param_lda__max_iter': 'Maximum iterations', 'param_lda__n_components': 'Number of components',
                  'tfidf__max_features': 'Maximum number of features'}

results = pd.concat(df_list, axis=0).rename(columns=readable_names)

# Final column order for the saved table.
results = results[['Document',
'Maximum iterations', 'Number of components',
'Maximum number of features', 'Mean test score',
'Standard deviation test score']]



In [None]:
# Replace the file-derived document keys with readable document names.
# NOTE(review): the last entry maps the "... to Initial Child Protection
# Conference_text" key to a "Review" label -- either the key or the label
# looks inconsistent with the other entries; confirm the intended mapping.
document_labels = {'{}_Child Social Work Assessment to Initial Child Protection Conference_text_prev'.format(file_stub): 'Report to previous Initial Child Protection Conferences',
       '{}_Contact and Referral Form_text'.format(file_stub): 'Contact and Referral Record',
       '{}_Child Social Work Assessment_text_prev'.format(file_stub): 'Assessment',
       '{}_Child Social Work Assessment for Review Child Protection Conference_text_prev'.format(file_stub): 'Report to previous Review Child Protection Conferences',
       '{}_Child Social Work Assessment to Initial Child Protection Conference_text'.format(file_stub): 'Report to Review Child Protection Conference'}

# Assign the relabelled column back instead of calling replace(inplace=True) on
# results['Document']: the in-place call acts on a temporary column object and
# does not update `results` when pandas copy-on-write is active.
results = results.assign(Document=results['Document'].replace(document_labels))

In [None]:
# Write the combined results table into the local Models folder; the row index is not saved.
log_likelihood_path = "{}/Models/Log Likelihood_{}.csv".format(local_dir, file_stub)
results.to_csv(log_likelihood_path, index=False)