### Topic modelling

 - Conducts TFIDF and LDA on the training data
 - Note: this deviates from the trial protocol, which stated that we would choose based on coherence. Coherence is not available in sklearn, and the sklearn wrapper for the gensim coherence was buggy.

In [1]:
# Run configuration: set these three switches before running the rest of the notebook.
# They are combined further down into the file stubs used to load and save data.
rq = "rq1"  # Options: 'rq1', 'rq2'
cv = "ss"  # Options: 'ts' (time series split, ignore siblings), 'ss' (stratified shuffle, ignore siblings)
data_type = "all"  # Options: 'str' (just structured data), 'all' (structured data and list of strings)


In [2]:
# Hyper-parameter grid to search over.
# Each key is "<pipeline step>__<parameter>", matching the 'tfidf' and 'lda'
# step names of the pipeline built later in this notebook.
parameters = dict(
    tfidf__max_features=[25, 30],  # Relatively small numbers (keep relatively simple)
    lda__n_components=[2, 4, 6],   # Relatively small numbers (keep relatively simple)
    lda__max_iter=[10, 100],       # Max learning iterations
    lda__learning_decay=[.7],      # Doesn't vary much
    lda__batch_size=[64],          # Doesn't vary much
)

In [3]:
# File stubs used in the names of every file loaded or saved below.
# file_stub has the format e.g. rq1_ss_str; the y / siblings files always use 'str'.
file_stub = '{}_{}_{}'.format(rq, cv, data_type)
file_stub_y_siblings = '{}_{}_str'.format(rq, cv)

# Free-text columns to topic-model; the set depends on the research question.
if rq == 'rq1':
    text_cols = [
        'Contact and Referral Form_text',
        'Child Social Work Assessment for Review Child Protection Conference_text_prev',
        'Child Social Work Assessment to Initial Child Protection Conference_text_prev',
        'Child Social Work Assessment_text_prev',
    ]
else:
    text_cols = [
        'Child Social Work Assessment for Review Child Protection Conference_text',
        'Child Social Work Assessment to Initial Child Protection Conference_text',
        'Child Social Work Assessment_text',
        'Contact and Referral Form_text',
        'Child Social Work Assessment for Review Child Protection Conference_text_prev',
        'Child Social Work Assessment to Initial Child Protection Conference_text_prev',
        'Child Social Work Assessment_text_prev',
        'Contact and Referral Form_text_prev',
    ]

In [4]:
# Add to system path so we can find all the packages
import sys

# Raw strings keep the Windows backslashes literal; a plain 'C:\Program Files'
# relies on '\P' being an invalid escape, which newer Python versions warn about.
for extra_path in (r'C:\Program Files\Python37\Lib\site-packages',
                   r'C:\Program Files\Python37'):
    if extra_path not in sys.path:  # re-running the cell must not pile up duplicates
        sys.path.append(extra_path)

# Reload user-written modules automatically whenever their source changes
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# Load user-written functions
import analysis_functions, text_functions

In [None]:
# Only want to train the LDA on training data (otherwise there's leakage from the test data)
# NB df_text_list_of_strings dataset indices were reset in 5_Combine_Data, so the
# y and siblings indices are reset here to match.
# (Not done earlier because the structured dataset was still needed with its original
# indices, and it was marginally helpful for siblings to match the structured dataset.)
import os
import pickle

# Set working directory - every relative path in this and later cells resolves from here
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Text Data\\Created\\") # insert [username]

# NOTE: pickle.load can execute arbitrary code; only load files created by this project.
# 'with' closes each file as soon as it has been read.
with open("df_text_list_of_strings_train_{}.pkl".format(file_stub), "rb") as handle:
    df_text_list_of_strings_train = pickle.load(handle)
print(df_text_list_of_strings_train.shape)
print(df_text_list_of_strings_train.index)

with open("../../Data for Model/y_train_{}.pkl".format(file_stub_y_siblings), "rb") as handle:
    y_train = pickle.load(handle)
print(y_train.shape)
y_train = y_train.reset_index(drop = True)
print(y_train.index)

with open("../../Data for Model/siblings_train_{}.pkl".format(file_stub_y_siblings), "rb") as handle:
    siblings_train = pickle.load(handle)
print(siblings_train.shape)
siblings_train = siblings_train.reset_index(drop = True)
print(siblings_train.index)

In [None]:
# Grid-search a TFIDF -> LDA pipeline separately for every text column and
# save the per-column cross-validation results and best models.
from analysis_functions import TimeSeriesSplitIgnoreSiblings, StratifiedShuffleSplitGroups, grid_search_save_output
import pickle
from sklearn.model_selection import GridSearchCV, GroupKFold, TimeSeriesSplit # public import path (not the private _split module)
from sklearn.pipeline import Pipeline # ok to use here as not considering the imbalance of the data => no need for imblearn pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

tfidf = TfidfVectorizer(stop_words = 'english',
                        lowercase = True,
                        min_df = 1, # 1 = keep terms found in at least one document (no pruning); a HIGHER min_df prunes more => fewer tokens; a float is a fraction of documents
                        token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",#words with >= 2 alpha chars 
                        use_idf = True)

lda_model = LatentDirichletAllocation(learning_method='online',  # Online is faster
                                      random_state=3005,         # Random state
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      #n_jobs = -1 # Use all available CPUs
                                     )

# Pipeline whose step names ('tfidf', 'lda') are the prefixes used in the parameter grid
pipeline_lda = Pipeline([
    ('tfidf', tfidf),
    ('lda', lda_model)])

# Create cross-validation splits
if cv == 'ts':
    tss_sib = TimeSeriesSplitIgnoreSiblings(n_splits=3, sibling_group = siblings_train, sibling_na = "99999.0")
    cross_val = tss_sib
else:
    sssg = StratifiedShuffleSplitGroups(n_splits=3, random_state=3005, sibling_group = siblings_train, sibling_na = '99999.0')
    cross_val = sssg


column_model_dict = {}             # column name -> best estimator returned for that column
column_model_cv_results_dict = {}  # column name -> results returned for that column

for col in text_cols:
    try:
        gscv = GridSearchCV(pipeline_lda, parameters, cv=cross_val, verbose=5, refit = True)
        print(gscv)
        df_all, best_parameters, best_estimator = grid_search_save_output(gscv, 'lda', df_text_list_of_strings_train[col], y_train, '../../Models/topic_modelling_results_{}_{}.csv'.format(file_stub, col))
        # Best Model
        column_model_dict[col] = best_estimator
        column_model_cv_results_dict[col] = df_all
        print("Column: ", col)
        print("Best Model's Params: ", best_parameters)
        print("Best score: ", gscv.best_score_)
        # Re-save both dictionaries after every column so finished columns
        # survive a failure on a later one
        with open("../../Models/column_model_cv_results_dict_{}.pkl".format(file_stub), "wb") as handle:
            pickle.dump(column_model_cv_results_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)
        with open("../../Models/column_model_dict_{}.pkl".format(file_stub), "wb") as handle:
            pickle.dump(column_model_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)
    except ValueError as err:
        # Best effort: skip this column but report why, instead of only printing its name
        print("Skipping column {} - grid search raised ValueError: {}".format(col, err))
    

In [None]:
# Look at which words are most frequent in a topic
# Gives an indication of how to name topics
from text_functions import topic_top_words

import pickle

# NOTE: pickle.load can execute arbitrary code; only load files created by this project
with open("../../Models/column_model_dict_{}.pkl".format(file_stub), "rb") as handle:
    column_model_dict = pickle.load(handle)

n_top_words = 15
avoid_list = ["name", "surname", "date", "child", "social", "care",  "mr", "ms", "mrs"]

print("Topics in LDA model:")


for col in text_cols:
    try:
        print("Column: ", col)
        lda = column_model_dict[col]['lda']
        tfidf_vectorizer = column_model_dict[col]['tfidf']
        # get_feature_names() was removed in scikit-learn 1.2; use its replacement
        # when available and fall back to the old method on older versions
        get_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names
        tf_feature_names = list(get_names())
        topic_top_words_dict = topic_top_words(lda, tf_feature_names, n_top_words, avoid_list)
        filepath = "../../Topics/top_words_topics_{}_{}.txt".format(file_stub, col)
        with open(filepath, 'w') as file_handler:
            for key, value in topic_top_words_dict.items():
                top_words = [v[0] for v in value]  # first element of each entry is printed as the word
                print(key, top_words)
                file_handler.write("Topics {} {} {}\n".format(file_stub, key, top_words))
    except KeyError:
        # Most likely no model was saved for this column (e.g. its grid search failed)
        print("Error in : ", col)

In [None]:
# Word clouds of highest weighted topics - one figure per column
from text_functions import create_wordcloud

wordcloud_avoid_list = ["surname", "date", "child", "children", "yes", "social", "care", "nan", "single",  "mr", "ms", "mrs"]

for col in text_cols:
    try:
        # NOTE(review): the meaning of the positional arguments 15 and 2 is defined
        # by text_functions.create_wordcloud - confirm there before changing them
        create_wordcloud(column_model_dict[col], wordcloud_avoid_list, 15, file_stub, col, 'tfidf', 2)
    except KeyError:
        # Most likely no model was saved for this column; skip it, but say so
        # rather than dropping it silently
        print("No word cloud created for column: ", col)

In [None]:
# Add TFIDF features and topics as features (training data)

import pandas as pd
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files created by this project
with open("../../Models/column_model_dict_{}.pkl".format(file_stub), "rb") as handle:
    column_model_dict = pickle.load(handle)

tfidf_frames = []
topic_frames = []
for col, fitted_pipeline in column_model_dict.items():
    # Make dataframe of documents x terms
    tfidf_vectorizer = fitted_pipeline['tfidf']
    # Only transform: the stored vectoriser is already fitted, and the stored LDA
    # is aligned to its vocabulary. Re-fitting it here (fit_transform) could
    # change that vocabulary and would differ from how the test data is handled.
    tfidf_vecs = tfidf_vectorizer.transform(df_text_list_of_strings_train[col])
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it on older versions
    get_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names
    df_tfidf = pd.DataFrame(tfidf_vecs.toarray(), columns=list(get_names()))
    df_tfidf = df_tfidf.add_prefix(col + '_')
    tfidf_frames.append(df_tfidf)
    # Make dataframe of documents x topics
    lda = fitted_pipeline['lda']
    topic_weights = lda.transform(tfidf_vecs)
    topic_columns = ['{}_Topic_{}'.format(col, n) for n in range(topic_weights.shape[1])]
    df_topics = pd.DataFrame(topic_weights, columns = topic_columns)
    topic_frames.append(df_topics)

# Concatenate once instead of growing a dataframe inside the loop;
# pd.concat raises on an empty list, so fall back to an empty dataframe
df_tfidf_all = pd.concat(tfidf_frames, axis=1) if tfidf_frames else pd.DataFrame()
df_topics_all = pd.concat(topic_frames, axis=1) if topic_frames else pd.DataFrame()

with open("../../Data for Model/df_tfidf_all_train_{}.pkl".format(file_stub), "wb") as handle:
    pickle.dump(df_tfidf_all, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open("../../Data for Model/df_topics_all_train_{}.pkl".format(file_stub), "wb") as handle:
    pickle.dump(df_topics_all, handle, protocol = pickle.HIGHEST_PROTOCOL)

# Combine tfidf and topics
# (rows are aligned on the index, so this relies on the list-of-strings index
# being the default 0..n-1 index)
keep_columns = ['ReferralDatetime', 'PSID']
df_text_tfidf_features_topics_train = pd.concat([df_text_list_of_strings_train[keep_columns], df_tfidf_all, df_topics_all], axis = 1)

with open("../../Updated Structured Data/Created/df_tfidf_topics_train_{}.pkl".format(file_stub), "wb") as handle:
    pickle.dump(df_text_tfidf_features_topics_train, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
# Use same vectoriser and LDA model to create TFIDF and topics for test data
import pandas as pd
import pickle
import traceback

# NOTE: pickle.load can execute arbitrary code; only load files created by this project
# Bring in test list of strings
with open("../../Updated Structured Data/Created/df_text_list_of_strings_test_{}.pkl".format(file_stub), "rb") as handle:
    df_text_list_of_strings_test = pickle.load(handle)
print(df_text_list_of_strings_test.shape)
print(df_text_list_of_strings_test.index)


# Bring in previous models
with open("../../Models/column_model_dict_{}.pkl".format(file_stub), "rb") as handle:
    column_model_dict = pickle.load(handle)

## Test
tfidf_frames = []
topic_frames = []
for col, fitted_pipeline in column_model_dict.items():
    try:
        # Make dataframe of documents x terms
        tfidf_vectorizer = fitted_pipeline['tfidf']
        tfidf_vecs = tfidf_vectorizer.transform(df_text_list_of_strings_test[col])
        # get_feature_names() was removed in scikit-learn 1.2; fall back to it on older versions
        get_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names
        df_tfidf = pd.DataFrame(tfidf_vecs.toarray(), columns=list(get_names()))
        df_tfidf = df_tfidf.add_prefix(col + '_')
        tfidf_frames.append(df_tfidf)
        # Make dataframe of documents x topics
        lda = fitted_pipeline['lda']
        topic_weights = lda.transform(tfidf_vecs)
        topic_columns = ['{}_Topic_{}'.format(col, n) for n in range(topic_weights.shape[1])]
        df_topics = pd.DataFrame(topic_weights, columns = topic_columns)
        topic_frames.append(df_topics)
    except Exception:
        # Best effort: carry on with the other columns, but say which one failed and why
        print("Failed to create test features for column: ", col)
        print(traceback.format_exc())

# Concatenate once instead of growing a dataframe inside the loop;
# pd.concat raises on an empty list, so fall back to an empty dataframe
df_tfidf_all = pd.concat(tfidf_frames, axis=1) if tfidf_frames else pd.DataFrame()
df_topics_all = pd.concat(topic_frames, axis=1) if topic_frames else pd.DataFrame()

# Run number 7 to combine all
with open("../../Updated Structured Data/Created/df_tfidf_all_test_{}.pkl".format(file_stub), "wb") as handle:
    pickle.dump(df_tfidf_all, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open("../../Updated Structured Data/Created/df_topics_all_test_{}.pkl".format(file_stub), "wb") as handle:
    pickle.dump(df_topics_all, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
# Join the identifier columns with the TFIDF and topic features of the test data
keep_columns = ['ReferralDatetime', 'PSID']

# Give both feature dataframes the list-of-strings index so that the rows
# line up when concatenating side by side
for feature_frame in (df_tfidf_all, df_topics_all):
    feature_frame.set_index(df_text_list_of_strings_test.index, drop = True, append = False, inplace = True)

test_parts = [df_text_list_of_strings_test[keep_columns], df_tfidf_all, df_topics_all]
df_text_tfidf_topics_test = pd.concat(test_parts, axis = 1)

# Shapes of the two feature sets and of the combined result
for frame in (df_tfidf_all, df_topics_all, df_text_tfidf_topics_test):
    print(frame.shape)

output_path = "../../Updated Structured Data/Created/df_tfidf_topics_test_{}.pkl".format(file_stub)
with open(output_path, "wb") as handle:
    pickle.dump(df_text_tfidf_topics_test, handle, protocol = pickle.HIGHEST_PROTOCOL)