In [None]:
#!/usr/bin/env python
# coding: utf-8

# ## Structured Data
# 
# - Create final dataset for each research question
# - Create test, train, holdout splits

# Load user-written functions
# The autoreload extension in mode 2 reloads imported modules before each cell is run,
# so edits to the local helper module (analysis_functions, imported further down) are
# picked up without restarting the kernel.

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

In [None]:
import pandas as pd
import os

# Work from the folder that holds the created structured data
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Structured Data\\Created") # insert [username]

# RQ1 extract (first CSV column becomes the index), then a quick look at its layout
rq1data = pd.read_csv("rq1data_w_previous.csv", index_col = 0)
for summary in (rq1data.shape, rq1data.index, rq1data.columns):
    print(summary)

# RQ2 extract, loaded and summarised the same way
rq2data = pd.read_csv("rq2data_w_previous.csv", index_col = 0)
for summary in (rq2data.shape, rq2data.index, rq2data.columns):
    print(summary)

In [None]:
# The analysis pipeline doesn't accept object columns, so recast these as categoricals
categorical_columns = ['detailed_ethnicity', 'Gender', 'ReReferral', 'hasdisability', 'ethnicity_highlevel']
for frame in (rq1data, rq2data):
    frame[categorical_columns] = frame[categorical_columns].astype('category')

In [None]:
# Remove prescient columns, ID columns and the columns that were only used for matching
matching_columns = ['ActualStartDate', 'ReferralCloseDate',
                    'ReferralDatetime_previous', 'ReferralCloseDate_previous',
                    'ActualStartDate_previous']

# RQ1 additionally drops 'NFAandreturnwithintwoyears'
# (presumably the two-year counterpart of the RQ1 outcome - confirm)
rq1data.drop(columns = ['NFAandreturnwithintwoyears'] + matching_columns, inplace = True)
rq2data.drop(columns = matching_columns, inplace = True)

In [None]:
# Order rows by PSID and then ReferralDatetime, and renumber the index from zero,
# so that the rows match up with the text data
import pickle

sort_keys = ['PSID', 'ReferralDatetime']
for frame in (rq1data, rq2data):
    frame.sort_values(by = sort_keys, inplace = True)
    frame.reset_index(drop = True, inplace = True)


In [None]:
# Rows with a missing outcome are removed; the shape is printed before and after each drop
for frame, outcome_column in ((rq1data, 'NFAandreturnwithinoneyear'), (rq2data, 'escalation')):
    print(frame.shape)
    frame.dropna(subset = [outcome_column], inplace = True)
    print(frame.shape)

In [None]:
# Those aged 18 or over aren't eligible for CPP or being LAC, so they are dropped from the analysis.
# Rows where the age is missing are kept.

# rq1
print(rq1data.shape)
rq1_age = rq1data['AgeAtReferralDate']
print("Age missing: ", rq1_age.isna().sum())
print(rq1_age.value_counts().sort_index())
rq1_under_18_or_missing = rq1_age.lt(18.0) | rq1_age.isna()
rq1data = rq1data.loc[rq1_under_18_or_missing]
print(rq1data['AgeAtReferralDate'].value_counts().sort_index())
print(rq1data.shape)

# rq2
print(rq2data.shape)
rq2_age = rq2data['AgeAtReferralDate']
print("Age missing: ", rq2_age.isna().sum())
print(rq2_age.value_counts().sort_index())
rq2_under_18_or_missing = rq2_age.lt(18.0) | rq2_age.isna()
rq2data = rq2data.loc[rq2_under_18_or_missing]
print(rq2data['AgeAtReferralDate'].value_counts().sort_index())
print(rq2data.shape)

In [None]:
# Build a first-of-the-month timestamp from each referral date; the time series
# splitting sorts on this column
for frame in (rq1data, rq2data):
    frame['ReferralDatetime'] = pd.to_datetime(frame['ReferralDatetime'])
    # year / month / day columns are assembled into a single datetime by pd.to_datetime
    dates = pd.DataFrame({'year': frame['ReferralDatetime'].dt.year,
                          'month': frame['ReferralDatetime'].dt.month,
                          'day': 1})
    frame['ReferralDatetime_month_year'] = pd.to_datetime(dates)
    # Count of rows where the month/year value is missing
    print(frame['ReferralDatetime_month_year'].isna().sum())

In [None]:
# Cross-validation keeps multiple observations of the same child together, which
# requires every PSID to map to exactly one PseudoID. Checked for rq1, then rq2.
for frame in (rq1data, rq2data):
    sib_unique = frame.groupby('PSID')['PseudoID'].nunique()
    assert sib_unique.eq(1).all()

In [None]:
# Split the sensitive characteristics (plus the row keys) into their own datasets
# for evaluating bias
sensitive_columns = ['PSID', 'ReferralDatetime',
                     'ReferralDatetime_month_year',
                     'detailed_ethnicity',
                     'ethnicity_highlevel',
                     'hasdisability',
                     'numberofdisabilities']

rq1data_sensitive = rq1data[sensitive_columns]
print(rq1data_sensitive.shape)
print(rq1data_sensitive.columns)

rq2data_sensitive = rq2data[sensitive_columns]
print(rq2data_sensitive.shape)
print(rq2data_sensitive.columns)

# Write both subsets as pickles into the current working directory
with open("df_outcome1_sensitive_characteristics.pkl", "wb") as sensitive_file:
    pickle.dump(rq1data_sensitive, sensitive_file, protocol = pickle.HIGHEST_PROTOCOL)

with open("df_outcome2_sensitive_characteristics.pkl", "wb") as sensitive_file:
    pickle.dump(rq2data_sensitive, sensitive_file, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle
# Order by referral month/year, renumber the index, then save.
# X, y and siblings are all derived from this dataset, so sorting before splitting
# ensures the datasets line up.
rq1data = rq1data.sort_values(by = ['ReferralDatetime_month_year']).reset_index(drop = True)
rq2data = rq2data.sort_values(by = ['ReferralDatetime_month_year']).reset_index(drop = True)

presplit_outputs = (("df_outcome1_before_splitting.pkl", rq1data),
                    ("df_outcome2_before_splitting.pkl", rq2data))
for presplit_name, presplit_frame in presplit_outputs:
    with open(presplit_name, "wb") as handle:
        pickle.dump(presplit_frame, handle, protocol = pickle.HIGHEST_PROTOCOL)

### Test, train, holdout split - RQ1 and RQ2

In [None]:
# Import data ready for test / train / holdout split (already sorted by referral date)

import pandas as pd
import pickle
import os

# Set working directory
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Structured Data\\Created") # insert [username]

# Each pickle is read inside a `with` block so its file handle is closed as soon as
# loading finishes instead of staying open for the rest of the session.
# NOTE: pickle.load can execute arbitrary code - only load files written by this pipeline.
with open("df_outcome1_before_splitting.pkl", "rb") as handle:
    df_outcome1_before_splitting = pickle.load(handle)
print(df_outcome1_before_splitting.shape)

with open("df_outcome2_before_splitting.pkl", "rb") as handle:
    df_outcome2_before_splitting = pickle.load(handle)
print(df_outcome2_before_splitting.shape)

#### Stratified shuffle splits

In [None]:
# rq1, ss
from analysis_functions import StratifiedShuffleSplitGroups, create_test_train_splits

outcome = 'NFAandreturnwithinoneyear'
siblings = df_outcome1_before_splitting['PseudoID'] # pandas Series
# The sibling identifier is passed separately, so it is removed from the feature frame
df_outcome1_before_splitting_no_sib = df_outcome1_before_splitting.drop(columns = ['PseudoID'])

# Both helpers are user-written (analysis_functions). Per the author's notes:
# - create_test_train_splits saves test, train and holdout data for X, y and siblings
# - n_splits=1 for ss, because create_test_train_splits then divides the second split
#   into test and holdout; otherwise test and holdout would overlap (not the same for ts)
ss = StratifiedShuffleSplitGroups(
    n_splits = 1,
    test_size = .4,
    sibling_group = siblings,
    sibling_na = "99999.0",
    random_state = 3005,
)
create_test_train_splits(df_outcome1_before_splitting_no_sib, ss, 'ss', outcome, siblings, 'rq1_ss_str')


In [None]:
## Check no overlapping siblings or child ids
# Check no overlapping siblings (except value denoting missing)

import pandas as pd
import pickle
import os

# Set working directory
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Structured Data\\Created") # insert [username]

# Each saved split is read inside a `with` block so the file handle is closed once
# the pickle has been loaded.
with open("../../Data for Model/X_train_rq1_ss_str.pkl", "rb") as handle:
    X_tr_ss = pickle.load(handle)
print(X_tr_ss.shape)
print(X_tr_ss.index)

with open("../../Data for Model/X_test_rq1_ss_str.pkl", "rb") as handle:
    X_test_ss = pickle.load(handle)
print(X_test_ss.shape)
print(X_test_ss.index)

# PSIDs that appear in both train and test
psid_overlap = set(X_tr_ss['PSID']).intersection(set(X_test_ss['PSID']))
print(psid_overlap)

# There should actually be 0 overlapping PSIDs as there shouldn't be any missing
# (a single shared value is tolerated to allow for the missing marker being in both)
assert len(psid_overlap) <= 1, "PSIDs shared between train and test"

with open("../../Data for Model/siblings_train_rq1_ss_str.pkl", "rb") as handle:
    siblings_tr_ss = pickle.load(handle)
print(siblings_tr_ss.shape)
print(siblings_tr_ss.index)

with open("../../Data for Model/siblings_test_rq1_ss_str.pkl", "rb") as handle:
    siblings_test_ss = pickle.load(handle)
print(siblings_test_ss.shape)
print(siblings_test_ss.index)

# PseudoIDs that appear in both train and test
sibling_overlap = set(siblings_tr_ss).intersection(set(siblings_test_ss))
print(sibling_overlap)

# There should actually be 0 overlapping PseudoIDs as there shouldn't be any missing
# (a single shared value is tolerated to allow for the missing marker being in both)
assert len(sibling_overlap) <= 1, "PseudoIDs shared between train and test"


In [None]:
# rq2, ss
outcome = 'escalation'
siblings = df_outcome2_before_splitting['PseudoID'] # pandas Series
# The sibling identifier is passed separately, so it is removed from the feature frame
df_outcome2_before_splitting_no_sib = df_outcome2_before_splitting.drop(columns = ['PseudoID'])

# Both helpers are user-written. Per the author's notes:
# - create_test_train_splits saves test, train and holdout data for X, y and siblings
# - n_splits=1 for ss, because create_test_train_splits then divides the second split
#   into test and holdout; otherwise test and holdout would overlap (not the same for ts)
ss = StratifiedShuffleSplitGroups(
    n_splits = 1,
    test_size = 0.4,
    sibling_group = siblings,
    sibling_na = "99999.0",
    random_state = 3005,
)
create_test_train_splits(df_outcome2_before_splitting_no_sib, ss, 'ss', outcome, siblings, 'rq2_ss_str')


In [None]:
## Check no overlapping siblings or child ids
# Check no overlapping siblings (except value denoting missing)

import pandas as pd
import pickle
import os

# Set working directory
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Structured Data\\Created") # insert [username]

# Each saved split is read inside a `with` block so the file handle is closed once
# the pickle has been loaded.
with open("../../Data for Model/X_train_rq2_ss_str.pkl", "rb") as handle:
    X_tr_ss = pickle.load(handle)
print(X_tr_ss.shape)
print(X_tr_ss.index)

with open("../../Data for Model/X_test_rq2_ss_str.pkl", "rb") as handle:
    X_test_ss = pickle.load(handle)
print(X_test_ss.shape)
print(X_test_ss.index)

# PSIDs that appear in both train and test
psid_overlap = set(X_tr_ss['PSID']).intersection(set(X_test_ss['PSID']))
print(psid_overlap)

# There should actually be 0 overlapping PSIDs as there shouldn't be any missing
# (a single shared value is tolerated to allow for the missing marker being in both)
assert len(psid_overlap) <= 1, "PSIDs shared between train and test"

with open("../../Data for Model/siblings_train_rq2_ss_str.pkl", "rb") as handle:
    siblings_tr_ss = pickle.load(handle)
print(siblings_tr_ss.shape)
print(siblings_tr_ss.index)

with open("../../Data for Model/siblings_test_rq2_ss_str.pkl", "rb") as handle:
    siblings_test_ss = pickle.load(handle)
print(siblings_test_ss.shape)
print(siblings_test_ss.index)

# PseudoIDs that appear in both train and test
sibling_overlap = set(siblings_tr_ss).intersection(set(siblings_test_ss))
print(sibling_overlap)

# There should actually be 0 overlapping PseudoIDs as there shouldn't be any missing
# (a single shared value is tolerated to allow for the missing marker being in both)
assert len(sibling_overlap) <= 1, "PseudoIDs shared between train and test"


#### Time Series split

In [None]:

from analysis_functions import TimeSeriesSplitIgnoreSiblings, create_test_train_splits

# rq1, ts
outcome = 'NFAandreturnwithinoneyear'
siblings = df_outcome1_before_splitting['PseudoID'] # pandas Series
# The sibling identifier is passed separately, so it is removed from the feature frame
df_outcome1_before_splitting_no_sib = df_outcome1_before_splitting.drop(columns = ['PseudoID'])

# Both helpers are user-written (analysis_functions). Per the author's notes:
# splitting into 3 (2 splits) is fine because create_test_train_splits takes the first
# n_splits-1 folds as training and the n_splits-th as test, which is ok because the
# data is not shuffled.
ts = TimeSeriesSplitIgnoreSiblings(
    n_splits = 2,
    sibling_group = siblings,
    sibling_na = "99999.0",
)
create_test_train_splits(df_outcome1_before_splitting_no_sib, ts, 'ts', outcome, siblings, 'rq1_ts_str')

In [None]:
## Check no overlapping siblings or child ids
# Check no overlapping siblings (except value denoting missing)

import pandas as pd
import pickle
import os

# Set working directory
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Structured Data\\Created") # insert [username]

# Each saved split is read inside a `with` block so the file handle is closed once
# the pickle has been loaded.
with open("../../Data for Model/X_train_rq1_ts_str.pkl", "rb") as handle:
    X_tr_ts = pickle.load(handle)
print(X_tr_ts.shape)
print(X_tr_ts.index)

with open("../../Data for Model/X_test_rq1_ts_str.pkl", "rb") as handle:
    X_test_ts = pickle.load(handle)
print(X_test_ts.shape)
print(X_test_ts.index)

# PSIDs that appear in both train and test
psid_overlap = set(X_tr_ts['PSID']).intersection(set(X_test_ts['PSID']))
print(psid_overlap)

# There should actually be 0 overlapping PSIDs as there shouldn't be any missing
# (a single shared value is tolerated to allow for the missing marker being in both)
assert len(psid_overlap) <= 1, "PSIDs shared between train and test"

with open("../../Data for Model/siblings_train_rq1_ts_str.pkl", "rb") as handle:
    siblings_tr_ts = pickle.load(handle)
print(siblings_tr_ts.shape)
print(siblings_tr_ts.index)

with open("../../Data for Model/siblings_test_rq1_ts_str.pkl", "rb") as handle:
    siblings_test_ts = pickle.load(handle)
print(siblings_test_ts.shape)
print(siblings_test_ts.index)

# PseudoIDs that appear in both train and test
sibling_overlap = set(siblings_tr_ts).intersection(set(siblings_test_ts))
print(sibling_overlap)

# There should actually be 0 overlapping PseudoIDs as there shouldn't be any missing
# (a single shared value is tolerated to allow for the missing marker being in both)
assert len(sibling_overlap) <= 1, "PseudoIDs shared between train and test"


In [None]:
from analysis_functions import TimeSeriesSplitIgnoreSiblings, create_test_train_splits

# rq2, ts
outcome = 'escalation'
siblings = df_outcome2_before_splitting['PseudoID'] # pandas Series
# The sibling identifier is passed separately, so it is removed from the feature frame
df_outcome2_before_splitting_no_sib = df_outcome2_before_splitting.drop(columns = ['PseudoID'])

# Both helpers are user-written (analysis_functions). Per the author's notes:
# splitting into 3 (2 splits) is fine because create_test_train_splits takes the first
# n_splits-1 folds as training and the n_splits-th as test, which is ok because the
# data is not shuffled.
ts = TimeSeriesSplitIgnoreSiblings(
    n_splits = 2,
    sibling_group = siblings,
    sibling_na = "99999.0",
)
create_test_train_splits(df_outcome2_before_splitting_no_sib, ts, 'ts', outcome, siblings, 'rq2_ts_str')

In [None]:
## Check no overlapping siblings or child ids
# Check no overlapping siblings (except value denoting missing)

import pandas as pd
import pickle
import os

# Set working directory
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Structured Data\\Created") # insert [username]

# Each saved split is read inside a `with` block so the file handle is closed once
# the pickle has been loaded.
with open("../../Data for Model/X_train_rq2_ts_str.pkl", "rb") as handle:
    X_tr_ts = pickle.load(handle)
print(X_tr_ts.shape)
print(X_tr_ts.index)

with open("../../Data for Model/X_test_rq2_ts_str.pkl", "rb") as handle:
    X_test_ts = pickle.load(handle)
print(X_test_ts.shape)
print(X_test_ts.index)

# PSIDs that appear in both train and test
psid_overlap = set(X_tr_ts['PSID']).intersection(set(X_test_ts['PSID']))
print(psid_overlap)

# There should actually be 0 overlapping PSIDs as there shouldn't be any missing
# (a single shared value is tolerated to allow for the missing marker being in both)
assert len(psid_overlap) <= 1, "PSIDs shared between train and test"

with open("../../Data for Model/siblings_train_rq2_ts_str.pkl", "rb") as handle:
    siblings_tr_ts = pickle.load(handle)
print(siblings_tr_ts.shape)
print(siblings_tr_ts.index)

with open("../../Data for Model/siblings_test_rq2_ts_str.pkl", "rb") as handle:
    siblings_test_ts = pickle.load(handle)
print(siblings_test_ts.shape)
print(siblings_test_ts.index)

# PseudoIDs that appear in both train and test
sibling_overlap = set(siblings_tr_ts).intersection(set(siblings_test_ts))
print(sibling_overlap)

# There should actually be 0 overlapping PseudoIDs as there shouldn't be any missing
# (a single shared value is tolerated to allow for the missing marker being in both)
assert len(sibling_overlap) <= 1, "PseudoIDs shared between train and test"

 ## Just text data

- Train, test, holdout split for list of strings data
- Run after anonymisation (3) and text feature (4) notebooks to feed into tfidf and topic modelling

In [None]:
# Deliberate stop: this assertion is always false, so a top-to-bottom run halts here.
# NOTE(review): presumably this keeps the text-data cells below from running until the
# anonymisation (3) and text feature (4) notebooks have produced their outputs - confirm.
assert 1==2

In [None]:
import pickle
import os

# Set working directory
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Text Data\\Created") # insert [username]

# Each pickle is read inside a `with` block so its file handle is closed as soon as
# loading finishes.
# NOTE: pickle.load can execute arbitrary code - only load files written by this pipeline.
with open("text_rq1_list_of_strings.pkl", "rb") as handle:
    df_text_list_of_strings_rq1 = pickle.load(handle)
print(df_text_list_of_strings_rq1.shape)
print(df_text_list_of_strings_rq1.index)
print(df_text_list_of_strings_rq1.columns)


with open("text_rq2_list_of_strings.pkl", "rb") as handle:
    df_text_list_of_strings_rq2 = pickle.load(handle)
print(df_text_list_of_strings_rq2.shape)
print(df_text_list_of_strings_rq2.index)
print(df_text_list_of_strings_rq2.columns)


In [None]:
import pickle
import os

# Set working directory
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Structured Data\\Created") # insert [username]

# Import structured data (already sorted by ReferralDatetime).
# Each pickle is read inside a `with` block so its file handle is closed as soon as
# loading finishes.
with open("df_outcome1_before_splitting.pkl", "rb") as handle:
    df_outcome1_before_splitting = pickle.load(handle)
print(df_outcome1_before_splitting.shape)


with open("df_outcome2_before_splitting.pkl", "rb") as handle:
    df_outcome2_before_splitting = pickle.load(handle)
print(df_outcome2_before_splitting.shape)

In [None]:
# Select the same rows as the cleaned structured data
import pandas as pd

# Format: the merges below join on ['PSID', 'ReferralDatetime'], and the structured data
# stores ReferralDatetime as a datetime, so the key is parsed in BOTH text datasets.
# pd.to_datetime leaves a column that is already datetime unchanged, so this is safe
# whichever dtype the text pickles were saved with.
df_text_list_of_strings_rq1['ReferralDatetime'] = pd.to_datetime(df_text_list_of_strings_rq1['ReferralDatetime'])
df_text_list_of_strings_rq2['ReferralDatetime'] = pd.to_datetime(df_text_list_of_strings_rq2['ReferralDatetime'])


# rq1 - don't need to merge into but still need to sort
# Left merge keeps exactly the rows of the structured data; duplicate keys introduced by
# the text data are then dropped (first occurrence kept) and rows ordered by month/year
df_text_list_of_strings_rq1 = pd.merge(df_outcome1_before_splitting[['PSID', 'ReferralDatetime', 'ReferralDatetime_month_year']], df_text_list_of_strings_rq1, on = ['PSID', 'ReferralDatetime'], how = 'left')
df_text_list_of_strings_rq1.drop_duplicates(subset = ['PSID', 'ReferralDatetime'], inplace = True)
df_text_list_of_strings_rq1.sort_values(by = 'ReferralDatetime_month_year', inplace = True)
print(df_text_list_of_strings_rq1.shape)

# rq2
df_text_list_of_strings_rq2 = pd.merge(df_outcome2_before_splitting[['PSID', 'ReferralDatetime', 'ReferralDatetime_month_year']], df_text_list_of_strings_rq2, on = ['PSID', 'ReferralDatetime'], how = 'left')
df_text_list_of_strings_rq2.drop_duplicates(subset = ['PSID', 'ReferralDatetime'], inplace = True)
df_text_list_of_strings_rq2.sort_values(by = 'ReferralDatetime_month_year', inplace = True)
print(df_text_list_of_strings_rq2.shape)


In [None]:
# Only want to train the LDA on training data (otherwise there's leakage from the test and holdout data)
import pickle
import os

# Set working directory
os.chdir("C:\\Users\\[username]\\Downloads\\Updated Structured Data\\Created") # insert [username]

# Each pickle is read inside a `with` block so its file handle is closed as soon as
# loading finishes.
with open("../../Data for Model/X_train_rq1_ts_str.pkl", "rb") as handle:
    X_train_rq1_ts = pickle.load(handle)
print(X_train_rq1_ts.shape)
print(X_train_rq1_ts.index)

with open("../../Data for Model/X_test_rq1_ts_str.pkl", "rb") as handle:
    X_test_rq1_ts = pickle.load(handle)
print(X_test_rq1_ts.shape)
print(X_test_rq1_ts.index)

In [None]:
# Only want to train the LDA on training data (otherwise there's leakage from the test and holdout data)

import pickle

# Each pickle is read inside a `with` block so its file handle is closed as soon as
# loading finishes. Paths are relative to the current working directory.
with open("../../Data for Model/X_train_rq1_ss_str.pkl", "rb") as handle:
    X_train_rq1_ss = pickle.load(handle)
print(X_train_rq1_ss.shape)
print(X_train_rq1_ss.index)

with open("../../Data for Model/X_test_rq1_ss_str.pkl", "rb") as handle:
    X_test_rq1_ss = pickle.load(handle)
print(X_test_rq1_ss.shape)
print(X_test_rq1_ss.index)


In [None]:
# Only want to train the LDA on training data (otherwise there's leakage from the test and holdout data)
import pickle

# Each pickle is read inside a `with` block so its file handle is closed as soon as
# loading finishes. Paths are relative to the current working directory.
with open("../../Data for Model/X_train_rq2_ts_str.pkl", "rb") as handle:
    X_train_rq2_ts = pickle.load(handle)
print(X_train_rq2_ts.shape)
print(X_train_rq2_ts.index)

with open("../../Data for Model/X_test_rq2_ts_str.pkl", "rb") as handle:
    X_test_rq2_ts = pickle.load(handle)
print(X_test_rq2_ts.shape)
print(X_test_rq2_ts.index)

In [None]:
# RQ2 stratified-shuffle train / test feature sets.
# Each pickle is read inside a `with` block so its file handle is closed as soon as
# loading finishes. Paths are relative to the current working directory.
with open("../../Data for Model/X_train_rq2_ss_str.pkl", "rb") as handle:
    X_train_rq2_ss = pickle.load(handle)
print(X_train_rq2_ss.shape)
print(X_train_rq2_ss.index)

with open("../../Data for Model/X_test_rq2_ss_str.pkl", "rb") as handle:
    X_test_rq2_ss = pickle.load(handle)
print(X_test_rq2_ss.shape)
print(X_test_rq2_ss.index)

In [None]:
# Display the column names of the RQ1 text data
# (the `text_columns` list defined in the next cell selects from columns of this frame)
df_text_list_of_strings_rq1.columns

In [None]:
# Carve the RQ1 list-of-strings text data into the same train / test sets as the
# structured data, for feeding into topic models and also modelling.
# The right rows are picked out by left-merging the key columns (PSID, ReferralDatetime)
# of each structured split onto the text data.

text_columns = (['Contact and Referral Form_text',
       'Child Social Work Assessment for Review Child Protection Conference_text_prev',
       'Child Social Work Assessment to Initial Child Protection Conference_text_prev',
       'Child Social Work Assessment_text_prev'])

rq1_text_by_split = {}
for split_name, split_keys in (('ts_train', X_train_rq1_ts),
                               ('ts_test', X_test_rq1_ts),
                               ('ss_train', X_train_rq1_ss),
                               ('ss_test', X_test_rq1_ss)):
    print(df_text_list_of_strings_rq1.index)
    split_text = pd.merge(split_keys[['PSID', 'ReferralDatetime']], df_text_list_of_strings_rq1,
                          how = 'left', on = ['PSID', 'ReferralDatetime'])
    print(split_text.shape)
    split_text.reset_index(inplace = True, drop = True)
    print(split_text.index)
    # Missing text becomes an empty string; the printed counts show how many NaNs remain
    split_text[text_columns] = split_text[text_columns].fillna('')
    print(split_text[text_columns].isna().sum())
    rq1_text_by_split[split_name] = split_text

df_text_list_of_strings_rq1_ts_train = rq1_text_by_split['ts_train']
df_text_list_of_strings_rq1_ts_test = rq1_text_by_split['ts_test']
df_text_list_of_strings_rq1_ss_train = rq1_text_by_split['ss_train']
df_text_list_of_strings_rq1_ss_test = rq1_text_by_split['ss_test']


In [None]:
# Save the RQ1 train and test text splits as pickles in the current working directory

# RQ1
rq1_text_outputs = (
    ("df_text_list_of_strings_train_rq1_ts_all.pkl", df_text_list_of_strings_rq1_ts_train),
    ("df_text_list_of_strings_test_rq1_ts_all.pkl", df_text_list_of_strings_rq1_ts_test),
    ("df_text_list_of_strings_train_rq1_ss_all.pkl", df_text_list_of_strings_rq1_ss_train),
    ("df_text_list_of_strings_test_rq1_ss_all.pkl", df_text_list_of_strings_rq1_ss_test),
)
for output_name, text_split in rq1_text_outputs:
    with open(output_name, "wb") as handle:
        pickle.dump(text_split, handle, protocol = pickle.HIGHEST_PROTOCOL)


In [None]:
# Carve the RQ2 list-of-strings text data into the same train / test sets as the
# structured data, for feeding into topic models and also modelling.
# The right rows are picked out by left-merging the key columns (PSID, ReferralDatetime)
# of each structured split onto the text data. `text_columns` comes from the RQ1 cell.

rq2_text_by_split = {}
for split_name, split_keys in (('ts_train', X_train_rq2_ts),
                               ('ts_test', X_test_rq2_ts),
                               ('ss_train', X_train_rq2_ss),
                               ('ss_test', X_test_rq2_ss)):
    print(df_text_list_of_strings_rq2.index)
    split_text = pd.merge(split_keys[['PSID', 'ReferralDatetime']], df_text_list_of_strings_rq2,
                          how = 'left', on = ['PSID', 'ReferralDatetime'])
    print(split_text.shape)
    split_text.reset_index(inplace = True, drop = True)
    print(split_text.index)
    # Missing text becomes an empty string; the printed counts show how many NaNs remain
    split_text[text_columns] = split_text[text_columns].fillna('')
    print(split_text[text_columns].isna().sum())
    rq2_text_by_split[split_name] = split_text

df_text_list_of_strings_rq2_ts_train = rq2_text_by_split['ts_train']
df_text_list_of_strings_rq2_ts_test = rq2_text_by_split['ts_test']
df_text_list_of_strings_rq2_ss_train = rq2_text_by_split['ss_train']
df_text_list_of_strings_rq2_ss_test = rq2_text_by_split['ss_test']

In [None]:
# Save the RQ2 train and test text splits as pickles in the current working directory

# RQ2
rq2_text_outputs = (
    ("df_text_list_of_strings_train_rq2_ts_all.pkl", df_text_list_of_strings_rq2_ts_train),
    ("df_text_list_of_strings_test_rq2_ts_all.pkl", df_text_list_of_strings_rq2_ts_test),
    ("df_text_list_of_strings_train_rq2_ss_all.pkl", df_text_list_of_strings_rq2_ss_train),
    ("df_text_list_of_strings_test_rq2_ss_all.pkl", df_text_list_of_strings_rq2_ss_test),
)
for output_name, text_split in rq2_text_outputs:
    with open(output_name, "wb") as handle:
        pickle.dump(text_split, handle, protocol = pickle.HIGHEST_PROTOCOL)


In [None]:
### Change data to csvs for inspection
import glob
import os
import pickle
import pandas as pd
import re

os.chdir("C:\\Users\\[username]\\Downloads\\Updated Structured Data\\Created") # insert [username]
# Every pickled text split in the working directory
filename_list = [file for file in glob.glob("df_text_list_of_strings_*.pkl")]

for file in filename_list:
    try:
        # `with` closes the handle even if unpickling fails
        with open(file, "rb") as handle:
            f = pickle.load(handle)
    except EOFError:
        # An empty or truncated pickle is reported and skipped, so that one bad file
        # does not stop the remaining files from being exported
        print("Skipping unreadable pickle: {}".format(file))
        continue
    # Strip only the trailing extension. A regex such as '.pkl' is unanchored and its
    # dot matches any character, so it could also remove text from the middle of a name.
    file_n = os.path.splitext(file)[0]
    print(file_n)
    # Written next to the pickle under the same base name
    f.to_csv('{}.csv'.format(file_n))