In [4]:
# to ignore warnings 
import warnings
warnings.filterwarnings('ignore')

# base libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# here are the models I will use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# for feature preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# For model evaluation
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# for model optimization
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from tempfile import mkdtemp

# For timings on for loops
from tqdm.notebook import tqdm, trange
import time   


In [5]:
# importing the scripts for creating X and Y variable subsets
import sys

sys.path.append("C:\\Users\\sarah\\Documents\\Brainstation\\DataScienceBootcamp\\Capstone\\FlaredownCapstone\\scripts")

from variable_splitter import variable_splitter

In [6]:
df_binary = pd.read_csv('data/chronic_agg_fibro_binary.csv', index_col = 'Unnamed: 0')

df_binary.head(5)

Unnamed: 0,ids,age,gender,country,target,total_logs,total_unique_dates_days,total_log_rate,conditions_total,fibro_comorbidities,...,nerve_pain,nerve_pain_total_logs,nerve_pain_activity,nerve_pain_unique_dates_days,nerve_pain_log_rate,menstrual_cramps,menstrual_cramps_total_logs,menstrual_cramps_activity,menstrual_cramps_unique_dates_days,menstrual_cramps_log_rate
0,14,24,female,US,1,7,1,7.0,2,0,...,0,0,0,0,0.0,0,0,0,0,0.0
1,17,33,female,US,1,18,2,9.0,3,0,...,0,0,0,0,0.0,0,0,0,0,0.0
2,78,29,female,US,1,16,3,5.333333,3,1,...,0,0,0,0,0.0,0,0,0,0,0.0
3,96,28,other,US,1,35,4,8.75,3,2,...,0,0,0,0,0.0,0,0,0,0,0.0
4,126,45,female,US,1,160,8,20.0,6,1,...,0,0,0,0,0.0,0,0,0,0,0.0


In [7]:
# Instantiate the OneHotEncoder
ohe = OneHotEncoder()
genders = pd.DataFrame(df_binary["gender"])

# Fit the OneHotEncoder to the subcategory column and transform
# It expects a 2D array, so we first convert the column into a DataFrame
gen_encoded = ohe.fit_transform(genders)
gen_encoded
dense_gen_encoded = gen_encoded.toarray()

In [8]:
ohe.categories_

[array(['female', 'male', 'other'], dtype=object)]

In [9]:
# Check

gen_check = pd.DataFrame(dense_gen_encoded, columns=ohe.categories_, dtype=int)

gen_check

Unnamed: 0,female,male,other
0,1,0,0
1,1,0,0
2,1,0,0
3,0,0,1
4,1,0,0
...,...,...,...
20708,0,1,0
20709,1,0,0
20710,1,0,0
20711,1,0,0


In [10]:
gen_check.columns = ['female', 'male', 'other'] # changing the column names and checking work

gen_check.columns

Index(['female', 'male', 'other'], dtype='object')

In [11]:
# Reset index

gen_check.index = df_binary.index

In [12]:
# Concatenating 

df_binary2 = pd.concat([df_binary,gen_check], axis=1)

df_binary2.head(2)

Unnamed: 0,ids,age,gender,country,target,total_logs,total_unique_dates_days,total_log_rate,conditions_total,fibro_comorbidities,...,nerve_pain_unique_dates_days,nerve_pain_log_rate,menstrual_cramps,menstrual_cramps_total_logs,menstrual_cramps_activity,menstrual_cramps_unique_dates_days,menstrual_cramps_log_rate,female,male,other
0,14,24,female,US,1,7,1,7.0,2,0,...,0,0.0,0,0,0,0,0.0,1,0,0
1,17,33,female,US,1,18,2,9.0,3,0,...,0,0.0,0,0,0,0,0.0,1,0,0


In [13]:
y_binary, X_all_binary, X_basic_binary, X_CS_binary_binary, X_CS_total_logs_binary, X_CS_activity_binary, X_CS_log_rate_binary, X_CS_unique_dates_binary = variable_splitter(df_binary2)

Total Condition / Symptom Binary length 0
Total logs length: 0
Median activity length: 0
Log rate length: 0
Unique dates length: 0
Total Condition / Symptom Binary length 76
Total logs length: 76
Median activity length: 76
Log rate length: 76
Unique dates length: 76


In [14]:


#Dropping colinear variables

X_CS_activity_binary2 = X_CS_activity_binary.drop(columns=['ids','conditions_total', 'symptoms_total', 'total_unique_dates_days', 'total_logs', 'total_log_rate',])



X_CS_activity_binary2.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome_activity',
       'ehlers-danlos_syndrome_activity', 'irritable_bowel_syndrome_activity',
       'osteoarthritis_activity', 'depression_activity', 'asthma_activity',
       'anxiety_activity', 'arthritis_activity', 'insomnia_activity',
       'endometriosis_activity', 'gerd_activity', 'lupus_activity',
       'generalized_anxiety_disorder_activity',
       'post-traumatic_stress_disorder_activity',
       'chronic_fatigue_syndrome_activity', 'rheumatoid_arthritis_activity',
       'polycystic_ovary_syndrome_(pcos)_activity', 'stomach_pain_activity',
       'migraine_activity', 'stomach_cramps_activity',
       'subluxation_or_dislocation_activity', 'back_pain_activity',
       'headache_activity', 'muscle_pain_activity', 'shoulder_pain_activity',
       'fatigue_activity', 'bloating_activity', 'abdominal_pain_activity',
       'knee_pain_activity', 'brain_fog_activity', 'chronic_pa

In [15]:


#Dropping colinear variables

X_CS_activity_binary2 = X_CS_activity_binary.drop(columns=['ids','conditions_total', 'symptoms_total', 'total_unique_dates_days', 'total_logs', 'total_log_rate',])



X_CS_activity_binary2.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome_activity',
       'ehlers-danlos_syndrome_activity', 'irritable_bowel_syndrome_activity',
       'osteoarthritis_activity', 'depression_activity', 'asthma_activity',
       'anxiety_activity', 'arthritis_activity', 'insomnia_activity',
       'endometriosis_activity', 'gerd_activity', 'lupus_activity',
       'generalized_anxiety_disorder_activity',
       'post-traumatic_stress_disorder_activity',
       'chronic_fatigue_syndrome_activity', 'rheumatoid_arthritis_activity',
       'polycystic_ovary_syndrome_(pcos)_activity', 'stomach_pain_activity',
       'migraine_activity', 'stomach_cramps_activity',
       'subluxation_or_dislocation_activity', 'back_pain_activity',
       'headache_activity', 'muscle_pain_activity', 'shoulder_pain_activity',
       'fatigue_activity', 'bloating_activity', 'abdominal_pain_activity',
       'knee_pain_activity', 'brain_fog_activity', 'chronic_pa

In [16]:
# Creating the remainder and test sets all variables
X_remainder_AcBinary, X_test_AcBinary, y_remainder_AcBinary, y_test_AcBinary = train_test_split(X_CS_activity_binary2, y_binary, test_size=0.3, random_state=42, stratify=y_binary)

In [17]:
# scaling remainder and test

Binary_Scaler2 = StandardScaler()

Binary_Scaler2.fit(X_remainder_AcBinary)

X_remainder_AcBinary_scaledFin = Binary_Scaler2.transform(X_remainder_AcBinary)

X_test_AcBinary_scaledFin = Binary_Scaler2.transform(X_test_AcBinary)

In [18]:
np.arange(18, 70, step=1)

array([18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69])

In [19]:
X_remainder_AcBinary.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome_activity',
       'ehlers-danlos_syndrome_activity', 'irritable_bowel_syndrome_activity',
       'osteoarthritis_activity', 'depression_activity', 'asthma_activity',
       'anxiety_activity', 'arthritis_activity', 'insomnia_activity',
       'endometriosis_activity', 'gerd_activity', 'lupus_activity',
       'generalized_anxiety_disorder_activity',
       'post-traumatic_stress_disorder_activity',
       'chronic_fatigue_syndrome_activity', 'rheumatoid_arthritis_activity',
       'polycystic_ovary_syndrome_(pcos)_activity', 'stomach_pain_activity',
       'migraine_activity', 'stomach_cramps_activity',
       'subluxation_or_dislocation_activity', 'back_pain_activity',
       'headache_activity', 'muscle_pain_activity', 'shoulder_pain_activity',
       'fatigue_activity', 'bloating_activity', 'abdominal_pain_activity',
       'knee_pain_activity', 'brain_fog_activity', 'chronic_pa

Re-testing the model on train and validation:

In [20]:
import pickle 

LR_binary_final = LogisticRegression(C=1, solver='saga', penalty='l1')

LR_binary_final.fit(X_remainder_AcBinary_scaledFin, y_remainder_AcBinary)

pickle.dump(LR_binary_final, open('binary_finalA.pkl', 'wb'))

LR_binary_final.score(X_remainder_AcBinary_scaledFin, y_remainder_AcBinary)

0.7166701151803573

In [21]:


#Dropping colinear variables

X_CS_activity_binary2 = X_CS_activity_binary.drop(columns=['ids','conditions_total', 'symptoms_total', 'total_unique_dates_days', 'total_logs', 'total_log_rate',])



X_CS_activity_binary2.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome_activity',
       'ehlers-danlos_syndrome_activity', 'irritable_bowel_syndrome_activity',
       'osteoarthritis_activity', 'depression_activity', 'asthma_activity',
       'anxiety_activity', 'arthritis_activity', 'insomnia_activity',
       'endometriosis_activity', 'gerd_activity', 'lupus_activity',
       'generalized_anxiety_disorder_activity',
       'post-traumatic_stress_disorder_activity',
       'chronic_fatigue_syndrome_activity', 'rheumatoid_arthritis_activity',
       'polycystic_ovary_syndrome_(pcos)_activity', 'stomach_pain_activity',
       'migraine_activity', 'stomach_cramps_activity',
       'subluxation_or_dislocation_activity', 'back_pain_activity',
       'headache_activity', 'muscle_pain_activity', 'shoulder_pain_activity',
       'fatigue_activity', 'bloating_activity', 'abdominal_pain_activity',
       'knee_pain_activity', 'brain_fog_activity', 'chronic_pa

In [22]:


#Dropping colinear variables

X_CS_binary_binary2 = X_CS_binary_binary.drop(columns=['ids','conditions_total', 'symptoms_total', 'total_unique_dates_days', 'total_logs', 'total_log_rate',])



X_CS_binary_binary2.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome', 'ehlers-danlos_syndrome',
       'irritable_bowel_syndrome', 'osteoarthritis', 'depression', 'asthma',
       'anxiety', 'arthritis', 'insomnia', 'endometriosis', 'gerd', 'lupus',
       'generalized_anxiety_disorder', 'post-traumatic_stress_disorder',
       'chronic_fatigue_syndrome', 'rheumatoid_arthritis',
       'polycystic_ovary_syndrome_(pcos)', 'stomach_pain', 'migraine',
       'stomach_cramps', 'subluxation_or_dislocation', 'back_pain', 'headache',
       'muscle_pain', 'shoulder_pain', 'fatigue', 'bloating', 'abdominal_pain',
       'knee_pain', 'brain_fog', 'chronic_pain', 'gas', 'weakness',
       'constipation', 'neck_pain', 'diarrhea', 'jaw_pain', 'joint_pain',
       'dizziness_or_vertigo', 'light_sensitivity', 'palpitations',
       'stiffness', 'sore_throat', 'shortness_of_breath', 'irritability',
       'nausea', 'difficulty_concentrating', 'chest_pain',
      

In [23]:
# Creating the remainder and test sets all variables
X_remainder_Binary, X_test_Binary, y_remainder_Binary, y_test_Binary = train_test_split(X_CS_binary_binary2, y_binary, test_size=0.3, random_state=42, stratify=y_binary)

In [24]:
X_remainder_Binary.head(1).to_csv('binary_df.csv')

In [25]:
# scaling remainder and test

Binary_Scaler22 = StandardScaler()

Binary_Scaler22.fit(X_remainder_Binary)

X_remainder_Binary_scaledFin = Binary_Scaler22.transform(X_remainder_Binary)

X_test_Binary_scaledFin = Binary_Scaler22.transform(X_test_Binary)

In [26]:
np.arange(18, 70, step=1)

array([18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69])

In [27]:
X_remainder_Binary.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome', 'ehlers-danlos_syndrome',
       'irritable_bowel_syndrome', 'osteoarthritis', 'depression', 'asthma',
       'anxiety', 'arthritis', 'insomnia', 'endometriosis', 'gerd', 'lupus',
       'generalized_anxiety_disorder', 'post-traumatic_stress_disorder',
       'chronic_fatigue_syndrome', 'rheumatoid_arthritis',
       'polycystic_ovary_syndrome_(pcos)', 'stomach_pain', 'migraine',
       'stomach_cramps', 'subluxation_or_dislocation', 'back_pain', 'headache',
       'muscle_pain', 'shoulder_pain', 'fatigue', 'bloating', 'abdominal_pain',
       'knee_pain', 'brain_fog', 'chronic_pain', 'gas', 'weakness',
       'constipation', 'neck_pain', 'diarrhea', 'jaw_pain', 'joint_pain',
       'dizziness_or_vertigo', 'light_sensitivity', 'palpitations',
       'stiffness', 'sore_throat', 'shortness_of_breath', 'irritability',
       'nausea', 'difficulty_concentrating', 'chest_pain',
      

In [28]:
Binary_Conditions = list(('postural_orthostatic_tachycardia_syndrome', 'ehlers-danlos_syndrome',
       'irritable_bowel_syndrome', 'osteoarthritis', 'depression', 'asthma',
       'anxiety', 'arthritis', 'insomnia', 'endometriosis', 'gerd', 'lupus',
       'generalized_anxiety_disorder', 'post-traumatic_stress_disorder',
       'chronic_fatigue_syndrome', 'rheumatoid_arthritis',
       'polycystic_ovary_syndrome_(pcos)'))

CLEAN = list()

for cond in Binary_Conditions:
    cond = cond.replace("_", " ")
    CLEAN.append(cond)

CLEAN.sort()

CLEAN

['anxiety',
 'arthritis',
 'asthma',
 'chronic fatigue syndrome',
 'depression',
 'ehlers-danlos syndrome',
 'endometriosis',
 'generalized anxiety disorder',
 'gerd',
 'insomnia',
 'irritable bowel syndrome',
 'lupus',
 'osteoarthritis',
 'polycystic ovary syndrome (pcos)',
 'post-traumatic stress disorder',
 'postural orthostatic tachycardia syndrome',
 'rheumatoid arthritis']

In [29]:
Binary_SYMPS = list(('stomach_pain', 'migraine',
       'stomach_cramps', 'subluxation_or_dislocation', 'back_pain', 'headache',
       'muscle_pain', 'shoulder_pain', 'fatigue', 'bloating', 'abdominal_pain',
       'knee_pain', 'brain_fog', 'chronic_pain', 'gas', 'weakness',
       'constipation', 'neck_pain', 'diarrhea', 'jaw_pain', 'joint_pain',
       'dizziness_or_vertigo', 'light_sensitivity', 'palpitations',
       'stiffness', 'sore_throat', 'shortness_of_breath', 'irritability',
       'nausea', 'difficulty_concentrating', 'chest_pain',
       'numbness_and_tingling', 'acid_reflux', 'swelling', 'lightheadedness',
       'hand_pain', 'muscle_ache', 'tachycardia', 'hip_pain', 'muscle_spasm',
       'pelvic_pain', 'vomiting', 'foot_pain', 'rash', 'sweating', 'arm_pain',
       'leg_pain', 'nerve_pain', 'menstrual_cramps'))

CLEAN2 = list()

for cond in Binary_SYMPS:
    cond = cond.replace("_", " ")
    CLEAN2.append(cond)

CLEAN2.sort()

CLEAN2

['abdominal pain',
 'acid reflux',
 'arm pain',
 'back pain',
 'bloating',
 'brain fog',
 'chest pain',
 'chronic pain',
 'constipation',
 'diarrhea',
 'difficulty concentrating',
 'dizziness or vertigo',
 'fatigue',
 'foot pain',
 'gas',
 'hand pain',
 'headache',
 'hip pain',
 'irritability',
 'jaw pain',
 'joint pain',
 'knee pain',
 'leg pain',
 'light sensitivity',
 'lightheadedness',
 'menstrual cramps',
 'migraine',
 'muscle ache',
 'muscle pain',
 'muscle spasm',
 'nausea',
 'neck pain',
 'nerve pain',
 'numbness and tingling',
 'palpitations',
 'pelvic pain',
 'rash',
 'shortness of breath',
 'shoulder pain',
 'sore throat',
 'stiffness',
 'stomach cramps',
 'stomach pain',
 'subluxation or dislocation',
 'sweating',
 'swelling',
 'tachycardia',
 'vomiting',
 'weakness']

In [30]:


#Dropping colinear variables

X_CS_activity_binary2 = X_CS_activity_binary.drop(columns=['ids','conditions_total', 'symptoms_total', 'total_unique_dates_days', 'total_logs', 'total_log_rate',])



X_CS_activity_binary2.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome_activity',
       'ehlers-danlos_syndrome_activity', 'irritable_bowel_syndrome_activity',
       'osteoarthritis_activity', 'depression_activity', 'asthma_activity',
       'anxiety_activity', 'arthritis_activity', 'insomnia_activity',
       'endometriosis_activity', 'gerd_activity', 'lupus_activity',
       'generalized_anxiety_disorder_activity',
       'post-traumatic_stress_disorder_activity',
       'chronic_fatigue_syndrome_activity', 'rheumatoid_arthritis_activity',
       'polycystic_ovary_syndrome_(pcos)_activity', 'stomach_pain_activity',
       'migraine_activity', 'stomach_cramps_activity',
       'subluxation_or_dislocation_activity', 'back_pain_activity',
       'headache_activity', 'muscle_pain_activity', 'shoulder_pain_activity',
       'fatigue_activity', 'bloating_activity', 'abdominal_pain_activity',
       'knee_pain_activity', 'brain_fog_activity', 'chronic_pa

In [31]:


#Dropping colinear variables

X_CS_activity_binary2 = X_CS_activity_binary.drop(columns=['ids','conditions_total', 'symptoms_total', 'total_unique_dates_days', 'total_logs', 'total_log_rate',])



X_CS_activity_binary2.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome_activity',
       'ehlers-danlos_syndrome_activity', 'irritable_bowel_syndrome_activity',
       'osteoarthritis_activity', 'depression_activity', 'asthma_activity',
       'anxiety_activity', 'arthritis_activity', 'insomnia_activity',
       'endometriosis_activity', 'gerd_activity', 'lupus_activity',
       'generalized_anxiety_disorder_activity',
       'post-traumatic_stress_disorder_activity',
       'chronic_fatigue_syndrome_activity', 'rheumatoid_arthritis_activity',
       'polycystic_ovary_syndrome_(pcos)_activity', 'stomach_pain_activity',
       'migraine_activity', 'stomach_cramps_activity',
       'subluxation_or_dislocation_activity', 'back_pain_activity',
       'headache_activity', 'muscle_pain_activity', 'shoulder_pain_activity',
       'fatigue_activity', 'bloating_activity', 'abdominal_pain_activity',
       'knee_pain_activity', 'brain_fog_activity', 'chronic_pa

In [32]:
# Creating the remainder and test sets all variables
X_remainder_AcBinary, X_test_AcBinary, y_remainder_AcBinary, y_test_AcBinary = train_test_split(X_CS_activity_binary2, y_binary, test_size=0.3, random_state=42, stratify=y_binary)

In [57]:
# scaling remainder and test

Binary_Scaler2 = StandardScaler()

Binary_Scaler2.fit(X_remainder_AcBinary)

X_remainder_AcBinary_scaledFin = Binary_Scaler2.transform(X_remainder_AcBinary)

X_test_AcBinary_scaledFin = Binary_Scaler2.transform(X_test_AcBinary)

In [61]:
X_remainder_AcBinary_scaledFin

array([[-0.3244509 ,  0.32159704, -0.20449782, ..., -0.18589493,
        -0.17245698, -0.11913086],
       [ 1.24101121,  0.32159704, -0.20449782, ..., -0.18589493,
        -0.17245698, -0.11913086],
       [ 0.06691462,  0.32159704, -0.20449782, ...,  3.84595347,
        -0.17245698, -0.11913086],
       ...,
       [ 3.09999747, -3.10948133, -0.20449782, ..., -0.18589493,
        -0.17245698, -0.11913086],
       [-0.91149919, -3.10948133,  4.89002772, ..., -0.18589493,
        -0.17245698, -0.11913086],
       [-1.10718196,  0.32159704, -0.20449782, ..., -0.18589493,
        -0.17245698, -0.11913086]])

In [34]:
np.arange(18, 70, step=1)

array([18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69])

In [35]:
X_remainder_AcBinary.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome_activity',
       'ehlers-danlos_syndrome_activity', 'irritable_bowel_syndrome_activity',
       'osteoarthritis_activity', 'depression_activity', 'asthma_activity',
       'anxiety_activity', 'arthritis_activity', 'insomnia_activity',
       'endometriosis_activity', 'gerd_activity', 'lupus_activity',
       'generalized_anxiety_disorder_activity',
       'post-traumatic_stress_disorder_activity',
       'chronic_fatigue_syndrome_activity', 'rheumatoid_arthritis_activity',
       'polycystic_ovary_syndrome_(pcos)_activity', 'stomach_pain_activity',
       'migraine_activity', 'stomach_cramps_activity',
       'subluxation_or_dislocation_activity', 'back_pain_activity',
       'headache_activity', 'muscle_pain_activity', 'shoulder_pain_activity',
       'fatigue_activity', 'bloating_activity', 'abdominal_pain_activity',
       'knee_pain_activity', 'brain_fog_activity', 'chronic_pa

Re-testing the model on train and validation:

In [36]:
import pickle 

LR_binary_final = LogisticRegression(C=1, solver='saga', penalty='l1')

LR_binary_final.fit(X_remainder_AcBinary_scaledFin, y_remainder_AcBinary)

pickle.dump(LR_binary_final, open('binary_finalA.pkl', 'wb'))

LR_binary_final.score(X_remainder_AcBinary_scaledFin, y_remainder_AcBinary)

0.7166701151803573

Re-testing the model on train and validation:

In [37]:
import pickle 

LR_binary_final2 = LogisticRegression(C=1, solver='saga', penalty='l1')

LR_binary_final2.fit(X_remainder_Binary_scaledFin, y_remainder_Binary)

pickle.dump(LR_binary_final2, open('binary_final.pkl', 'wb'))

LR_binary_final.score(X_remainder_Binary_scaledFin, y_remainder_Binary)

0.7175667287399131

In [62]:
X_remainder_Binary_scaledFin

array([[-0.3244509 ,  0.32159704, -0.20449782, ..., -0.21559549,
        -0.20302913, -0.19402069],
       [ 1.24101121,  0.32159704, -0.20449782, ..., -0.21559549,
        -0.20302913, -0.19402069],
       [ 0.06691462,  0.32159704, -0.20449782, ...,  4.63831598,
        -0.20302913, -0.19402069],
       ...,
       [ 3.09999747, -3.10948133, -0.20449782, ..., -0.21559549,
        -0.20302913, -0.19402069],
       [-0.91149919, -3.10948133,  4.89002772, ..., -0.21559549,
        -0.20302913, -0.19402069],
       [-1.10718196,  0.32159704, -0.20449782, ..., -0.21559549,
        -0.20302913, -0.19402069]])

In [38]:
df_multi = pd.read_csv('data/chronic_agg_multiclass.csv', index_col = 'Unnamed: 0')

df_multi.head(5)

Unnamed: 0,ids,age,gender,country,target,total_logs,total_unique_dates_days,total_log_rate,conditions_total,fibro_comorbidities,...,mouth_ulcers,mouth_ulcers_total_logs,mouth_ulcers_activity,mouth_ulcers_unique_dates_days,mouth_ulcers_log_rate,blood_pooling,blood_pooling_total_logs,blood_pooling_activity,blood_pooling_unique_dates_days,blood_pooling_log_rate
0,16,42,male,CL,3,5,1,5.0,2,0,...,0,0,0,0,0.0,0,0,0,0,0.0
1,37,46,female,US,3,13,5,2.6,1,1,...,0,0,0,0,0.0,0,0,0,0,0.0
2,137,25,female,SE,3,80,8,10.0,4,0,...,0,0,0,0,0.0,0,0,0,0,0.0
3,140,20,female,US,3,12,1,12.0,1,0,...,0,0,0,0,0.0,0,0,0,0,0.0
4,322,31,female,US,3,42,6,7.0,1,0,...,0,0,0,0,0.0,0,0,0,0,0.0


In [39]:
# Instantiate the OneHotEncoder
oheM = OneHotEncoder()
gendersM = pd.DataFrame(df_multi["gender"])

# Fit the OneHotEncoder to the subcategory column and transform
# It expects a 2D array, so we first convert the column into a DataFrame
gen_encodedM = oheM.fit_transform(gendersM)
gen_encodedM
dense_gen_encodedM = gen_encodedM.toarray()

In [40]:
oheM.categories_ # looking at the categories in the OHE

[array(['female', 'male', 'other'], dtype=object)]

In [41]:
# Check the result as a dataframe

gen_checkM = pd.DataFrame(dense_gen_encodedM, columns=oheM.categories_, dtype=int)

gen_checkM

Unnamed: 0,female,male,other
0,0,1,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
5102,1,0,0
5103,1,0,0
5104,1,0,0
5105,1,0,0


In [42]:
gen_checkM.columns = ['female', 'male', 'other'] # rename the columns

gen_checkM.columns # checking the work

Index(['female', 'male', 'other'], dtype='object')

In [43]:
# Reset index 

gen_checkM.index = df_multi.index

In [44]:
# Concatenating with the original dataframe

df_multi2 = pd.concat([df_multi,gen_checkM], axis=1)

df_multi2.head(2)

Unnamed: 0,ids,age,gender,country,target,total_logs,total_unique_dates_days,total_log_rate,conditions_total,fibro_comorbidities,...,mouth_ulcers_unique_dates_days,mouth_ulcers_log_rate,blood_pooling,blood_pooling_total_logs,blood_pooling_activity,blood_pooling_unique_dates_days,blood_pooling_log_rate,female,male,other
0,16,42,male,CL,3,5,1,5.0,2,0,...,0,0.0,0,0,0,0,0.0,0,1,0
1,37,46,female,US,3,13,5,2.6,1,1,...,0,0.0,0,0,0,0,0.0,1,0,0


In [45]:
# Creating the data variable splits

y_multi, X_all_multi, X_basic_multi, X_CS_binary_multi, X_CS_total_logs_multi, X_CS_activity_multi, X_CS_log_rate_multi, X_CS_unique_dates_multi = variable_splitter(df_multi2)

Total Condition / Symptom Binary length 0
Total logs length: 0
Median activity length: 0
Log rate length: 0
Unique dates length: 0
Total Condition / Symptom Binary length 99
Total logs length: 99
Median activity length: 99
Log rate length: 99
Unique dates length: 99


In [46]:
X_CS_binary_multi.columns

Index(['ids', 'age', 'female', 'other', 'total_logs',
       'total_unique_dates_days', 'total_log_rate', 'conditions_total',
       'symptoms_total', 'fibro_comorbidities', 'hypothyroidism', 'insomnia',
       'depression', 'ehlers-danlos_syndrome', 'gerd', 'psoriasis',
       'irritable_bowel_syndrome', 'endometriosis',
       'post-traumatic_stress_disorder', 'fibromyalgia',
       'chronic_fatigue_syndrome', 'osteoarthritis',
       'generalized_anxiety_disorder', 'sjogren's_syndrome', 'arthritis',
       'anxiety', 'asthma', 'polycystic_ovary_syndrome_(pcos)', 'allergies',
       'raynaud's_disease', 'gastroparesis', 'mast_cell_activation_syndrome',
       'myalgic_encephalomyelitis', 'dysautonomia', 'headache', 'fatigue',
       'stomach_pain', 'chronic_pain', 'numbness_and_tingling',
       'dizziness_or_vertigo', 'joint_pain', 'hand_pain', 'foot_pain',
       'inflammation', 'syncope', 'migraine', 'vomiting', 'stomach_cramps',
       'nausea', 'back_pain', 'brain_fog', 'swellin

In [48]:
#Dropping colinear variables

X_CS_binary_multi2 = X_CS_binary_multi.drop(columns=['ids', 'conditions_total', 'symptoms_total', 'total_unique_dates_days', 'fibro_comorbidities'])

X_CS_binary_multi2.columns

Index(['age', 'female', 'other', 'total_logs', 'total_log_rate',
       'hypothyroidism', 'insomnia', 'depression', 'ehlers-danlos_syndrome',
       'gerd', 'psoriasis', 'irritable_bowel_syndrome', 'endometriosis',
       'post-traumatic_stress_disorder', 'fibromyalgia',
       'chronic_fatigue_syndrome', 'osteoarthritis',
       'generalized_anxiety_disorder', 'sjogren's_syndrome', 'arthritis',
       'anxiety', 'asthma', 'polycystic_ovary_syndrome_(pcos)', 'allergies',
       'raynaud's_disease', 'gastroparesis', 'mast_cell_activation_syndrome',
       'myalgic_encephalomyelitis', 'dysautonomia', 'headache', 'fatigue',
       'stomach_pain', 'chronic_pain', 'numbness_and_tingling',
       'dizziness_or_vertigo', 'joint_pain', 'hand_pain', 'foot_pain',
       'inflammation', 'syncope', 'migraine', 'vomiting', 'stomach_cramps',
       'nausea', 'back_pain', 'brain_fog', 'swelling', 'stiffness',
       'constipation', 'bloating', 'lightheadedness', 'elbow_pain', 'hip_pain',
       'rash

In [49]:
# Creating the remainder and test sets all variables
X_remainder_MBinary, X_test_MBinary, y_remainder_MBinary, y_test_MBinary = train_test_split(X_CS_binary_multi2, y_multi, test_size=0.3, random_state=42, stratify=y_multi)

In [50]:
X_remainder_MBinary.head(1).to_csv('data/multi_df.csv')

In [51]:
user_binary_df = pd.read_csv('data/binary_df.csv', index_col='Unnamed: 0')
user_binary_df.reset_index(inplace=True)
user_binary_df.drop(columns=['index'], inplace=True)

user_binary_df.loc[:,:] = 0

user_binary_df

Unnamed: 0,age,female,other,fibro_comorbidities,postural_orthostatic_tachycardia_syndrome,ehlers-danlos_syndrome,irritable_bowel_syndrome,osteoarthritis,depression,asthma,...,muscle_spasm,pelvic_pain,vomiting,foot_pain,rash,sweating,arm_pain,leg_pain,nerve_pain,menstrual_cramps
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
comorbs = list(('anxiety', 'depression', 'migraine', 'chronic_fatigue_syndrome'))
comorb_act = 0

for co in comorbs:
    if user_binary_df.loc[0, co] == 1:
        comorb_act = comorb_act + 1

print(comorb_act)

0


In [53]:
# Creating the remainder and test sets all variables
X_train_MBinary, X_validation_MBinary, y_train_MBinary, y_validation_MBinary = train_test_split(X_remainder_MBinary, y_remainder_MBinary, test_size=0.3, random_state=42, stratify=y_remainder_MBinary)

In [54]:
X_remainder_AcBinary.columns

Index(['age', 'female', 'other', 'fibro_comorbidities',
       'postural_orthostatic_tachycardia_syndrome_activity',
       'ehlers-danlos_syndrome_activity', 'irritable_bowel_syndrome_activity',
       'osteoarthritis_activity', 'depression_activity', 'asthma_activity',
       'anxiety_activity', 'arthritis_activity', 'insomnia_activity',
       'endometriosis_activity', 'gerd_activity', 'lupus_activity',
       'generalized_anxiety_disorder_activity',
       'post-traumatic_stress_disorder_activity',
       'chronic_fatigue_syndrome_activity', 'rheumatoid_arthritis_activity',
       'polycystic_ovary_syndrome_(pcos)_activity', 'stomach_pain_activity',
       'migraine_activity', 'stomach_cramps_activity',
       'subluxation_or_dislocation_activity', 'back_pain_activity',
       'headache_activity', 'muscle_pain_activity', 'shoulder_pain_activity',
       'fatigue_activity', 'bloating_activity', 'abdominal_pain_activity',
       'knee_pain_activity', 'brain_fog_activity', 'chronic_pa

In [55]:
# scaling remainder and test

Multi_Scaler2 = StandardScaler() # initialize

Multi_Scaler2.fit(X_remainder_MBinary) # fit

X_remainder_MB_scaled = Multi_Scaler2.transform(X_remainder_MBinary) # transform

X_test_MB_scaled = Multi_Scaler2.transform(X_test_MBinary)

Re-testing the model on train and validation:

In [56]:
import pickle
LR_multi_final = LogisticRegression(C=0.01, solver='saga', penalty='l2') # initialize

LR_multi_final.fit(X_remainder_MB_scaled, y_remainder_MBinary) # fit

pickle.dump(LR_multi_final, open('Multi_final.pkl', 'wb'))

LR_multi_final.score(X_remainder_MB_scaled, y_remainder_MBinary) #score

0.7806379406827084