In [13]:
# Load dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lifelines import KaplanMeierFitter
from lifelines.plotting import add_at_risk_counts

# Set pandas option to display all columns
pd.set_option('display.max_columns', None)

##### **Licals**

In [14]:
lical1 = pd.read_csv('/Users/Apple/Downloads/All_processed_data/LICALS_Data_EDA.csv')
# print(lical1.columns)
print()
lical1['Disease_Duration'] = lical1['Study_Duration'] + lical1['Diagnostic_Delay']
# lical1.head(3)




In [15]:
lical = lical1[['subject_id', 'Event', 'Vital_capacity', 'Sex', 'Onset_site', 'Study_Arm', 
                'European',  'Age_Rand', 'Diagnostic_Delay', 'Disease_Duration', 'TRICALS']]

lical = lical.rename(columns={'Age_Rand': 'Age'})

lical = lical.copy()
lical['Study_id'] = 'lica'
lical['Expt'] = 'licals'
lical.head(3)

Unnamed: 0,subject_id,Event,Vital_capacity,Sex,Onset_site,Study_Arm,European,Age,Diagnostic_Delay,Disease_Duration,TRICALS,Study_id,Expt
0,P01001,0,107.0,Male,Limb,Placebo,1,54.762491,13.600526,32.600526,-4.921357,lica,licals
1,P01002,1,99.0,Female,Limb,Placebo,1,61.160849,20.137976,41.538765,-6.347018,lica,licals
2,P01003,0,102.0,Male,Limb,Active,1,46.702259,15.571616,34.571616,-6.373852,lica,licals


##### **Mirocals**

In [16]:
miro1 = pd.read_csv('/Users/Apple/Downloads/All_processed_data/MIROCALS_Data_EDA.csv')
# print(miro1.columns)
print()
miro1 = miro1.copy()
miro1['European'] = 1
# miro1.head(2)




In [17]:
miro = miro1[['ID', 'STATUS', 'SVCHP_INC','SEX', 'INC_SITE_ONSET','ARM_C','European','AGERANDO_C',   
                 'Diagnostic_delay','Disease_Duration', 'TRICALS']]
miro = miro.copy()
miro['Study_id'] = 'miro'
miro['Expt'] = 'mirocals'

miro = miro.rename(columns={'ID': 'subject_id', 'AGERANDO_C': 'Age', 'STATUS': 'Event', 'SVCHP_INC': 'Vital_capacity', 
                            'SEX': 'Sex', 'INC_SITE_ONSET': 'Onset_site', 'ARM_C': 'Study_Arm', 'Diagnostic_delay': 'Diagnostic_Delay'})

miro.head(3)

Unnamed: 0,subject_id,Event,Vital_capacity,Sex,Onset_site,Study_Arm,European,Age,Diagnostic_Delay,Disease_Duration,TRICALS,Study_id,Expt
0,1-62478,1.0,99.0,F,Limb,Placebo,1,70.927,15.108,29.174444,-4.550056,miro,mirocals
1,1-62479,1.0,81.0,F,Bulbar,IL2,1,65.391,15.6,24.144657,-2.93366,miro,mirocals
2,1-62480,0.0,96.0,M,Limb,IL2,1,57.024,8.904,31.308967,-5.212384,miro,mirocals


##### **Riluzole**

In [18]:
rilu1 = pd.read_csv('/Users/Apple/Downloads/All_processed_data/riluzole_combined_data_tricals.csv')
# print(rilu1.columns)
print()
# rilu1['Disease_Duration'] = rilu1['Study_Duration_months'] + rilu1['Diagnosis_delay']
# rilu1.head(3)




In [19]:
rilu = rilu1[['subject_id', 'Event', 'Vital_Capacity','Sex', 'Onset_site', 'Study_Arm','European',
                'Age', 'Diagnosis_delay',  'Disease_Duration','TRICALS_Risk_Profile','Study_id']]
rilu = rilu.copy()
rilu['Expt'] = 'riluzole'

rilu = rilu.rename(columns={'Vital_Capacity': 'Vital_capacity', 'Diagnosis_delay': 'Diagnostic_Delay', 'TRICALS_Risk_Profile': 'TRICALS'})

rilu.head(3)

Unnamed: 0,subject_id,Event,Vital_capacity,Sex,Onset_site,Study_Arm,European,Age,Diagnostic_Delay,Disease_Duration,TRICALS,Study_id,Expt
0,101,1,80.087527,1,1.0,ACTIVE,1,45,38.600526,53.613666,-6.500075,216,riluzole
1,102,1,96.864111,0,1.0,PLACEBO,1,61,21.583443,34.329829,-4.765072,216,riluzole
2,103,1,100.0,0,1.0,PLACEBO,1,60,16.557162,32.555848,-4.38857,216,riluzole


In [20]:
rilu.Sex.value_counts().to_dict()

{1: 748, 0: 534}

In [21]:
df_concat = pd.concat([lical, miro, rilu], ignore_index=True)
df_concat.head(3)

Unnamed: 0,subject_id,Event,Vital_capacity,Sex,Onset_site,Study_Arm,European,Age,Diagnostic_Delay,Disease_Duration,TRICALS,Study_id,Expt
0,P01001,0.0,107.0,Male,Limb,Placebo,1,54.762491,13.600526,32.600526,-4.921357,lica,licals
1,P01002,1.0,99.0,Female,Limb,Placebo,1,61.160849,20.137976,41.538765,-6.347018,lica,licals
2,P01003,0.0,102.0,Male,Limb,Active,1,46.702259,15.571616,34.571616,-6.373852,lica,licals


In [22]:
df_concat.shape

(1678, 13)

In [23]:
df_concat.to_csv('/Users/Apple/Downloads/All_processed_data/combined_other_datasets_for_modeling.csv', index=False)

In [24]:
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1678 entries, 0 to 1677
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   subject_id        1678 non-null   object 
 1   Event             1678 non-null   float64
 2   Vital_capacity    1678 non-null   float64
 3   Sex               1678 non-null   object 
 4   Onset_site        1678 non-null   object 
 5   Study_Arm         1678 non-null   object 
 6   European          1678 non-null   int64  
 7   Age               1678 non-null   float64
 8   Diagnostic_Delay  1678 non-null   float64
 9   Disease_Duration  1678 non-null   float64
 10  TRICALS           1678 non-null   float64
 11  Study_id          1678 non-null   object 
 12  Expt              1678 non-null   object 
dtypes: float64(6), int64(1), object(6)
memory usage: 170.6+ KB
