# Data Loading and Preparation

## Data Loading

In [3]:
#Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

In [4]:
# Read the cleaned TB admissions workbook into a DataFrame and preview the first two rows.
path = '/content/4A_TB_cleaned1.xlsx'
data = pd.read_excel(io=path)
data.head(n=2)

Unnamed: 0,UNIT,IP NUMBER,SEX,AGE,RELIGION,DISTRICT OF RESIDENCE,WORKING DIAGNOSIS,OUT COME,NUMBER OF PREVIOUS ADMISSIONS,TB_type,...,DAY OF ADMISSION,YEAR OF ADMISSION,DURATION,Survival_Week 1,Survival_Week 2,Survival_Week 3,Survival_Week 4,REGION,ADDITIONAL WORKING DIAGNOSIS,NUMBER OF ADDITIONAL WORKING DIAGNOSIS
0,GI,1861416,M,30,COU,Nakasongola,"ISS,Abdominal TB,,",IMPROVED,5,Abdominal_TB,...,Tuesday,2010,6,0,0,0,0,CENTRAL,"ISS,,",1
1,GI,1868011,F,46,COU,Kampala,"ISS Stage I&II,Disseminated TB,,",IMPROVED,3,Disseminated TB,...,Monday,2010,4,0,0,0,0,CENTRAL,"ISS Stage I&II,,",1


# **Data Preprocessing**

In [5]:
# Remove the columns that are excluded from the encoded dataset.
# `data` is overwritten by its own reduced version, so without errors='ignore'
# a second execution of this cell would raise KeyError on the columns that the
# first execution already removed. Trade-off: a misspelled column name in the
# list is skipped silently instead of raising.
col_to_remove = ['DURATION', 'OUT COME', 'IP NUMBER','DISTRICT OF RESIDENCE','UNIT',
                 'ADDITIONAL WORKING DIAGNOSIS','WORKING DIAGNOSIS', 'MARITAL STATUS']
data = data.drop(columns=col_to_remove, errors='ignore')

In [6]:
# One-hot encode the admission year: each distinct year becomes its own
# indicator column named 'YEAR OF ADMISSION_<year>'.
df_encoded = pd.get_dummies(data=data, columns=['YEAR OF ADMISSION'])

# Preview the first five rows of the encoded frame.
df_encoded.head(n=5)

Unnamed: 0,SEX,AGE,RELIGION,NUMBER OF PREVIOUS ADMISSIONS,TB_type,MONTH OF ADMISSION,DAY OF ADMISSION,Survival_Week 1,Survival_Week 2,Survival_Week 3,...,YEAR OF ADMISSION_2010,YEAR OF ADMISSION_2011,YEAR OF ADMISSION_2012,YEAR OF ADMISSION_2013,YEAR OF ADMISSION_2014,YEAR OF ADMISSION_2015,YEAR OF ADMISSION_2016,YEAR OF ADMISSION_2017,YEAR OF ADMISSION_2018,YEAR OF ADMISSION_2019
0,M,30,COU,5,Abdominal_TB,November,Tuesday,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,F,46,COU,3,Disseminated TB,December,Monday,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,M,80,ISLAM,7,Abdominal_TB,December,Wednesday,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,M,22,CATHOLIC,8,Abdominal_TB,December,Monday,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,F,18,COU,4,PTB,January,Thursday,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [8]:
# One-hot encode every remaining text (object-dtype) column.
# The listed columns are set aside first, so only the other columns are
# inspected when looking for object dtypes.
non_features = [
    'NUMBER OF ADDITIONAL WORKING DIAGNOSIS',
    'AGE',
    'NUMBER OF PREVIOUS ADMISSIONS',
    'Survival_Week 1',
    'Survival_Week 2',
    'Survival_Week 3',
    'Survival_Week 4',
]
data2 = df_encoded.drop(non_features, axis=1)

text_columns = data2.select_dtypes(include='object').columns

# Encode on the full frame so the set-aside columns are carried through unchanged.
data_enc = pd.get_dummies(data=df_encoded, columns=text_columns)

In [9]:
# Preview four randomly chosen rows of the encoded frame.
data_enc.sample(n=4)

Unnamed: 0,AGE,NUMBER OF PREVIOUS ADMISSIONS,Survival_Week 1,Survival_Week 2,Survival_Week 3,Survival_Week 4,NUMBER OF ADDITIONAL WORKING DIAGNOSIS,YEAR OF ADMISSION_2010,YEAR OF ADMISSION_2011,YEAR OF ADMISSION_2012,...,DAY OF ADMISSION_Saturday,DAY OF ADMISSION_Sunday,DAY OF ADMISSION_Thursday,DAY OF ADMISSION_Tuesday,DAY OF ADMISSION_Wednesday,REGION_CENTRAL,REGION_EASTERN,REGION_NORTHERN,REGION_UNKNOWN,REGION_WESTERN
6987,36,0,1,1,1,1,2,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4974,24,9,0,0,0,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0
3082,40,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
21197,24,0,0,1,1,1,2,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [11]:
# Summarise the encoded frame: column names, non-null counts and dtypes.
data_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21894 entries, 0 to 21893
Data columns (total 54 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   AGE                                     21894 non-null  int64
 1   NUMBER OF PREVIOUS ADMISSIONS           21894 non-null  int64
 2   Survival_Week 1                         21894 non-null  int64
 3   Survival_Week 2                         21894 non-null  int64
 4   Survival_Week 3                         21894 non-null  int64
 5   Survival_Week 4                         21894 non-null  int64
 6   NUMBER OF ADDITIONAL WORKING DIAGNOSIS  21894 non-null  int64
 7   YEAR OF ADMISSION_2010                  21894 non-null  uint8
 8   YEAR OF ADMISSION_2011                  21894 non-null  uint8
 9   YEAR OF ADMISSION_2012                  21894 non-null  uint8
 10  YEAR OF ADMISSION_2013                  21894 non-null  uint8
 11  YEAR OF ADMISSI

## Normalization and Visualization

In [12]:
#Transforming the continuous features
# The first log1p pass reads the raw values from `df_encoded` (not modified by
# any cell shown here) instead of from `data_enc` itself. Reading and writing
# the same `data_enc` columns made the cell non-idempotent: each re-run stacked
# another log1p pass, and a re-run after the scaling cell logged scaled values.
# Assignment aligns on the row index, which `data_enc` inherits from `df_encoded`.
transform_features = ['AGE','NUMBER OF PREVIOUS ADMISSIONS', 'NUMBER OF ADDITIONAL WORKING DIAGNOSIS']
for col in transform_features:
    data_enc[col] = np.log1p(df_encoded[col])

#Second transformation
# A second log1p pass on this column only; it starts from the value freshly
# written by the loop above, so the cell as a whole stays re-runnable.
data_enc['NUMBER OF PREVIOUS ADMISSIONS'] = np.log1p(data_enc['NUMBER OF PREVIOUS ADMISSIONS'])


In [13]:
# Min-max scale the continuous features and write the scaled values back
# into the same columns of `data_enc`.
normalise_features = [
    'AGE',
    'NUMBER OF PREVIOUS ADMISSIONS',
    'NUMBER OF ADDITIONAL WORKING DIAGNOSIS',
]
scaler = MinMaxScaler()
# The scaler is fitted on every row of `data_enc`.
scaler.fit(data_enc[normalise_features])
data_enc[normalise_features] = scaler.transform(data_enc[normalise_features])


In [14]:
# Preview five randomly chosen rows after transformation and scaling.
data_enc.sample(n=5)

Unnamed: 0,AGE,NUMBER OF PREVIOUS ADMISSIONS,Survival_Week 1,Survival_Week 2,Survival_Week 3,Survival_Week 4,NUMBER OF ADDITIONAL WORKING DIAGNOSIS,YEAR OF ADMISSION_2010,YEAR OF ADMISSION_2011,YEAR OF ADMISSION_2012,...,DAY OF ADMISSION_Saturday,DAY OF ADMISSION_Sunday,DAY OF ADMISSION_Thursday,DAY OF ADMISSION_Tuesday,DAY OF ADMISSION_Wednesday,REGION_CENTRAL,REGION_EASTERN,REGION_NORTHERN,REGION_UNKNOWN,REGION_WESTERN
6045,0.25647,0.0,0,0,0,0,0.613147,0,0,0,...,0,0,0,0,1,1,0,0,0,0
8580,0.777038,0.0,1,1,1,1,0.613147,0,0,0,...,0,0,0,0,0,1,0,0,0,0
18147,0.25647,0.0,0,0,0,0,0.386853,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6114,0.410678,0.0,0,0,0,0,0.613147,0,0,0,...,0,0,0,0,0,1,0,0,0,0
15263,0.647204,0.0,1,1,1,1,0.386853,0,0,0,...,0,0,0,0,1,1,0,0,0,0


In [None]:
# Persist the processed frame to an Excel workbook, without the row index.
data_enc.to_excel(excel_writer='processed_4A_TB.xlsx', index=False)