# Datasets Preparation

## Import necessary libraries

In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [14]:
# Dataset 1: processed_dataset1
df1 = pd.read_csv(r'..\..\..\Datasets\train\processed\processed_dataset1.csv')
# Dataset 2: processed_dataset2
df2 = pd.read_csv(r'..\..\..\Datasets\train\processed\processed_dataset2.csv')
# Dataset 3: processed_dataset3
df3 = pd.read_csv(r'..\..\..\Datasets\train\processed\processed_dataset3.csv')
# Dataset 4: processed_dataset4
df4 = pd.read_csv(r'..\..\..\Datasets\train\processed\processed_dataset4.csv')
# Dataset 5: processed_dataset5
df5 = pd.read_csv(r'..\..\..\Datasets\train\processed\processed_dataset5.csv')

## Identify and Keep Common Genes (Features) Across All Datasets

In [15]:
# Identify and Keep Common Genes (Features) Across All Datasets (Excluding the 'TB_Status' column)
common_genes = list(set(df1.columns).intersection(df2.columns).intersection(df3.columns).intersection(df4.columns).intersection(df5.columns) - {'TB_Status'})

# Convert the list to a DataFrame and save as CSV
common_genes_df = pd.DataFrame(common_genes, columns=["Gene"])
common_genes_df.to_csv(r"..\..\..\Saved_files\common_features.csv", index=False)


In [16]:
# Standardization & Batch Effect Correction (Median-Centering per Dataset)
scaler = StandardScaler()

def process_and_harmonize(df, common_genes, scaler):
    df_common = df[common_genes]
    
    # Z-score normalization (standardization)
    df_scaled = pd.DataFrame(scaler.fit_transform(df_common), columns=df_common.columns)
    
    # Batch effect correction: median-centering per dataset
    df_harmonized = df_scaled - df_scaled.median()
    
    return pd.concat([df_harmonized, df[['TB_Status']]], axis=1)

## Data Harmonization

In [17]:
df1 = process_and_harmonize(df1, common_genes, scaler)
df2 = process_and_harmonize(df2, common_genes, scaler)
df3 = process_and_harmonize(df3, common_genes, scaler)
df4 = process_and_harmonize(df4, common_genes, scaler)
df5 = process_and_harmonize(df5, common_genes, scaler)

## Merge all 5 datasets

In [18]:
# Concatenate all datasets before harmonization
merged_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

In [19]:
merged_df.sample(5)

Unnamed: 0,ILMN_1822442,ILMN_1877270,ILMN_1683453,ILMN_1697095,ILMN_1747466,ILMN_1710873,ILMN_1661650,ILMN_1710204,ILMN_2138689,ILMN_2371251,...,ILMN_1654516,ILMN_1726391,ILMN_1786326,ILMN_2282641,ILMN_1742611,ILMN_1656962,ILMN_1804654,ILMN_1707062,ILMN_1733174,TB_Status
60,-0.354294,0.116823,-0.507725,-0.5572,-0.535674,-0.575384,-1.256546,-0.372938,-0.075648,-0.349837,...,-0.841895,-0.908766,-0.64208,-0.051818,-0.665907,-1.332607,-2.509156,-1.63595,-0.473011,Active TB
1412,0.609643,-2.121456,-1.224274,-0.226171,-0.56416,0.46829,-0.608728,0.029391,-2.232885,0.630445,...,1.137028,-0.470511,1.636981,1.281223,-0.068899,0.40131,0.813998,-1.122529,-1.079858,Other Disease
22,-1.403656,-0.647864,-0.164053,-0.429701,-0.646807,-1.067757,-1.264663,-0.282027,0.107353,0.672468,...,-0.361086,-1.102605,-0.250384,-0.554242,-1.274624,-0.417769,-1.930317,-1.1943,-0.486207,Active TB
447,2.633814,2.665162,-2.313626,-0.699883,-0.448874,-0.879467,-0.234471,-0.135616,-1.687136,-2.418642,...,0.62647,-0.198571,-1.410018,0.24244,-0.44836,0.970852,0.51532,-0.528278,-0.692011,Healthy Control
702,2.266701,-0.438323,-0.211797,1.383713,0.264607,-0.10329,-0.607875,1.056731,0.49099,0.306108,...,-0.93044,-0.506034,1.586873,-0.835747,-0.590079,1.323516,0.778832,-0.233824,0.249451,Pulmonary TB


## Check for missing values in 'merged_df' dataset

In [20]:
# Check for missing values and print only columns with missing values
missing_values = merged_df.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)


## Split 'merged_df' dataset in to 3 seratate datasets as 'TB_HC_OD' , 'PTB_EPTB' , 'ATB_LTB'

### Dataset 1: 'TB_HC_OD'

In [21]:
# Dataset 1: 'TB_HC_OD'
# Replace 'Active TB', 'Latent TB', 'Pulmonary TB', and 'Extra Pulmonary TB' with 'TB' in 'TB_Status'
TB_HC_OD = merged_df.copy()
TB_HC_OD['TB_Status'] = TB_HC_OD['TB_Status'].replace(
    ['Active TB', 'Latent TB', 'Pulmonary TB', 'Extra Pulmonary TB'], 'TB'
)

In [22]:
# Verifying the shape and preview of each dataset
print("TB_HC_OD Dataset shape:", TB_HC_OD.shape)
print("TB_HC_OD Dataset preview:\n")
TB_HC_OD.sample(5)

TB_HC_OD Dataset shape: (1570, 23190)
TB_HC_OD Dataset preview:



Unnamed: 0,ILMN_1822442,ILMN_1877270,ILMN_1683453,ILMN_1697095,ILMN_1747466,ILMN_1710873,ILMN_1661650,ILMN_1710204,ILMN_2138689,ILMN_2371251,...,ILMN_1654516,ILMN_1726391,ILMN_1786326,ILMN_2282641,ILMN_1742611,ILMN_1656962,ILMN_1804654,ILMN_1707062,ILMN_1733174,TB_Status
1347,-0.693483,-0.322788,0.45594,-0.512827,1.319,0.841667,0.07865,0.185949,-2.53467,0.004334,...,-0.374742,0.135424,0.616931,-0.015103,-0.766706,-0.401534,-1.319323,-0.695472,-0.411021,Other Disease
1062,-1.985296,0.529591,-1.326576,-1.12375,0.25554,-0.684384,0.746381,0.162682,-0.008812,0.315952,...,-0.413642,-0.284799,1.090111,-0.507428,-0.253244,0.552185,0.707379,1.167303,0.55328,TB
216,-0.286075,-0.729645,0.618494,-0.407328,-0.699001,-0.86132,-0.976114,0.202116,-0.484977,-0.705458,...,-0.54101,-0.918738,0.115993,-0.532784,-0.982246,-0.221272,-0.375695,-0.79255,-0.292132,TB
650,-0.780631,0.386419,2.471895,0.161246,-0.701106,1.661442,0.786478,-0.433583,-0.500585,-0.660296,...,-1.13128,1.473257,-1.719721,-2.447063,0.244172,0.64665,-0.179999,-0.545638,0.876181,Healthy Control
576,1.443556,0.703763,-0.190201,0.330967,0.988979,0.087742,0.075202,1.173863,2.225172,1.113223,...,-0.447292,0.724175,0.61029,0.238899,0.263623,1.342661,0.76694,-0.653475,-0.160275,Other Disease


In [23]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = TB_HC_OD['TB_Status'].value_counts()

# Display the class counts
print(class_counts)


TB_Status
TB                 719
Other Disease      608
Healthy Control    243
Name: count, dtype: int64


In [24]:
# Save the DataFrame as a CSV file
output_csv_path = r'..\..\..\Datasets\train\processed\TB_HC_OD.csv'
TB_HC_OD.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: ..\..\..\Datasets\train\processed\TB_HC_OD.csv


### Dataset 2: 'PTB_EPTB'

In [25]:
# Dataset 2: 'PTB_EPTB' - only rows where TB_Status is 'Pulmonary TB' or 'Extra Pulmonary TB'
PTB_EPTB = merged_df[merged_df['TB_Status'].isin(['Pulmonary TB', 'Extra Pulmonary TB'])]

In [26]:
# Verifying the shape and preview of each dataset
print("\nPTB_EPTB Dataset shape:", PTB_EPTB.shape)
print("PTB_EPTB Dataset preview:\n")
PTB_EPTB.sample(5)


PTB_EPTB Dataset shape: (394, 23190)
PTB_EPTB Dataset preview:



Unnamed: 0,ILMN_1822442,ILMN_1877270,ILMN_1683453,ILMN_1697095,ILMN_1747466,ILMN_1710873,ILMN_1661650,ILMN_1710204,ILMN_2138689,ILMN_2371251,...,ILMN_1654516,ILMN_1726391,ILMN_1786326,ILMN_2282641,ILMN_1742611,ILMN_1656962,ILMN_1804654,ILMN_1707062,ILMN_1733174,TB_Status
724,0.43702,0.403718,1.19971,1.901424,0.923068,-0.629359,-0.904603,1.198801,1.167085,1.199689,...,-0.381239,-0.946797,-1.401703,-0.812072,-0.038403,-0.090368,0.141164,2.043654,-0.960562,Pulmonary TB
1025,0.157561,0.154966,-0.04266,0.772971,-0.904773,0.491475,-0.148311,0.235943,-1.410557,1.24713,...,-0.664653,0.755166,-0.323242,-1.55696,-0.467433,0.725994,0.845979,-0.320516,0.236153,Pulmonary TB
952,1.304897,0.109012,-0.256816,0.506569,-0.198132,0.321095,-0.891174,0.175623,0.331529,2.409085,...,1.056338,0.27361,-1.288481,-0.887299,0.241283,0.086478,0.37557,-0.816993,0.475691,Pulmonary TB
1014,0.658401,0.566575,0.212802,0.165429,-0.739819,-0.281228,0.252511,-2.115934,-0.749007,-0.393743,...,0.694809,-0.337933,0.350944,0.315866,0.284283,-1.56816,0.257932,1.494637,-0.076013,Extra Pulmonary TB
988,-0.38815,0.053603,0.581797,1.68305,0.036815,-0.472982,-2.330425,-1.82482,0.450153,-0.193656,...,-0.54263,0.438323,-0.978547,-1.738188,-0.532343,-0.587315,0.156891,-2.06213,2.409096,Pulmonary TB


In [27]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = PTB_EPTB['TB_Status'].value_counts()

# Display the class counts
print(class_counts)


TB_Status
Extra Pulmonary TB    211
Pulmonary TB          183
Name: count, dtype: int64


In [28]:
# Save the DataFrame as a CSV file
output_csv_path = r'..\..\..\Datasets\train\processed\PTB_EPTB.csv'
PTB_EPTB.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: ..\..\..\Datasets\train\processed\PTB_EPTB.csv


### Dataset 3: 'ATB_LTB'

In [29]:
# Dataset 3: 'ATB_LTB' - only rows where TB_Status is 'Latent TB' or 'Active TB'
ATB_LTB = merged_df[merged_df['TB_Status'].isin(['Latent TB', 'Active TB'])]

In [30]:
# Verifying the shape and preview of each dataset
print("\nATB_LTB Dataset shape:", ATB_LTB.shape)
print("ATB_LTB Dataset preview:\n")
ATB_LTB.sample(5)


ATB_LTB Dataset shape: (308, 23190)
ATB_LTB Dataset preview:



Unnamed: 0,ILMN_1822442,ILMN_1877270,ILMN_1683453,ILMN_1697095,ILMN_1747466,ILMN_1710873,ILMN_1661650,ILMN_1710204,ILMN_2138689,ILMN_2371251,...,ILMN_1654516,ILMN_1726391,ILMN_1786326,ILMN_2282641,ILMN_1742611,ILMN_1656962,ILMN_1804654,ILMN_1707062,ILMN_1733174,TB_Status
34,-0.733023,-0.14383,-0.3641,-0.062621,-0.453916,-0.705694,-0.779037,-1.431592,0.621049,-0.990469,...,-0.533183,-0.779728,-1.296151,-0.878631,-0.019101,-1.704274,-0.717145,-1.185737,1.036741,Active TB
193,-1.173696,-0.059062,1.553425,-0.978507,-0.998723,-1.163751,-0.943619,-0.699962,1.272454,-0.234168,...,-0.360867,-1.266455,0.218752,-0.508492,-1.134276,-0.045008,-0.80312,-1.043964,0.268479,Active TB
1527,-1.131232,0.052813,0.265173,1.159118,0.508017,-0.990044,0.277213,0.318932,-0.149933,-1.588994,...,0.18509,-1.256167,-0.75657,0.803234,-0.85477,0.28861,-0.362683,2.063056,0.106253,Latent TB
1446,-0.157885,-1.096304,-0.424997,-0.278684,1.08384,-1.075734,-0.730984,-0.779606,0.068633,0.477959,...,-1.112943,0.575964,0.176207,-1.22506,1.293145,0.43009,-0.021121,0.956978,1.761129,Latent TB
149,-0.296607,0.430653,0.877875,-0.436061,0.718529,-0.789208,-0.400874,0.134163,-0.064961,0.446448,...,-0.484919,-0.41819,-1.29161,-1.840188,-0.066038,2.013848,-0.250592,0.354261,0.50238,Active TB


In [31]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = ATB_LTB['TB_Status'].value_counts()

# Display the class counts
print(class_counts)

TB_Status
Latent TB    180
Active TB    128
Name: count, dtype: int64


In [32]:
# Save the DataFrame as a CSV file
output_csv_path = r'..\..\..\Datasets\train\processed\ATB_LTB.csv'
ATB_LTB.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: ..\..\..\Datasets\train\processed\ATB_LTB.csv
