In [24]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Datasets Preparation

## Import necessary libraries

In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [43]:
# Dataset 1: processed_dataset1
df1 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset1.csv')
# Dataset 2: processed_dataset2
df2 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset2.csv')
# Dataset 3: processed_dataset3
df3 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset3.csv')
# Dataset 4: processed_dataset4
df4 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset4.csv')
# Dataset 5: processed_dataset5
df5 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset5.csv')

## Identify and Keep Common Genes (Features) Across All Datasets

In [44]:
import os

In [45]:
# Identify and Keep Common Genes (Features) Across All Datasets (Excluding the 'TB_Status' column)
common_genes = list(set(df1.columns).intersection(df2.columns).intersection(df3.columns).intersection(df4.columns).intersection(df5.columns) - {'TB_Status'})

# Convert the list to a DataFrame and save as CSV
common_genes_df = pd.DataFrame(common_genes, columns=["Gene"])

csv_path = r'/content/drive/MyDrive/Research/TB_new/Saved_files/common_features.csv'

os.makedirs(os.path.dirname(csv_path), exist_ok=True)

common_genes_df.to_csv(csv_path, index=False)


In [46]:
# Common Genes (Features) Across df1 to df4 Datasets for ATB_OT dataset (for sub model 1)
common_genes_2 = list(set(df1.columns).intersection(df2.columns).intersection(df3.columns).intersection(df4.columns) - {'TB_Status'})

# Convert the list to a DataFrame and save as CSV
common_genes_df_2 = pd.DataFrame(common_genes_2, columns=["Gene"])

csv_path = r'/content/drive/MyDrive/Research/TB_new/Saved_files/common_features_2.csv'

os.makedirs(os.path.dirname(csv_path), exist_ok=True)

common_genes_df_2.to_csv(csv_path, index=False)


In [47]:
# Standardization & Batch Effect Correction (Median-Centering per Dataset)
scaler = StandardScaler()

def process_and_harmonize(df, common_genes, scaler):
    df_common = df[common_genes]

    # Z-score normalization (standardization)
    df_scaled = pd.DataFrame(scaler.fit_transform(df_common), columns=df_common.columns)

    # Batch effect correction: median-centering per dataset
    df_harmonized = df_scaled - df_scaled.median()

    return pd.concat([df_harmonized, df[['TB_Status']]], axis=1)

## Data Harmonization

In [48]:
df1_1 = process_and_harmonize(df1, common_genes, scaler)
df2_1 = process_and_harmonize(df2, common_genes, scaler)
df3_1 = process_and_harmonize(df3, common_genes, scaler)
df4_1 = process_and_harmonize(df4, common_genes, scaler)
df5_1 = process_and_harmonize(df5, common_genes, scaler)

Harmonize common genes in first 4 datasets

In [49]:
df1_2 = process_and_harmonize(df1, common_genes_2, scaler)
df2_2 = process_and_harmonize(df2, common_genes_2, scaler)
df3_2 = process_and_harmonize(df3, common_genes_2, scaler)
df4_2 = process_and_harmonize(df4, common_genes_2, scaler)

## Merge all 5 datasets

In [50]:
# Concatenate all datasets after harmonization
merged_df = pd.concat([df1_1, df2_1, df3_1, df4_1, df5_1], ignore_index=True)

In [51]:
merged_df.sample(5)

Unnamed: 0,ILMN_1759501,ILMN_2326675,ILMN_1788604,ILMN_1756898,ILMN_2126706,ILMN_1685369,ILMN_1690476,ILMN_1716276,ILMN_1700337,ILMN_2222768,...,ILMN_1805396,ILMN_1744912,ILMN_1805863,ILMN_1810953,ILMN_1653165,ILMN_1664912,ILMN_2263718,ILMN_1731268,ILMN_1681249,TB_Status
1327,-1.686024,0.359556,0.155704,-0.847739,-0.1392,1.368778,-1.370276,1.299341,0.350367,0.549698,...,-0.721257,-0.168983,-1.58373,-1.451325,-0.713334,0.181405,-0.678063,0.37767,1.005347,Other Disease
1106,0.332074,-0.368971,0.394717,-1.306862,0.191137,-0.533453,0.489556,0.98475,0.154915,2.445585,...,-0.557659,-1.090315,0.436202,-0.191494,-0.648251,-0.313419,0.646327,0.977472,0.224932,Other Disease
1026,-0.060632,-0.478808,-0.045296,-1.753664,-0.494802,-2.037669,-1.268668,0.692081,-0.506726,-0.928418,...,-2.032469,0.894043,0.963442,0.980897,-0.57729,0.220736,-0.113372,-0.750385,0.11777,Extra Pulmonary TB
77,-0.157132,-0.004367,1.941423,-0.19375,0.991922,0.204565,-0.104754,0.922401,-0.316509,-0.225655,...,0.310852,0.696679,1.273211,0.031628,-0.621386,-0.334344,0.340578,-0.225742,-0.487382,Active TB
1387,2.354572,-0.290256,0.376818,-2.760531,-1.941986,-0.664777,-0.623986,0.434356,-1.519817,-1.416394,...,-0.950251,1.079806,-0.048442,-1.137665,1.42077,0.192826,-1.569662,2.356934,-1.23418,Other Disease


## Merge first 4 datasets

In [52]:
# Concatenate all datasets after harmonization
merged_df_2 = pd.concat([df1_2, df2_2, df3_2, df4_2], ignore_index=True)

In [53]:
merged_df_2.sample(5)

Unnamed: 0,ILMN_1667361,ILMN_2184869,ILMN_1759501,ILMN_2326675,ILMN_1788604,ILMN_1756898,ILMN_1767828,ILMN_1854557,ILMN_2126706,ILMN_1685369,...,ILMN_1805863,ILMN_1810953,ILMN_1653165,ILMN_1664912,ILMN_2263718,ILMN_1731268,ILMN_1681249,ILMN_1744614,ILMN_1666212,TB_Status
488,-1.05933,-0.564708,-0.44794,-0.616201,1.476943,0.136549,-0.049595,-1.462077,1.091656,1.452642,...,-0.730141,1.729903,-1.215997,-0.757961,2.989192,0.195984,-0.813675,-0.214963,-0.086907,Healthy Control
782,-0.488812,-0.416166,-0.754765,-0.379284,1.049719,0.041159,-0.506536,-0.599399,-0.48305,1.623435,...,1.031042,0.220942,0.375698,-1.10771,-0.529196,1.165948,0.29874,-0.437362,-0.634023,Other Disease
766,-0.796232,-0.723392,-1.005901,-0.284237,0.512567,0.774684,-0.811667,-0.880508,0.239543,0.394556,...,1.284096,0.554118,1.347528,-0.336855,-0.098119,1.652247,-0.77844,-0.756763,0.892865,Other Disease
867,-1.211697,-0.492874,-1.446714,-1.030033,1.747143,-2.760531,-1.568602,-1.503768,-0.582151,1.625111,...,1.98288,1.002678,1.803556,-0.71778,-0.490565,2.356934,0.74275,0.167621,1.281119,Extra Pulmonary TB
1276,0.217236,-0.020505,0.474193,0.704449,-0.624604,-0.572445,0.489207,-0.80529,-0.787294,-0.148353,...,-0.545047,-0.987037,-0.293699,0.757433,1.006393,0.129677,-1.344068,-1.943874,-0.348114,Other Disease


In [54]:
# List of features to check
features_to_check = ['ILMN_1774071', 'ILMN_3251610', 'ILMN_1812433', 'ILMN_2105441', 'ILMN_1654875', 'ILMN_1690241']

# Check if each feature exists in the merged_df columns
for feature in features_to_check:
    if feature in merged_df_2.columns:
        print(f"Feature '{feature}' exists in the dataset.")
    else:
        print(f"Feature '{feature}' does not exist in the dataset.")

Feature 'ILMN_1774071' exists in the dataset.
Feature 'ILMN_3251610' does not exist in the dataset.
Feature 'ILMN_1812433' exists in the dataset.
Feature 'ILMN_2105441' exists in the dataset.
Feature 'ILMN_1654875' exists in the dataset.
Feature 'ILMN_1690241' exists in the dataset.


## Check for missing values in 'merged_df' dataset

In [55]:
# Check for missing values and print only columns with missing values
missing_values = merged_df.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)


In [56]:
# Check for missing values and print only columns with missing values
missing_values = merged_df_2.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)


## Split 'merged_df' dataset in to 3 seratate datasets as 'TB_HC_OD' , 'PTB_EPTB' , 'ATB_LTB'

### Dataset 1: 'ATB_OT'

In [59]:
# Dataset 1: 'TB_HC_OD'
# Replace 'Active TB', 'Latent TB', 'Pulmonary TB', and 'Extra Pulmonary TB' with 'TB' in 'TB_Status'
ATB_OT = merged_df_2.copy()
ATB_OT['TB_Status'] = ATB_OT['TB_Status'].replace(
    ['Active TB', 'Pulmonary TB', 'Extra Pulmonary TB', 'TB'], 'Active TB'
)

ATB_OT['TB_Status'] = ATB_OT['TB_Status'].replace(
    ['Latent TB', 'Other Disease', 'Healthy Control'], 'Inactive'
)


In [60]:
# Verifying the shape and preview of each dataset
print("ATB_OT Dataset shape:", ATB_OT.shape)
print("ATB_OT Dataset preview:\n")
ATB_OT.sample(5)

ATB_OT Dataset shape: (1423, 30276)
ATB_OT Dataset preview:



Unnamed: 0,ILMN_1667361,ILMN_2184869,ILMN_1759501,ILMN_2326675,ILMN_1788604,ILMN_1756898,ILMN_1767828,ILMN_1854557,ILMN_2126706,ILMN_1685369,...,ILMN_1805863,ILMN_1810953,ILMN_1653165,ILMN_1664912,ILMN_2263718,ILMN_1731268,ILMN_1681249,ILMN_1744614,ILMN_1666212,TB_Status
276,0.093902,0.523608,1.430607,0.629418,-0.476998,0.320942,0.183553,0.821394,-0.113279,0.088269,...,-1.033206,0.388186,-0.077328,-0.259139,-0.159812,0.0526,-0.541322,-0.863102,0.470025,Inactive
1267,0.076088,-0.255023,-1.842052,-0.602425,-1.307515,1.428125,-2.570351,0.015855,-1.319175,1.688449,...,1.215348,-0.822228,1.635353,0.687457,-1.312801,0.152992,0.430323,1.135337,-1.334339,Inactive
1218,0.039356,-0.954908,-1.636002,0.901846,1.771935,-0.681169,-1.087512,-0.96972,-0.476232,0.11455,...,1.625335,-0.080236,-0.839091,0.694591,1.034675,-1.080848,0.954728,0.834862,0.812572,Inactive
905,0.194353,0.707222,-1.663414,-2.469454,0.414272,-0.563724,0.015705,0.73,0.701295,-1.548855,...,0.587526,1.058525,-0.218546,-1.236507,0.941983,1.113569,-1.852565,0.633743,-0.437955,Active TB
1168,-0.80811,-0.353845,-0.053097,0.985721,0.510247,-0.222746,-1.820095,0.524136,-0.468415,-2.032486,...,1.530619,0.954873,0.13127,0.908861,-1.164948,0.86463,-1.692591,-1.185707,0.393536,Inactive


In [61]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = ATB_OT['TB_Status'].value_counts()

# Display the class counts
print(class_counts)


TB_Status
Inactive     909
Active TB    514
Name: count, dtype: int64


In [62]:
# Save the DataFrame as a CSV file
output_csv_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/ATB_OT.csv'
ATB_OT.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: /content/drive/MyDrive/Research/TB_new/Datasets/train/processed/ATB_OT.csv


### Dataset 2: 'PTB_EPTB'

In [63]:
# Dataset 2: 'PTB_EPTB' - only rows where TB_Status is 'Pulmonary TB' or 'Extra Pulmonary TB'
PTB_EPTB = merged_df[merged_df['TB_Status'].isin(['Pulmonary TB', 'Extra Pulmonary TB'])]

In [64]:
# Verifying the shape and preview of each dataset
print("\nPTB_EPTB Dataset shape:", PTB_EPTB.shape)
print("PTB_EPTB Dataset preview:\n")
PTB_EPTB.sample(5)


PTB_EPTB Dataset shape: (394, 23190)
PTB_EPTB Dataset preview:



Unnamed: 0,ILMN_1759501,ILMN_2326675,ILMN_1788604,ILMN_1756898,ILMN_2126706,ILMN_1685369,ILMN_1690476,ILMN_1716276,ILMN_1700337,ILMN_2222768,...,ILMN_1805396,ILMN_1744912,ILMN_1805863,ILMN_1810953,ILMN_1653165,ILMN_1664912,ILMN_2263718,ILMN_1731268,ILMN_1681249,TB_Status
1060,0.098415,1.210795,-1.215686,-0.45059,-1.056052,-1.379326,0.552016,2.771046,1.256803,-1.626294,...,-1.188824,-1.501199,1.341711,0.473252,1.294652,-0.439182,-1.500599,-1.258722,1.801483,Pulmonary TB
851,0.418266,0.379148,-0.395935,-0.760201,-0.383384,-1.910522,-0.603279,-1.628155,0.898094,-0.675716,...,0.348402,0.884479,-0.92546,0.194327,0.392955,1.094063,-0.185065,0.673831,-2.271994,Pulmonary TB
638,-1.057303,0.904497,-0.870034,-0.537957,-0.190468,-0.64441,-0.864911,0.372156,-1.144819,-0.433195,...,-0.928364,-0.26365,-0.404414,-0.200457,0.329004,0.372931,0.286055,-1.211977,2.193285,Extra Pulmonary TB
1059,0.304399,0.06377,-1.220486,0.073935,-0.556188,-1.238186,-0.139288,1.134655,-2.399425,-2.559491,...,-0.779054,-1.832209,0.297129,-0.493289,-2.057133,-0.031814,0.241781,0.519809,0.581637,Extra Pulmonary TB
1057,0.012266,1.468121,1.274925,0.455534,-0.337357,-0.755718,-0.061408,1.492001,0.077981,-0.537811,...,-0.056978,2.763212,-0.273222,0.654586,0.293452,-0.147577,-1.590456,-1.527561,0.286048,Extra Pulmonary TB


In [65]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = PTB_EPTB['TB_Status'].value_counts()

# Display the class counts
print(class_counts)


TB_Status
Extra Pulmonary TB    211
Pulmonary TB          183
Name: count, dtype: int64


In [66]:
# Save the DataFrame as a CSV file
output_csv_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/PTB_EPTB.csv'
PTB_EPTB.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: /content/drive/MyDrive/Research/TB_new/Datasets/train/processed/PTB_EPTB.csv


### Dataset 3: 'LTB_OTHER'

In [67]:
# Dataset 3: 'ATB_LTB' - only rows where TB_Status is 'Latent TB' or 'Active TB'
LTB_OT = merged_df[merged_df['TB_Status'].isin(['Latent TB', 'Healthy Control', 'Other Disease'])]

In [68]:
# Verifying the shape and preview of each dataset
print("\nLTB_OT Dataset shape:", LTB_OT.shape)
print("LTB_OT Dataset preview:\n")
LTB_OT.sample(5)


LTB_OT Dataset shape: (1031, 23190)
LTB_OT Dataset preview:



Unnamed: 0,ILMN_1759501,ILMN_2326675,ILMN_1788604,ILMN_1756898,ILMN_2126706,ILMN_1685369,ILMN_1690476,ILMN_1716276,ILMN_1700337,ILMN_2222768,...,ILMN_1805396,ILMN_1744912,ILMN_1805863,ILMN_1810953,ILMN_1653165,ILMN_1664912,ILMN_2263718,ILMN_1731268,ILMN_1681249,TB_Status
645,0.107325,0.10513,-0.302406,-0.989297,1.523181,-0.953129,1.401364,-1.021457,0.259969,0.14304,...,0.464137,-2.083444,-1.421248,0.363259,-0.621282,-0.117534,1.17395,-0.130344,-1.729146,Healthy Control
89,-0.780215,2.367144,-0.037261,0.598804,0.09533,0.155416,-0.142991,2.392735,1.344438,1.883313,...,-0.334247,-0.77218,0.395885,-0.223216,0.912841,1.187924,1.272622,1.079154,1.509466,Latent TB
1151,-0.000632,0.308232,-1.455997,0.403489,-0.496719,0.100967,0.720649,2.730358,0.37705,-0.549462,...,-0.908426,-0.336277,-0.92307,-0.971625,-0.17476,0.172706,0.185634,-0.809239,2.134582,Other Disease
658,-0.156548,1.515728,1.086732,0.81323,-1.237225,-2.839703,-1.541122,-0.150099,-0.058318,-0.607479,...,0.148635,-0.892535,1.805543,-1.665427,1.063117,2.492,-1.112596,0.516437,-0.239653,Healthy Control
66,-0.800015,-0.293998,-1.366648,-1.211261,-0.782087,-1.350378,0.003312,0.041783,-0.497928,-0.310707,...,0.182525,-1.078999,-1.123565,-0.849124,-0.830632,-0.679025,-0.858712,-1.46022,-0.682304,Healthy Control


In [69]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = LTB_OT['TB_Status'].value_counts()

# Display the class counts
print(class_counts)

TB_Status
Other Disease      608
Healthy Control    243
Latent TB          180
Name: count, dtype: int64


In [70]:
# Save the DataFrame as a CSV file
output_csv_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/LTB_OT.csv'
LTB_OT.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: /content/drive/MyDrive/Research/TB_new/Datasets/train/processed/LTB_OT.csv
