In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Datasets Preparation

## Import necessary libraries

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
# Dataset 1: processed_dataset1
df1 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset1.csv')
# Dataset 2: processed_dataset2
df2 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset2.csv')
# Dataset 3: processed_dataset3
df3 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset3.csv')
# Dataset 4: processed_dataset4
df4 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset4.csv')
# Dataset 5: processed_dataset5
df5 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset5.csv')

## Identify and Keep Common Genes (Features) Across All Datasets

In [5]:
import os

In [6]:
# Identify and Keep Common Genes (Features) Across All Datasets (Excluding the 'TB_Status' column)
common_genes = list(set(df1.columns).intersection(df2.columns).intersection(df3.columns).intersection(df4.columns).intersection(df5.columns) - {'TB_Status'})

# Convert the list to a DataFrame and save as CSV
common_genes_df = pd.DataFrame(common_genes, columns=["Gene"])

csv_path = r'/content/drive/MyDrive/Research/TB_new/Saved_files/common_features.csv'

os.makedirs(os.path.dirname(csv_path), exist_ok=True)

common_genes_df.to_csv(csv_path, index=False)


In [7]:
# Standardization & Batch Effect Correction (Median-Centering per Dataset)
scaler = StandardScaler()

def process_and_harmonize(df, common_genes, scaler):
    df_common = df[common_genes]

    # Z-score normalization (standardization)
    df_scaled = pd.DataFrame(scaler.fit_transform(df_common), columns=df_common.columns)

    # Batch effect correction: median-centering per dataset
    df_harmonized = df_scaled - df_scaled.median()

    return pd.concat([df_harmonized, df[['TB_Status']]], axis=1)

## Data Harmonization

In [8]:
df1 = process_and_harmonize(df1, common_genes, scaler)
df2 = process_and_harmonize(df2, common_genes, scaler)
df3 = process_and_harmonize(df3, common_genes, scaler)
df4 = process_and_harmonize(df4, common_genes, scaler)
df5 = process_and_harmonize(df5, common_genes, scaler)

## Merge all 5 datasets

In [9]:
# Concatenate all datasets before harmonization
merged_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

In [10]:
merged_df.sample(5)

Unnamed: 0,ILMN_1699537,ILMN_1733110,ILMN_1764573,ILMN_2354140,ILMN_1796976,ILMN_2332691,ILMN_1688886,ILMN_1746856,ILMN_1801795,ILMN_2082209,...,ILMN_2091084,ILMN_1795976,ILMN_1790218,ILMN_1794333,ILMN_1668353,ILMN_1800164,ILMN_1808748,ILMN_1658310,ILMN_1724555,TB_Status
1380,-0.543004,-0.620529,1.547199,0.525223,0.893056,0.065946,0.591496,-0.651973,-0.149305,-0.782862,...,-1.934995,0.179604,1.944923,-0.710243,0.940244,-0.89238,1.758501,1.250764,0.251521,Other Disease
1087,-2.271033,-0.091703,1.423186,0.889406,1.22304,0.018325,-0.986866,-0.027927,-1.361695,-0.539509,...,-0.929862,0.573806,-2.156319,-0.783665,0.19507,0.506894,-1.504386,0.779176,0.79044,Extra Pulmonary TB
65,1.331152,-1.390768,-0.681425,-0.628532,-0.238184,-0.751776,-0.209124,-0.44937,-0.763975,-0.850587,...,-0.889578,-0.286111,0.015859,-1.350474,1.695598,-0.900486,-0.434043,-1.0082,0.416326,Healthy Control
975,-0.023887,-0.140231,0.379532,-0.244674,-0.386839,1.040478,0.227923,-2.525372,0.404461,-0.658151,...,1.14017,-0.629576,2.251747,0.760345,2.312422,-1.520146,1.502088,-0.922503,1.327656,Extra Pulmonary TB
1437,-0.716106,-1.269562,-0.13671,-0.578637,1.120164,0.663362,0.750123,0.982683,-1.597992,-0.396536,...,-0.608548,-0.909683,-0.32917,0.383593,-1.881045,1.397728,-0.45912,0.035982,1.858876,Active TB


In [11]:
# List of features to check
features_to_check = ['ILMN_1774071', 'ILMN_3251610', 'ILMN_1812433', 'ILMN_2105441', 'ILMN_1654875', 'ILMN_1690241']

# Check if each feature exists in the merged_df columns
for feature in features_to_check:
    if feature in merged_df.columns:
        print(f"Feature '{feature}' exists in the dataset.")
    else:
        print(f"Feature '{feature}' does not exist in the dataset.")

Feature 'ILMN_1774071' does not exist in the dataset.
Feature 'ILMN_3251610' does not exist in the dataset.
Feature 'ILMN_1812433' exists in the dataset.
Feature 'ILMN_2105441' exists in the dataset.
Feature 'ILMN_1654875' exists in the dataset.
Feature 'ILMN_1690241' exists in the dataset.


## Check for missing values in 'merged_df' dataset

In [12]:
# Check for missing values and print only columns with missing values
missing_values = merged_df.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)


## Split 'merged_df' dataset in to 3 seratate datasets as 'TB_HC_OD' , 'PTB_EPTB' , 'ATB_LTB'

### Dataset 1: 'ATB_OT'

In [16]:
# Dataset 1: 'TB_HC_OD'
# Replace 'Active TB', 'Latent TB', 'Pulmonary TB', and 'Extra Pulmonary TB' with 'TB' in 'TB_Status'
ATB_OT = merged_df.copy()
ATB_OT['TB_Status'] = ATB_OT['TB_Status'].replace(
    ['Active TB', 'Pulmonary TB', 'Extra Pulmonary TB', 'TB'], 'Active TB'
)

ATB_OT['TB_Status'] = ATB_OT['TB_Status'].replace(
    ['Latent TB', 'Other Disease', 'Healthy Control'], 'Inactive'
)


In [17]:
# Verifying the shape and preview of each dataset
print("ATB_OT Dataset shape:", ATB_OT.shape)
print("ATB_OT Dataset preview:\n")
ATB_OT.sample(5)

ATB_OT Dataset shape: (1570, 23190)
ATB_OT Dataset preview:



Unnamed: 0,ILMN_1699537,ILMN_1733110,ILMN_1764573,ILMN_2354140,ILMN_1796976,ILMN_2332691,ILMN_1688886,ILMN_1746856,ILMN_1801795,ILMN_2082209,...,ILMN_2091084,ILMN_1795976,ILMN_1790218,ILMN_1794333,ILMN_1668353,ILMN_1800164,ILMN_1808748,ILMN_1658310,ILMN_1724555,TB_Status
450,2.413235,0.226712,0.108083,-0.45577,-0.222183,0.122874,-0.508122,-0.360956,0.971522,-0.505079,...,-0.798289,-1.253833,-2.028023,-1.046917,1.098227,-0.816376,-0.945106,0.716535,0.388801,Inactive
680,-2.222741,1.281556,1.330664,-0.132726,-2.147518,-0.490781,-1.973015,-0.376798,1.141268,-0.534533,...,-0.774798,-2.203761,-1.016806,0.754907,-0.01374,-0.263082,-2.179208,-2.22691,-2.113077,Inactive
538,0.330392,0.8531,1.274479,1.043965,-0.255606,-0.2215,-0.10908,-0.865331,0.977266,0.334187,...,-1.924297,0.717323,0.695741,-0.404083,-0.078298,-0.299799,-0.118463,0.708606,-0.021692,Inactive
36,-0.565012,-1.430618,-0.286996,-0.661421,0.284122,-1.242136,2.592639,-0.661651,-1.077838,-1.122567,...,-1.098371,-1.712011,2.035419,-1.458133,0.092696,-1.006769,-0.894073,-2.044267,1.046369,Active TB
431,-0.692521,1.250124,-0.758507,0.688306,0.561028,0.54876,-0.253939,1.724332,0.737205,0.066498,...,-0.418945,-0.528703,0.037902,-0.04006,0.255545,2.095162,0.926955,1.623567,1.477456,Inactive


In [18]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = ATB_OT['TB_Status'].value_counts()

# Display the class counts
print(class_counts)


TB_Status
Inactive     1031
Active TB     539
Name: count, dtype: int64


In [19]:
# Save the DataFrame as a CSV file
output_csv_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/ATB_OT.csv'
ATB_OT.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: /content/drive/MyDrive/Research/TB_new/Datasets/train/processed/ATB_OT.csv


### Dataset 2: 'PTB_EPTB'

In [20]:
# Dataset 2: 'PTB_EPTB' - only rows where TB_Status is 'Pulmonary TB' or 'Extra Pulmonary TB'
PTB_EPTB = merged_df[merged_df['TB_Status'].isin(['Pulmonary TB', 'Extra Pulmonary TB'])]

In [21]:
# Verifying the shape and preview of each dataset
print("\nPTB_EPTB Dataset shape:", PTB_EPTB.shape)
print("PTB_EPTB Dataset preview:\n")
PTB_EPTB.sample(5)


PTB_EPTB Dataset shape: (394, 23190)
PTB_EPTB Dataset preview:



Unnamed: 0,ILMN_1699537,ILMN_1733110,ILMN_1764573,ILMN_2354140,ILMN_1796976,ILMN_2332691,ILMN_1688886,ILMN_1746856,ILMN_1801795,ILMN_2082209,...,ILMN_2091084,ILMN_1795976,ILMN_1790218,ILMN_1794333,ILMN_1668353,ILMN_1800164,ILMN_1808748,ILMN_1658310,ILMN_1724555,TB_Status
723,2.51731,1.460386,-1.197621,-1.05834,0.332911,0.189141,1.273863,-1.325872,1.352713,0.774607,...,2.482752,2.552841,1.029059,-0.128117,2.524193,-0.995706,2.436448,2.738313,2.839644,Pulmonary TB
1084,0.089551,-1.296186,-0.204328,-0.233389,2.171045,0.568942,0.805959,0.968912,-1.470047,-0.450374,...,1.020101,0.434539,0.51157,0.186782,-1.696118,2.460965,0.620045,0.685731,-2.516203,Extra Pulmonary TB
629,0.331551,0.120104,0.973965,-0.47039,-0.692778,-0.055356,-0.583457,-0.745343,1.283117,-0.009309,...,0.187735,0.355146,0.37415,-0.441399,1.75665,-0.551787,0.036312,0.08439,0.024094,Extra Pulmonary TB
1021,0.789189,1.427172,-0.549372,1.077511,-0.380351,0.342609,0.372726,0.586267,0.424966,0.591593,...,-0.891997,1.420388,0.444393,-0.246516,-2.375161,1.040255,0.39444,0.618465,-2.343663,Extra Pulmonary TB
734,-1.754096,-1.120689,-0.900327,1.423411,1.345889,-0.675012,-0.068345,0.731792,-2.094698,-0.87195,...,-0.807785,-1.73427,-0.296918,-1.170159,-1.667086,1.450206,-1.772719,1.068094,-1.237739,Pulmonary TB


In [22]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = PTB_EPTB['TB_Status'].value_counts()

# Display the class counts
print(class_counts)


TB_Status
Extra Pulmonary TB    211
Pulmonary TB          183
Name: count, dtype: int64


In [23]:
# Save the DataFrame as a CSV file
output_csv_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/PTB_EPTB.csv'
PTB_EPTB.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: /content/drive/MyDrive/Research/TB_new/Datasets/train/processed/PTB_EPTB.csv


### Dataset 3: 'LTB_OTHER'

In [27]:
# Dataset 3: 'ATB_LTB' - only rows where TB_Status is 'Latent TB' or 'Active TB'
LTB_OT = merged_df[merged_df['TB_Status'].isin(['Latent TB', 'Healthy Control', 'Other Disease'])]

In [29]:
# Verifying the shape and preview of each dataset
print("\nLTB_OT Dataset shape:", LTB_OT.shape)
print("LTB_OT Dataset preview:\n")
LTB_OT.sample(5)


LTB_OT Dataset shape: (1031, 23190)
LTB_OT Dataset preview:



Unnamed: 0,ILMN_1699537,ILMN_1733110,ILMN_1764573,ILMN_2354140,ILMN_1796976,ILMN_2332691,ILMN_1688886,ILMN_1746856,ILMN_1801795,ILMN_2082209,...,ILMN_2091084,ILMN_1795976,ILMN_1790218,ILMN_1794333,ILMN_1668353,ILMN_1800164,ILMN_1808748,ILMN_1658310,ILMN_1724555,TB_Status
1125,0.547021,-0.43106,0.149485,0.031571,-0.405717,-2.325172,0.376023,-0.362569,-0.139488,-1.99343,...,-0.744693,-0.471377,1.159763,-2.499742,0.855799,-0.792439,-1.562051,-2.764898,-0.499982,Other Disease
1233,-0.727235,-0.056324,-0.05601,1.609825,1.225941,-1.308295,1.018338,-0.000823,-0.447736,1.264771,...,-0.064707,0.467013,-0.342778,-0.021862,0.833465,0.644059,1.082685,1.447568,-0.115787,Other Disease
210,-0.020991,0.695694,-0.731925,-1.118786,-0.612041,0.65552,0.910443,-0.892438,-0.706926,-0.49886,...,0.507435,-0.082118,-1.233712,-0.268281,0.6769,-0.707672,-0.194937,0.651564,0.888621,Latent TB
1537,1.942276,-0.615017,-0.13911,0.742084,0.378316,0.091892,0.935026,0.223168,0.017933,1.264106,...,0.03055,-0.599046,-1.074443,0.14188,0.723787,0.44915,1.47958,-1.349606,-0.398304,Latent TB
1122,-0.035737,-0.550925,-0.446301,0.74471,-0.165998,0.14133,-2.288001,0.104317,1.211903,-0.433587,...,0.356738,-0.24147,-0.094964,-0.68636,-0.505317,0.44702,0.121801,-0.009929,0.277146,Other Disease


In [30]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = LTB_OT['TB_Status'].value_counts()

# Display the class counts
print(class_counts)

TB_Status
Other Disease      608
Healthy Control    243
Latent TB          180
Name: count, dtype: int64


In [31]:
# Save the DataFrame as a CSV file
output_csv_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/LTB_OT.csv'
LTB_OT.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: /content/drive/MyDrive/Research/TB_new/Datasets/train/processed/LTB_OT.csv
