In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Datasets Preparation

## Import necessary libraries

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
# Dataset 1: processed_dataset1
df1 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset1.csv')
# Dataset 2: processed_dataset2
df2 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset2.csv')
# Dataset 3: processed_dataset3
df3 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset3.csv')
# Dataset 4: processed_dataset4
df4 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset4.csv')
# Dataset 5: processed_dataset5
df5 = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/processed_dataset5.csv')

## Identify and Keep Common Genes (Features) Across All Datasets

In [4]:
import os

In [5]:
# Identify and Keep Common Genes (Features) Across All Datasets (Excluding the 'TB_Status' column)
common_genes = list(set(df1.columns).intersection(df2.columns).intersection(df3.columns).intersection(df4.columns).intersection(df5.columns) - {'TB_Status'})

# Convert the list to a DataFrame and save as CSV
common_genes_df = pd.DataFrame(common_genes, columns=["Gene"])

csv_path = r'/content/drive/MyDrive/Research/TB_new/Saved_files/common_features.csv'

os.makedirs(os.path.dirname(csv_path), exist_ok=True)

common_genes_df.to_csv(csv_path, index=False)


In [6]:
# Common Genes (Features) Across df1 to df4 Datasets for ATB_OT dataset (for sub model 1)
common_genes_2 = list(set(df1.columns).intersection(df2.columns).intersection(df3.columns).intersection(df4.columns) - {'TB_Status'})

# Convert the list to a DataFrame and save as CSV
common_genes_df_2 = pd.DataFrame(common_genes_2, columns=["Gene"])

csv_path = r'/content/drive/MyDrive/Research/TB_new/Saved_files/common_features_2.csv'

os.makedirs(os.path.dirname(csv_path), exist_ok=True)

common_genes_df_2.to_csv(csv_path, index=False)


In [7]:
# Standardization & Batch Effect Correction (Median-Centering per Dataset)
scaler = StandardScaler()

def process_and_harmonize(df, common_genes, scaler):
    df_common = df[common_genes]

    # Z-score normalization (standardization)
    df_scaled = pd.DataFrame(scaler.fit_transform(df_common), columns=df_common.columns)

    # Batch effect correction: median-centering per dataset
    df_harmonized = df_scaled - df_scaled.median()

    return pd.concat([df_harmonized, df[['TB_Status']]], axis=1)

## Data Harmonization

In [8]:
df1_1 = process_and_harmonize(df1, common_genes, scaler)
df2_1 = process_and_harmonize(df2, common_genes, scaler)
df3_1 = process_and_harmonize(df3, common_genes, scaler)
df4_1 = process_and_harmonize(df4, common_genes, scaler)
df5_1 = process_and_harmonize(df5, common_genes, scaler)

Harmonize common genes in first 4 datasets

In [9]:
df1_2 = process_and_harmonize(df1, common_genes_2, scaler)
df2_2 = process_and_harmonize(df2, common_genes_2, scaler)
df3_2 = process_and_harmonize(df3, common_genes_2, scaler)
df4_2 = process_and_harmonize(df4, common_genes_2, scaler)

## Merge all 5 datasets

In [10]:
# Concatenate all datasets after harmonization
merged_df = pd.concat([df1_1, df2_1, df3_1, df4_1, df5_1], ignore_index=True)

In [11]:
merged_df.sample(5)

Unnamed: 0,ILMN_1786125,ILMN_1666269,ILMN_1659156,ILMN_1698020,ILMN_1874678,ILMN_2368597,ILMN_1806745,ILMN_1664878,ILMN_1692100,ILMN_1721651,...,ILMN_1756942,ILMN_1670158,ILMN_1889215,ILMN_1811258,ILMN_2233539,ILMN_2347193,ILMN_2337789,ILMN_1869897,ILMN_1786046,TB_Status
1377,-0.722499,0.705333,-2.272834,0.145878,-1.338563,0.181547,1.391654,-2.589215,-0.629674,-0.625864,...,-0.307302,1.078117,1.293993,-0.209351,-0.595845,1.650365,-0.079398,-0.121352,0.261297,Other Disease
38,-0.294208,0.114136,-0.550466,-0.81733,-0.611966,-0.206649,-0.616088,-0.379741,-0.191303,0.087657,...,-0.846837,-0.667987,-0.814643,-1.078671,-0.716763,0.489724,-0.375425,-0.050618,0.269191,Healthy Control
1367,0.070446,0.356484,1.536543,-0.00289,0.980874,-1.323044,-2.230359,-0.226775,0.56989,0.396774,...,-0.871276,-0.473999,0.188754,-0.849892,0.271863,2.295692,1.545775,-2.156617,-0.138649,Other Disease
752,1.138114,0.472138,0.756374,-0.494707,1.466354,-0.426548,-0.648559,1.386136,-0.383699,1.022863,...,-1.202514,-0.683246,0.54047,0.028078,1.838355,-1.491522,-0.118807,-0.084399,-0.505503,Other Disease
360,-0.377118,-1.094675,-0.395995,-0.228412,-1.318513,0.546608,-0.25295,-0.3879,0.060679,-0.593492,...,-0.088697,0.571531,0.433138,-0.577951,0.372595,2.086482,-0.486047,-0.706481,-0.16015,Other Disease


## Merge first 4 datasets

In [12]:
# Concatenate df1 to df4 datasets after harmonization
merged_df_2 = pd.concat([df1_2, df2_2, df3_2, df4_2], ignore_index=True)

In [13]:
merged_df_2.sample(5)

Unnamed: 0,ILMN_1786125,ILMN_1666269,ILMN_1659156,ILMN_1698020,ILMN_1874678,ILMN_2368597,ILMN_1806745,ILMN_1664878,ILMN_1692100,ILMN_1721651,...,ILMN_1889215,ILMN_1811258,ILMN_2233539,ILMN_1658806,ILMN_2347193,ILMN_1723332,ILMN_2337789,ILMN_1869897,ILMN_1786046,TB_Status
669,-1.217389,-1.931446,-0.353552,-0.361715,0.358744,-2.403005,-0.511835,0.353822,-0.380911,-0.466709,...,0.143432,-0.960949,0.943244,-0.27978,0.992163,-0.272594,-0.816123,-0.092873,0.101696,Healthy Control
362,-0.148601,-0.72552,0.943156,-0.202657,-0.198579,0.414902,0.129539,-0.39189,-0.379837,-0.715462,...,0.866407,0.456303,0.324236,1.076032,0.051352,-0.834908,-0.668687,-0.561887,-0.827377,Other Disease
1258,0.011929,1.058332,1.413101,-0.465477,-1.960706,0.631811,-1.649101,-1.085995,0.368398,0.224198,...,-0.591476,0.866849,0.322388,0.944067,-0.325369,-1.232903,0.696745,-1.302979,-0.450345,Other Disease
786,0.503804,0.577888,2.426259,0.530903,0.216714,-0.374933,0.405839,0.540686,0.398083,2.000997,...,-1.781422,-0.959146,-0.752365,0.818461,-0.904962,0.848593,-0.002549,-0.697147,-1.340411,Other Disease
103,-0.434487,0.940954,-0.188762,1.004997,-0.876088,0.606063,0.199861,-0.221433,1.28137,-0.357939,...,0.812981,-0.398661,-0.302608,-0.63598,0.425842,0.244066,-0.24496,-0.03835,-0.588457,Latent TB


In [14]:
# List of features to check
features_to_check = ['ILMN_1774071', 'ILMN_3251610', 'ILMN_1812433', 'ILMN_2105441', 'ILMN_1654875', 'ILMN_1690241']

# Check if each feature exists in the merged_df columns
for feature in features_to_check:
    if feature in merged_df_2.columns:
        print(f"Feature '{feature}' exists in the dataset.")
    else:
        print(f"Feature '{feature}' does not exist in the dataset.")

Feature 'ILMN_1774071' exists in the dataset.
Feature 'ILMN_3251610' does not exist in the dataset.
Feature 'ILMN_1812433' exists in the dataset.
Feature 'ILMN_2105441' exists in the dataset.
Feature 'ILMN_1654875' exists in the dataset.
Feature 'ILMN_1690241' exists in the dataset.


## Check for missing values in 'merged_df' dataset

In [15]:
# Check for missing values and print only columns with missing values
missing_values = merged_df.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)


In [16]:
# Check for missing values and print only columns with missing values
missing_values = merged_df_2.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)


## Split 'merged_df' dataset in to 3 seratate datasets as 'TB_HC_OD' , 'PTB_EPTB' , 'ATB_LTB'

### Dataset 1: 'ATB_OT'

In [17]:
# Dataset 1: 'TB_HC_OD'
# Replace 'Active TB', 'Latent TB', 'Pulmonary TB', and 'Extra Pulmonary TB' with 'TB' in 'TB_Status'
ATB_OT = merged_df_2.copy()
ATB_OT['TB_Status'] = ATB_OT['TB_Status'].replace(
    ['Active TB', 'Pulmonary TB', 'Extra Pulmonary TB', 'TB'], 'Active TB'
)

ATB_OT['TB_Status'] = ATB_OT['TB_Status'].replace(
    ['Latent TB', 'Other Disease', 'Healthy Control'], 'Inactive'
)


In [18]:
# Verifying the shape and preview of each dataset
print("ATB_OT Dataset shape:", ATB_OT.shape)
print("ATB_OT Dataset preview:\n")
ATB_OT.sample(5)

ATB_OT Dataset shape: (1423, 30276)
ATB_OT Dataset preview:



Unnamed: 0,ILMN_1786125,ILMN_1666269,ILMN_1659156,ILMN_1698020,ILMN_1874678,ILMN_2368597,ILMN_1806745,ILMN_1664878,ILMN_1692100,ILMN_1721651,...,ILMN_1889215,ILMN_1811258,ILMN_2233539,ILMN_1658806,ILMN_2347193,ILMN_1723332,ILMN_2337789,ILMN_1869897,ILMN_1786046,TB_Status
279,-0.234128,1.591982,0.933098,-1.358856,-0.212423,0.172585,0.319557,0.670429,1.892993,0.505734,...,2.476021,2.386501,1.691155,1.418201,1.151636,-0.074147,0.640127,-0.15141,-0.386899,Inactive
150,-0.419281,2.490742,-2.688745,1.72542,0.75535,0.320261,0.242244,-0.306273,-1.306168,-0.859692,...,-0.554588,2.468569,-0.666556,-1.649645,-0.807196,-1.124215,-1.068798,-0.451676,-0.001431,Inactive
635,-1.051177,1.469354,-1.840519,-0.39387,-0.689428,0.527987,-0.160894,-1.111333,-0.038347,-0.129003,...,-2.236127,0.831412,-1.080852,-0.319342,0.373716,-0.312983,1.623471,0.560933,1.16934,Active TB
621,-0.174779,0.81856,1.223852,0.514001,0.200153,-0.310967,1.277447,-0.971865,1.51582,-1.791999,...,-0.075538,0.653649,0.221807,0.797665,0.527975,0.827363,-0.439999,-1.033363,-1.625745,Active TB
246,0.159915,-0.85191,1.632017,-0.43919,1.946044,-0.560446,0.097383,-0.173483,0.779514,0.491828,...,0.421839,0.154425,-0.420784,0.741207,2.04591,1.442281,2.714874,1.512308,1.788101,Inactive


In [19]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = ATB_OT['TB_Status'].value_counts()

# Display the class counts
print(class_counts)


TB_Status
Inactive     909
Active TB    514
Name: count, dtype: int64


In [20]:
# Save the DataFrame as a CSV file
output_csv_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/ATB_OT.csv'
ATB_OT.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: /content/drive/MyDrive/Research/TB_new/Datasets/train/processed/ATB_OT.csv


### Dataset 2: 'PTB_EPTB'

In [21]:
# Dataset 2: 'PTB_EPTB' - only rows where TB_Status is 'Pulmonary TB' or 'Extra Pulmonary TB'
PTB_EPTB = merged_df[merged_df['TB_Status'].isin(['Pulmonary TB', 'Extra Pulmonary TB'])]

In [22]:
# Verifying the shape and preview of each dataset
print("\nPTB_EPTB Dataset shape:", PTB_EPTB.shape)
print("PTB_EPTB Dataset preview:\n")
PTB_EPTB.sample(5)


PTB_EPTB Dataset shape: (394, 23190)
PTB_EPTB Dataset preview:



Unnamed: 0,ILMN_1786125,ILMN_1666269,ILMN_1659156,ILMN_1698020,ILMN_1874678,ILMN_2368597,ILMN_1806745,ILMN_1664878,ILMN_1692100,ILMN_1721651,...,ILMN_1756942,ILMN_1670158,ILMN_1889215,ILMN_1811258,ILMN_2233539,ILMN_2347193,ILMN_2337789,ILMN_1869897,ILMN_1786046,TB_Status
720,0.474923,-0.637929,-1.256571,0.669332,0.352354,-2.38316,0.548153,-0.190209,-0.83255,1.051022,...,1.670941,0.499496,-0.074015,0.168123,-0.703194,-1.086831,1.189531,-0.297533,-0.864366,Pulmonary TB
828,0.48796,1.105967,0.139778,-2.512685,0.081495,1.141844,0.620885,0.510938,-1.159385,1.112811,...,2.035421,-1.38421,1.033464,0.572159,1.090151,-2.06607,0.699433,1.31563,-0.765876,Pulmonary TB
1087,1.515203,1.422124,2.300015,-1.426172,-0.091309,-1.114697,0.640508,-1.420215,1.160722,-0.035106,...,0.672093,0.809481,0.871794,0.241775,-1.812429,-0.229192,-0.105009,0.115565,0.667965,Extra Pulmonary TB
627,0.491596,-0.004561,-1.298004,-0.580477,-0.872275,0.335619,0.897896,-1.5883,-0.835596,0.492699,...,-0.102841,-0.639052,-0.791946,0.333456,-0.301056,-0.542108,-0.040715,0.589311,0.363264,Extra Pulmonary TB
957,-1.723561,0.49154,0.603865,-1.442337,0.826891,-0.272243,0.577214,-2.589215,-1.369322,-1.284579,...,1.577055,-0.11793,1.645534,0.092196,1.049513,-0.417619,-0.633928,2.261445,0.690069,Pulmonary TB


In [23]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = PTB_EPTB['TB_Status'].value_counts()

# Display the class counts
print(class_counts)


TB_Status
Extra Pulmonary TB    211
Pulmonary TB          183
Name: count, dtype: int64


In [24]:
# Save the DataFrame as a CSV file
output_csv_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/PTB_EPTB.csv'
PTB_EPTB.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: /content/drive/MyDrive/Research/TB_new/Datasets/train/processed/PTB_EPTB.csv


### Dataset 3: 'LTB_OTHER'

In [25]:
# Dataset 3: 'ATB_LTB' - only rows where TB_Status is 'Latent TB' or 'Active TB'
LTB_OT = merged_df[merged_df['TB_Status'].isin(['Latent TB', 'Healthy Control', 'Other Disease'])]

# Relabel 'Healthy Control' and 'Other Disease' as 'Other'
LTB_OT['TB_Status'] = LTB_OT['TB_Status'].replace(
    ['Healthy Control', 'Other Disease'], 'Other'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  LTB_OT['TB_Status'] = LTB_OT['TB_Status'].replace(


In [26]:
# Verifying the shape and preview of each dataset
print("\nLTB_OT Dataset shape:", LTB_OT.shape)
print("LTB_OT Dataset preview:\n")
LTB_OT.sample(5)


LTB_OT Dataset shape: (1031, 23190)
LTB_OT Dataset preview:



Unnamed: 0,ILMN_1786125,ILMN_1666269,ILMN_1659156,ILMN_1698020,ILMN_1874678,ILMN_2368597,ILMN_1806745,ILMN_1664878,ILMN_1692100,ILMN_1721651,...,ILMN_1756942,ILMN_1670158,ILMN_1889215,ILMN_1811258,ILMN_2233539,ILMN_2347193,ILMN_2337789,ILMN_1869897,ILMN_1786046,TB_Status
283,-0.173271,1.780736,0.693002,2.268473,-0.531632,0.992651,-0.809029,-0.861331,0.990198,1.243209,...,2.82918,-0.057319,2.611001,2.671206,1.47684,0.740324,1.632844,0.752826,0.849715,Other
655,-0.926048,-1.502529,0.288678,-0.81391,-1.101005,-0.087054,-0.779286,0.86759,1.252173,0.245008,...,-1.26735,0.095195,0.262988,-1.984738,0.535979,-0.339576,1.414426,-0.496253,-1.365886,Other
454,-0.953713,0.234795,-0.623661,0.701563,-0.287578,-1.135345,-1.373516,-1.516076,0.798472,0.579415,...,-0.364927,-0.580548,-0.667778,-0.484319,-0.704048,-0.561083,2.161234,-0.073716,1.197342,Other
1348,-0.78406,0.468347,-1.10002,-0.386776,-1.296555,0.711722,0.584973,0.302967,0.556126,-0.140504,...,-0.232801,0.413408,0.171503,0.877063,-1.102869,-0.035734,1.329777,0.688661,1.184594,Other
1411,-0.312581,-0.636206,0.335357,-0.93029,-1.727899,0.854253,-1.541572,0.950829,0.908423,0.982153,...,-0.601286,-0.398869,-1.236482,-0.757853,-1.117454,1.683115,-0.239307,-0.303556,-1.113768,Other


In [27]:
# Get the number of occurrences for each class in the 'TB_Status' column
class_counts = LTB_OT['TB_Status'].value_counts()

# Display the class counts
print(class_counts)

TB_Status
Other        851
Latent TB    180
Name: count, dtype: int64


In [28]:
# Save the DataFrame as a CSV file
output_csv_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/LTB_OT.csv'
LTB_OT.to_csv(output_csv_path, index=False)
print("CSV file saved to:", output_csv_path)

CSV file saved to: /content/drive/MyDrive/Research/TB_new/Datasets/train/processed/LTB_OT.csv
