In [1]:
# Mount Google Drive at /content/drive so that the Drive-hosted CSV paths used by
# the later cells (everything under /content/drive/MyDrive/...) resolve.
# NOTE(review): google.colab is presumably only available inside a Colab runtime,
# so this cell ties the notebook to Colab — confirm that is intended.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import os

## Balancing ATB_OT Dataset

In [3]:
# Read the ATB_OT file from the train/processed folder on Drive.
file_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/ATB_OT.csv'
df = pd.read_csv(file_path)

# 'TB_Status' is the label; every remaining column is handed to SMOTE as a feature.
y = df['TB_Status']
X = df.drop(columns=['TB_Status'])

# Oversample with SMOTE. Per the imbalanced-learn docs, sampling_strategy='auto'
# resamples every class except the majority one; the fixed random_state keeps the
# synthetic rows reproducible between runs.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Stitch the resampled features and labels back into a single frame, label last.
features_frame = pd.DataFrame(X_resampled, columns=X.columns)
labels_frame = pd.DataFrame(y_resampled, columns=['TB_Status'])
ATB_OT = pd.concat([features_frame, labels_frame], axis=1)

# Persist the balanced frame, creating the train/balanced folder when it is missing.
path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/balanced/ATB_OT.csv'
os.makedirs(os.path.dirname(path), exist_ok=True)
ATB_OT.to_csv(path, index=False)

print("ATB_OT dataset balanced successfully!")


ATB_OT dataset balanced successfully!


In [4]:
# Tally how many rows of the balanced ATB_OT frame carry each 'TB_Status' label.
class_counts = ATB_OT.loc[:, 'TB_Status'].value_counts()

# Print the per-class totals so the effect of the resampling can be checked by eye.
print(class_counts)

TB_Status
Inactive     909
Active TB    909
Name: count, dtype: int64


## Balancing PTB_EPTB Dataset

In [5]:
# Load the PTB_EPTB file from the train/processed folder on Drive.
file_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/PTB_EPTB.csv'
df = pd.read_csv(file_path)

# Extract features and target variable: 'TB_Status' is the label, every other
# column is passed to SMOTE as a feature.
X = df.drop(columns=['TB_Status'])
y = df['TB_Status']

# Apply SMOTE. Per the imbalanced-learn docs, sampling_strategy='auto' resamples
# every class except the majority one; random_state=42 makes the run repeatable.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Recombine the resampled features and labels into one frame, label column last.
PTB_EPTB = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['TB_Status'])], axis=1)

# Save the balanced dataset. DataFrame.to_csv does not create missing parent
# folders, so make sure train/balanced exists first (the ATB_OT cell already does
# this); otherwise this cell only works if the ATB_OT cell happened to run before.
output_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/balanced/PTB_EPTB.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
PTB_EPTB.to_csv(output_path, index=False)

print("PTB_EPTB dataset balanced successfully!")


PTB_EPTB dataset balanced successfully!


In [6]:
# Tally how many rows of the balanced PTB_EPTB frame carry each 'TB_Status' label.
class_counts = PTB_EPTB.loc[:, 'TB_Status'].value_counts()

# Print the per-class totals so the effect of the resampling can be checked by eye.
print(class_counts)

TB_Status
Extra Pulmonary TB    211
Pulmonary TB          211
Name: count, dtype: int64


## Balancing LTB_OT Dataset

In [7]:
# Load the LTB_OT file from the train/processed folder on Drive.
file_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/processed/LTB_OT.csv'
df = pd.read_csv(file_path)

# Extract features and target variable: 'TB_Status' is the label, every other
# column is passed to SMOTE as a feature.
X = df.drop(columns=['TB_Status'])
y = df['TB_Status']

# Apply SMOTE. Per the imbalanced-learn docs, sampling_strategy='auto' resamples
# every class except the majority one; random_state=42 makes the run repeatable.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Recombine the resampled features and labels into one frame, label column last.
LTB_OT = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['TB_Status'])], axis=1)

# Save the balanced dataset. DataFrame.to_csv does not create missing parent
# folders, so make sure train/balanced exists first (the ATB_OT cell already does
# this); otherwise this cell only works if the ATB_OT cell happened to run before.
output_path = r'/content/drive/MyDrive/Research/TB_new/Datasets/train/balanced/LTB_OT.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
LTB_OT.to_csv(output_path, index=False)

print("LTB_OT dataset balanced successfully!")


LTB_OT dataset balanced successfully!


In [8]:
# Tally how many rows of the balanced LTB_OT frame carry each 'TB_Status' label.
class_counts = LTB_OT.loc[:, 'TB_Status'].value_counts()

# Print the per-class totals so the effect of the resampling can be checked by eye.
print(class_counts)

TB_Status
Healthy Control    608
Latent TB          608
Other Disease      608
Name: count, dtype: int64
