In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Read the labelled notes from the Excel workbook, then preview the first three rows.
raw_data_path = "../db/data.xlsx"
df = pd.read_excel(raw_data_path)
df.head(n=3)

Unnamed: 0,Class,Notes
0,4,Catheterization laboratory events and hospital...
1,5,Renal abscess in children. Three cases of rena...
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...


In [4]:
# Stratified train / test / validation split.
#
# 1. Separate the features (X) from the target variable (y).
# 2. Call train_test_split twice: the first call carves a hold-out set off the
#    full data, the second call divides that hold-out set into test and
#    validation. Both calls stratify on the target so each subset keeps the
#    class distribution of the data it was drawn from.
# 3. Resulting shares of the full dataset:
#    train 75%, test 20% (0.25 * 0.80), validation 5% (0.25 * 0.20).

HOLDOUT_FRACTION = 0.25       # share of all rows withheld from training
VAL_SHARE_OF_HOLDOUT = 0.20   # share of the hold-out rows (NOT of all rows) used for validation
SPLIT_SEED = 42               # fixed seed so both splits are reproducible

X = df.drop(columns=['Class'])  # Features
y = df['Class']  # Target variable

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, stratify=y, random_state=SPLIT_SEED
)
# In this second call the "train" side of the split becomes the test set and
# the "test" side becomes the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=VAL_SHARE_OF_HOLDOUT, stratify=y_temp, random_state=SPLIT_SEED
)

# Sanity check: the three subsets must account for every row exactly once.
assert len(X_train) + len(X_test) + len(X_val) == len(df), "split lost or duplicated rows"

# Relative class frequencies per subset, ordered by class label.
train_distribution = y_train.value_counts(normalize=True).sort_index()
test_distribution = y_test.value_counts(normalize=True).sort_index()
val_distribution = y_val.value_counts(normalize=True).sort_index()

# Print the three distributions; a blank line separates consecutive reports.
separator = ""
for subset_name, distribution in (
    ("Train", train_distribution),
    ("Test", test_distribution),
    ("Validation", val_distribution),
):
    print(f"{separator}Class Distribution in {subset_name} Set:")
    print(distribution)
    separator = "\n"

Class Distribution in Train Set:
1    0.219062
2    0.103436
3    0.133358
4    0.211304
5    0.332841
Name: Class, dtype: float64

Class Distribution in Test Set:
1    0.219183
2    0.103532
3    0.133310
4    0.211219
5    0.332756
Name: Class, dtype: float64

Class Distribution in Validation Set:
1    0.218837
2    0.103878
3    0.132964
4    0.211911
5    0.332410
Name: Class, dtype: float64


In [8]:
# Show the (rows, columns) shape of the train, test and validation feature frames.
tuple(subset.shape for subset in (X_train, X_test, X_val))

((10828, 1), (2888, 1), (722, 1))

* We now have about 10.8k records for training, 2.9k records for testing, and 700 records for validating the ML models.

In [9]:
# Re-attach each label column to its feature frame (column-wise) so every
# exported subset is self-contained.
train_df, test_df, val_df = (
    pd.concat([features, labels], axis="columns")
    for features, labels in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
)

# Destination workbooks for the three subsets
train_file, test_file, val_file = (
    "../db/train_data.xlsx",
    "../db/test_data.xlsx",
    "../db/validation_data.xlsx",
)

# Write each subset to its workbook, leaving the row index out of the file
for subset_df, destination in zip(
    (train_df, test_df, val_df),
    (train_file, test_file, val_file),
):
    subset_df.to_excel(destination, index=False)