In [4]:
# Mount Google Drive at /content/drive so that files under
# /content/drive/MyDrive can be accessed by path from this notebook.
# Requires the google.colab package to be importable in the running kernel.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os


In [6]:
# Location of the raw training CSV on the mounted Drive
file_path = '/content/drive/MyDrive/00_Final_Project_MENG/Dataset_1/IQVIA_training_data.csv'

# Read the CSV into a DataFrame, then report its dimensions
df = pd.read_csv(file_path)
n_rows, n_cols = df.shape
print(f"Dataset loaded: {n_rows} rows, {n_cols} columns")

Dataset loaded: 2142 rows, 1877 columns


In [7]:
# Gate 1: abort if any cell of the frame is null
if df.isna().to_numpy().any():
    raise ValueError("The dataset has missing values.")
print("No missing values detected.")

# Gate 2: abort unless the number of distinct patient_id values equals the row count
if df['patient_id'].nunique() != len(df):
    raise ValueError("Each row does not represent a unique patient.")
print("Each row represents a unique patient.")

No missing values detected.
Each row represents a unique patient.


In [8]:
# Build the binary target, drop identifier and leakage columns, create a
# stratified 70/15/15 train/validation/test split, and save each split as CSV.

# Single definitions for values that were previously repeated inline
OUTPUT_DIR = '/content/drive/MyDrive/00_Final_Project_MENG/Dataset_1'
RANDOM_STATE = 42  # shared by both splits so the partition is reproducible

# Create target variable from cohort_flag via its string form.
# Series.map leaves every value that is not a dictionary key as NaN, so the
# result is validated BEFORE df is modified: a flag stored as e.g. 1.0/0.0
# (string form '1.0') or any other label would otherwise produce a silently
# broken target and leave df half-processed.
target = df['cohort_flag'].map(str).map({'1': 1, '0': 0})
unmapped = target.isnull()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'cohort_flag'].map(str).unique())
    raise ValueError(f"cohort_flag contains values other than 0/1: {unexpected}")

# Columns removed from the modelling frame: identifiers / raw cohort labels,
# plus features flagged as potential data leakage
id_columns = ['patient_id', 'cohort_type', 'cohort_flag']
leak_features = ['cardiomyopathy_in_diseases_classified_elsewhere', 'other_forms_of_heart_disease']

# Append the target as the last column and drop everything above in one step.
# drop() raises KeyError if any listed column is absent, e.g. when this cell is
# re-run without reloading df.
df = df.assign(__target__=target).drop(columns=id_columns + leak_features)

# First split: 70% train, 30% temp (val + test), stratified on the target
train_df, temp_df = train_test_split(
    df, test_size=0.3, stratify=df['__target__'], random_state=RANDOM_STATE
)

# Second split: the 30% temp is halved into 15% val and 15% test
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['__target__'], random_state=RANDOM_STATE
)

# Confirm sizes
print(f"Train set: {train_df.shape}")
print(f"Validation set: {val_df.shape}")
print(f"Test set: {test_df.shape}")

# Save each split to CSV without the index column
splits = {
    'train_data.csv': train_df,
    'val_data.csv': val_df,
    'test_data.csv': test_df,
}
for file_name, split_df in splits.items():
    split_df.to_csv(os.path.join(OUTPUT_DIR, file_name), index=False)
print("Saved train_data.csv, val_data.csv, and test_data.csv")


Train set: (1499, 1873)
Validation set: (321, 1873)
Test set: (322, 1873)
Saved train_data.csv, val_data.csv, and test_data.csv
