In [None]:
import pandas as pd

# Load the raw dataset and show a preview of its first rows.
try:
    df = pd.read_csv("../Dataset/hd_dataset.csv")

    print(" Dataset loaded successfully.")

    # Rich (HTML) preview of the first five rows.
    display(df.head())

except FileNotFoundError as err:
    # Report the path that actually failed to open, so the message cannot
    # drift out of sync with the read_csv call above.
    print(f"Error: '{err.filename}' not found.")
    print("Please make sure the file exists at that path, relative to the notebook's working directory.")

✅ Dataset loaded successfully.


Unnamed: 0,Patient_ID,Age,Sex,Family_History,HTT_CAG_Repeat_Length,Age_of_Onset,Motor_Score,Cognitive_Score,Chorea_Score,Functional_Capacity_Score,Gene/Factor,Function,Effect,Category,Disease_Stage
0,HD_0001,69,Male,No,48,55,54,37,10.0,35,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Severe
1,HD_0002,80,Female,No,38,78,0,94,0.0,98,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Middle
2,HD_0003,62,Female,Yes,44,62,22,86,10.0,62,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Middle
3,HD_0004,80,Female,Yes,35,80,0,93,0.0,96,HTT (Somatic Expansion),CAG Repeat Instability,Faster Disease Onset,Cis-acting Modifier,No Disease
4,HD_0005,67,Male,Yes,48,57,35,66,10.0,66,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Middle


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress every warning category for the remainder of the session.
warnings.simplefilter('ignore')


# Location of the raw CSV, relative to the notebook's working directory.
RAW_DATA_FILE = "../Dataset/hd_dataset.csv"

# Read the raw file into the working DataFrame.
print(f"Loading raw data from '{RAW_DATA_FILE}'...")
df = pd.read_csv(filepath_or_buffer=RAW_DATA_FILE)

# NOTE(review): the recorded preview shows an 'Unnamed: 0' column, which looks
# like a row index that was written into the CSV - confirm whether it should
# be dropped or read with index_col=0.
print("\nFirst 5 rows of the raw dataset:")
display(df.head(5))

# (rows, columns) of the freshly loaded frame.
print(f"\nInitial dataset shape: {df.shape}")

Loading raw data from 'hd_dataset.csv'...

First 5 rows of the raw dataset:


Unnamed: 0,Patient_ID,Age,Sex,Family_History,HTT_CAG_Repeat_Length,Age_of_Onset,Motor_Score,Cognitive_Score,Chorea_Score,Functional_Capacity_Score,Gene/Factor,Function,Effect,Category,Disease_Stage
0,HD_0001,69,Male,No,48,55,54,37,10.0,35,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Severe
1,HD_0002,80,Female,No,38,78,0,94,0.0,98,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Middle
2,HD_0003,62,Female,Yes,44,62,22,86,10.0,62,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Middle
3,HD_0004,80,Female,Yes,35,80,0,93,0.0,96,HTT (Somatic Expansion),CAG Repeat Instability,Faster Disease Onset,Cis-acting Modifier,No Disease
4,HD_0005,67,Male,Yes,48,57,35,66,10.0,66,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Middle



Initial dataset shape: (48768, 15)


In [14]:
# Fill missing numerical values with the median of their respective column.
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    if df[col].isna().any():
        # Assign the filled Series back to the frame. Calling
        # fillna(inplace=True) on df[col] mutates an intermediate object:
        # pandas 2.x warns about that pattern, and under Copy-on-Write it
        # leaves df itself unchanged.
        df[col] = df[col].fillna(df[col].median())
        


In [15]:
print("Performing feature extraction...")

# Both source columns must be present before the feature can be derived.
duration_inputs = ['Age', 'Age_of_Onset']

if not all(name in df.columns for name in duration_inputs):
    print("Could not create 'Disease_Duration' as 'Age' or 'Age_of_Onset' is missing.")
else:
    # Current age minus age of onset; negative differences are floored at 0.
    age_minus_onset = df['Age'] - df['Age_of_Onset']
    df['Disease_Duration'] = age_minus_onset.clip(lower=0)
    print("Created new feature: 'Disease_Duration'.")

    print("\nDataset with the new 'Disease_Duration' column:")
    display(df.head())

Performing feature extraction...
Created new feature: 'Disease_Duration'.

Dataset with the new 'Disease_Duration' column:


Unnamed: 0,Patient_ID,Age,Sex,Family_History,HTT_CAG_Repeat_Length,Age_of_Onset,Motor_Score,Cognitive_Score,Chorea_Score,Functional_Capacity_Score,Gene/Factor,Function,Effect,Category,Disease_Stage,Disease_Duration
0,HD_0001,69,Male,No,48,55,54,37,10.0,35,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Severe,14
1,HD_0002,80,Female,No,38,78,0,94,0.0,98,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Middle,2
2,HD_0003,62,Female,Yes,44,62,22,86,10.0,62,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Middle,0
3,HD_0004,80,Female,Yes,35,80,0,93,0.0,96,HTT (Somatic Expansion),CAG Repeat Instability,Faster Disease Onset,Cis-acting Modifier,No Disease,0
4,HD_0005,67,Male,Yes,48,57,35,66,10.0,66,HTT,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause,Middle,10


In [17]:
print("Performing Label Encoding on all text-based columns...")
print(f"Shape of DataFrame before encoding: {df.shape}")

# One fitted encoder per column, keyed by column name. Refitting a single
# shared encoder replaces its classes_ on every iteration, so only the last
# column's integer-to-label mapping would survive. Keeping each encoder makes
# every column reversible via label_encoders[col].inverse_transform(...).
label_encoders = {}

for col in df.select_dtypes(include=['object']).columns:
    # `le` is rebound on each pass, so after the loop it still refers to the
    # encoder fitted on the last text column, as it did before this change.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    print(f"Label encoded '{col}'.")

print(f"\nShape of DataFrame after encoding: {df.shape} (No new columns added)")

print("\nFirst 5 rows of the fully numerical DataFrame:")
display(df.head())

Performing Label Encoding on all text-based columns...
Shape of DataFrame before encoding: (48768, 16)

Shape of DataFrame after encoding: (48768, 16) (No new columns added)

First 5 rows of the fully numerical DataFrame:


Unnamed: 0,Patient_ID,Age,Sex,Family_History,HTT_CAG_Repeat_Length,Age_of_Onset,Motor_Score,Cognitive_Score,Chorea_Score,Functional_Capacity_Score,Gene/Factor,Function,Effect,Category,Disease_Stage,Disease_Duration
0,0,69,1,0,48,55,54,37,10.0,35,1,1,3,1,3,14
1,1,80,0,0,38,78,0,94,0.0,98,1,1,3,1,1,2
2,2,62,0,1,44,62,22,86,10.0,62,1,1,3,1,1,0
3,3,80,0,1,35,80,0,93,0.0,96,2,0,2,0,2,0
4,4,67,1,1,48,57,35,66,10.0,66,1,1,3,1,1,10


In [None]:

# Destination of the preprocessed CSV, relative to the notebook's working
# directory.
PROCESSED_CLEAN_FILE = "../Dataset/pre_processed_dataset.csv"

# Write the frame without its row index; a failed write is reported rather
# than raised, so the notebook keeps running.
try:
    df.to_csv(path_or_buf=PROCESSED_CLEAN_FILE, index=False)
except Exception as save_error:
    print(f"Error saving file: {save_error}")
else:
    print(f"\nSuccessfully saved fully preprocessed data to '{PROCESSED_CLEAN_FILE}'")


Successfully saved fully preprocessed data to 'pre_processed_dataset.csv'


In [None]:
import pandas as pd

# Reload the preprocessed dataset from disk and show a preview of its first rows.
try:
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv("../Dataset/pre_processed_dataset.csv")

    print("✅ Dataset loaded successfully.")

    # Display the first 5 rows of the DataFrame
    display(df.head())

except FileNotFoundError as err:
    # Report the path that actually failed to open, so the message cannot
    # drift out of sync with the read_csv call above.
    print(f"Error: '{err.filename}' not found.")
    print("Please make sure the file exists at that path, relative to the notebook's working directory.")

✅ Dataset loaded successfully.


Unnamed: 0,Patient_ID,Age,Sex,Family_History,HTT_CAG_Repeat_Length,Age_of_Onset,Motor_Score,Cognitive_Score,Chorea_Score,Functional_Capacity_Score,Gene/Factor,Function,Effect,Category,Disease_Stage,Disease_Duration
0,0,69,1,0,48,55,54,37,10.0,35,1,1,3,1,3,14
1,1,80,0,0,38,78,0,94,0.0,98,1,1,3,1,1,2
2,2,62,0,1,44,62,22,86,10.0,62,1,1,3,1,1,0
3,3,80,0,1,35,80,0,93,0.0,96,2,0,2,0,2,0
4,4,67,1,1,48,57,35,66,10.0,66,1,1,3,1,1,10
