<a href="https://colab.research.google.com/github/Nzau1/DR.-DIAGNOSIS/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import os

# Define file path
drive_path = "/content/drive/MyDrive/diagnosis/Final_Augmented_dataset_Diseases_and_Symptoms.csv"

# Load dataset
df = pd.read_csv(drive_path)

# Display dataset info
print("\nDataset Info:")
print(df.info())

# Display first few rows
print("\nFirst 5 rows:")
print(df.head())

# Handle missing values
df.fillna(method='ffill', inplace=True)  # Forward fill for missing values

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Encode categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for later use

# Normalize numerical features (if applicable)
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Splitting into features and target
X = df.drop(columns=['diseases'])  # Changed 'Disease' to 'diseases'
y = df['diseases']  # Changed 'Disease' to 'diseases'

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save preprocessed data back to Google Drive
output_folder = "/content/drive/MyDrive/diagnosis/processed_data"
os.makedirs(output_folder, exist_ok=True)

X_train.to_csv(f"{output_folder}/X_train.csv", index=False)
X_test.to_csv(f"{output_folder}/X_test.csv", index=False)
y_train.to_csv(f"{output_folder}/y_train.csv", index=False)
y_test.to_csv(f"{output_folder}/y_test.csv", index=False)

print("\n✅ Data Preprocessing Complete! Preprocessed data saved in Google Drive.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246945 entries, 0 to 246944
Columns: 378 entries, diseases to neck weakness
dtypes: int64(377), object(1)
memory usage: 712.2+ MB
None

First 5 rows:
         diseases  anxiety and nervousness  depression  shortness of breath  \
0  panic disorder                        1           0                    1   
1  panic disorder                        0           0                    1   
2  panic disorder                        1           1                    1   
3  panic disorder                        1           0                    0   
4  panic disorder                        1           1                    0   

   depressive or psychotic symptoms  sharp chest pain  dizziness  insomnia  \
0                                 1                 0          0         0   
1                        

  df.fillna(method='ffill', inplace=True)  # Forward fill for missing values



✅ Data Preprocessing Complete! Preprocessed data saved in Google Drive.


In [3]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define the folder path
folder_path = "/content/drive/MyDrive/diagnosis"

# Create the folder if it doesn't exist
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    print(f"✅ Folder 'diagnosis' created successfully in Google Drive!")
else:
    print(f"⚠️ Folder 'diagnosis' already exists in Google Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
⚠️ Folder 'diagnosis' already exists in Google Drive.
