# TASK 4: Disease Prediction from Medical Data

**Objective:** Predict the likelihood of disease from patient data.

**Approach:** Apply classification techniques to structured medical datasets.

**Key Features:**

- **Features:** symptoms, age, blood test results, etc.
- **Algorithms:** SVM, Logistic Regression, Random Forest, XGBoost.
- **Datasets:** Heart disease, Diabetes, Breast Cancer (UCI ML Repository).

In [None]:
# Library Imports & Environment Setup
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Define Models
# Registry of untrained estimators keyed by the label used when printing results.
# Insertion order is the order in which the models are trained and reported.
models = dict(
    [
        ("Logistic Regression", LogisticRegression(max_iter=1000)),
        ("SVM", SVC()),
        ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

**DIABETES PREDICTION**

In [None]:
# Diabetes Dataset Loading
# Read the diabetes CSV into a DataFrame and preview its first five rows.
diabetes_csv_path = "/content/diabetes_Kaggle_CA_Task_4.csv"
diabetes_df = pd.read_csv(diabetes_csv_path)
diabetes_df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Diabetes Data Preprocessing
# Separate the "Outcome" label column from the remaining feature columns.
y_diabetes = diabetes_df["Outcome"]
X_diabetes = diabetes_df.drop(columns=["Outcome"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
diabetes_split = train_test_split(
    X_diabetes, y_diabetes, test_size=0.2, random_state=42
)
X_train_d, X_test_d, y_train_d, y_test_d = diabetes_split

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train_d)
X_train_d, X_test_d = scaler.transform(X_train_d), scaler.transform(X_test_d)

In [None]:
# Diabetes Disease Prediction (train and evaluate each model)
# Each estimator in `models` is treated as an unfitted template: a clone is
# fitted on the diabetes training split, so the shared templates are never
# mutated and the models fitted here cannot be overwritten when the same
# templates are used for another dataset. The fitted copies are kept in
# `diabetes_models`, keyed by model name, for later inspection.
diabetes_models = {}

print("----- Diabetes Prediction Results -----")
for name, model in models.items():
    fitted_model = clone(model).fit(X_train_d, y_train_d)
    diabetes_models[name] = fitted_model

    # Accuracy on the held-out diabetes test split.
    y_pred = fitted_model.predict(X_test_d)
    acc = accuracy_score(y_test_d, y_pred)
    print(f"{name} Accuracy: {acc:.2f}")

----- Diabetes Prediction Results -----
Logistic Regression Accuracy: 0.75
SVM Accuracy: 0.73
Random Forest Accuracy: 0.72


**BREAST CANCER PREDICTION**

In [None]:
# Breast Cancer Dataset Loading
# Read the breast cancer CSV into a DataFrame and preview its first five rows.
cancer_csv_path = "/content/breast cancer data_kaggle_CA_Task_4.csv"
cancer_df = pd.read_csv(cancer_csv_path)
cancer_df.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [None]:
# Breast Cancer Data Preprocessing
# Remove the record identifier and the "Unnamed: 32" column (shown empty in the
# preview). errors="ignore" lets the cell run again after the columns are gone.
cancer_df = cancer_df.drop(columns=["id", "Unnamed: 32"], errors="ignore")

# Encode the target column: malignant ("M") -> 1, benign ("B") -> 0.
# The encoding is skipped when the column is already numeric, so re-running this
# cell does not turn the previously encoded 1/0 labels into NaN.
diagnosis_codes = {"M": 1, "B": 0}
if not pd.api.types.is_numeric_dtype(cancer_df["diagnosis"]):
    encoded_diagnosis = cancer_df["diagnosis"].map(diagnosis_codes)
    unmapped = encoded_diagnosis.isna()
    if unmapped.any():
        # Fail loudly instead of passing NaN labels on to the classifiers.
        unexpected = sorted(set(cancer_df.loc[unmapped, "diagnosis"].astype(str)))
        raise ValueError(f"Unexpected diagnosis labels: {unexpected}")
    cancer_df["diagnosis"] = encoded_diagnosis

# Separate features and target.
X_cancer = cancer_df.drop("diagnosis", axis=1)
y_cancer = cancer_df["diagnosis"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_cancer, y_cancer, test_size=0.2, random_state=42
)

# Use a fresh scaler for this dataset so the cell does not depend on (or refit)
# a scaler object left over from an earlier section. Statistics are learned
# from the training rows only.
scaler = StandardScaler()
X_train_c = scaler.fit_transform(X_train_c)
X_test_c = scaler.transform(X_test_c)

In [None]:
# Breast Cancer Disease Prediction (train and evaluate each model)
# Each estimator in `models` is treated as an unfitted template: a clone is
# fitted on the breast cancer training split, so the shared templates are never
# mutated and the models fitted here cannot be overwritten when the same
# templates are used for another dataset. The fitted copies are kept in
# `cancer_models`, keyed by model name, for later inspection.
cancer_models = {}

print("\n----- Breast Cancer Prediction Results -----")
for name, model in models.items():
    fitted_model = clone(model).fit(X_train_c, y_train_c)
    cancer_models[name] = fitted_model

    # Accuracy on the held-out breast cancer test split.
    y_pred = fitted_model.predict(X_test_c)
    acc = accuracy_score(y_test_c, y_pred)
    print(f"{name} Accuracy: {acc:.2f}")


----- Breast Cancer Prediction Results -----
Logistic Regression Accuracy: 0.97
SVM Accuracy: 0.98
Random Forest Accuracy: 0.96


**HEART DISEASE PREDICTION (UCI)**

In [None]:
# Heart Disease Dataset Loading
# Read the heart disease CSV into a DataFrame and preview its first five rows.
heart_csv_path = "/content/heart_disease_uci_kaggle_CA_Task_4.csv"
heart_df = pd.read_csv(heart_csv_path)
heart_df.head(5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [None]:
# Check which columns are stored with the object dtype (string-like data)
object_columns = heart_df.select_dtypes(include=["object"]).columns
object_columns

Index(['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal'], dtype='object')

In [None]:
# Convert categorical columns to numeric
# Rows with a missing categorical value are removed BEFORE encoding. With the
# default dummy_na=False, get_dummies encodes a missing value as all-zero
# indicator columns, which is indistinguishable from the category removed by
# drop_first=True and can no longer be detected by a later dropna().
object_cols = list(heart_df.select_dtypes(include="object").columns)
if object_cols:
    heart_df = heart_df.dropna(subset=object_cols)

# One-hot encode every remaining object column; drop_first removes one
# redundant indicator per column.
heart_df = pd.get_dummies(heart_df, drop_first=True)

In [None]:
# Heart Disease Data Preprocessing
# Discard every row that still contains a missing value.
heart_df = heart_df.dropna()

# Locate the label column: "target" is preferred, "num" is the fallback.
target_col = next(
    (candidate for candidate in ("target", "num") if candidate in heart_df.columns),
    None,
)
if target_col is None:
    raise ValueError("Target column not found in heart dataset")

# Collapse the label to binary: any value above 0 becomes 1, everything else 0.
y_heart = (heart_df[target_col] > 0).astype("int64")

# Features are all remaining columns; the record "id" is excluded when present
# because it is not a feature.
X_heart = heart_df.drop(columns=[target_col])
X_heart = X_heart.drop(columns=["id"], errors="ignore")

# Cast the boolean columns to integers when they are present under these names.
bool_like_cols = [col for col in ("fbs", "exang") if col in X_heart.columns]
if bool_like_cols:
    X_heart = X_heart.astype({col: int for col in bool_like_cols})

# One-hot encode any column that is still of object dtype; drop_first removes
# one indicator per column to avoid multicollinearity.
categorical_cols = X_heart.select_dtypes(include=["object"]).columns
X_heart = pd.get_dummies(X_heart, columns=categorical_cols, drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
heart_split = train_test_split(
    X_heart, y_heart, test_size=0.2, random_state=42
)
X_train_h, X_test_h, y_train_h, y_test_h = heart_split

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train_h)
X_train_h, X_test_h = scaler.transform(X_train_h), scaler.transform(X_test_h)

In [None]:
# Heart Disease Prediction (train and evaluate each model)
# Each estimator in `models` is treated as an unfitted template: a clone is
# fitted on the heart disease training split, so the shared templates are never
# mutated and the models fitted here cannot be overwritten when the same
# templates are used for another dataset. The fitted copies are kept in
# `heart_models`, keyed by model name, for later inspection.
heart_models = {}

print("\n----- Heart Disease Prediction -----")
for name, model in models.items():
    fitted_model = clone(model).fit(X_train_h, y_train_h)
    heart_models[name] = fitted_model

    # Accuracy on the held-out heart disease test split.
    y_pred = fitted_model.predict(X_test_h)
    print(f"{name} Accuracy: {accuracy_score(y_test_h, y_pred):.2f}")


----- Heart Disease Prediction -----
Logistic Regression Accuracy: 0.87
SVM Accuracy: 0.84
Random Forest Accuracy: 0.85
