In [16]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

lung_cancer_df = pd.read_csv('survey lung cancer.csv')
heart_disease_df = pd.read_csv('heart.csv')
diabetes_df = pd.read_csv('diabetes.csv')
kidney_disease_df = pd.read_csv('kidney_disease.csv')

lung_cancer_df['Disease'] = 'Lung_Cancer'
heart_disease_df['Disease'] = 'Heart_Disease'
diabetes_df['Disease'] = 'Diabetes'
kidney_disease_df['Disease'] = 'Kidney_Disease'

combined_df = pd.concat([lung_cancer_df, heart_disease_df, diabetes_df, kidney_disease_df], axis=0, ignore_index=True)

le = LabelEncoder()
combined_df['Disease'] = le.fit_transform(combined_df['Disease'])

numeric_cols = combined_df.select_dtypes(include=['float64', 'int64']).columns
combined_df[numeric_cols] = combined_df[numeric_cols].fillna(combined_df[numeric_cols].mean())

X = combined_df.drop('Disease', axis=1)
y = combined_df['Disease']

X = pd.get_dummies(X, drop_first=True)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

clf = RandomForestClassifier(
    n_estimators=10,         
    max_depth=2,             
    max_features='sqrt',      
    min_samples_split=7,    
    min_samples_leaf=3,       
    random_state=42
)

cv_scores = cross_val_score(clf, X_scaled, y, cv=skf, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())

clf.fit(X_scaled, y)
y_pred = clf.predict(X_scaled)
print("Final Model Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:\n", classification_report(y, y_pred, target_names=le.classes_))


Cross-Validation Accuracy Scores: [0.94411178 0.90219561 0.902      0.862      0.878     ]
Mean Cross-Validation Accuracy: 0.8976614770459082
Final Model Accuracy: 0.8972821742605915
Classification Report:
                 precision    recall  f1-score   support

      Diabetes       0.75      1.00      0.86       768
 Heart_Disease       1.00      1.00      1.00      1025
Kidney_Disease       1.00      0.91      0.95       400
   Lung_Cancer       1.00      0.30      0.47       309

      accuracy                           0.90      2502
     macro avg       0.94      0.80      0.82      2502
  weighted avg       0.92      0.90      0.88      2502

