In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Configuration
SEED = 42  # fixed seed so the forest, and therefore the printed metrics, are reproducible

# 1. Reading the Datasets
# NOTE(review): these are absolute, machine-specific paths; repoint them (or
# move them behind a configurable data directory) before sharing the notebook.
Train_df = pd.read_csv(r"C:\Users\hp\Downloads\Diseases prediction dataset\Training.csv")
Test_df = pd.read_csv(r"C:\Users\hp\Downloads\Diseases prediction dataset\Testing.csv")

# Drop the unnecessary column if it exists.
# errors="ignore" makes the drop a no-op when the column is absent, and the
# non-inplace form leaves the frames returned by read_csv untouched.
Train_df = Train_df.drop(columns='Unnamed: 133', errors='ignore')
Test_df = Test_df.drop(columns='Unnamed: 133', errors='ignore')

# Splitting features and target variable
X_train = Train_df.drop('prognosis', axis=1)
y_train = Train_df['prognosis']

X_test = Test_df.drop('prognosis', axis=1)
y_test = Test_df['prognosis']

# Diagnose: check whether any column in X_train consists solely of NaN values.
# Such columns carry no information for the mean imputer, so they are removed
# from both splits before imputation.
all_nan_columns = X_train.columns[X_train.isnull().all()].tolist()
if all_nan_columns:
    print("Columns with all NaN values:", all_nan_columns)
    X_train = X_train.drop(columns=all_nan_columns)
    # The test file may not contain every column found in the training file;
    # ignore the ones that are missing instead of raising KeyError.
    X_test = X_test.drop(columns=all_nan_columns, errors="ignore")

# 2. Handle Missing Values Using Imputation
num_imputer = SimpleImputer(strategy="mean")  # for numeric columns

# Impute and retain the column names. The original row index is kept as well,
# so the feature frames stay aligned with y_train / y_test, which still carry
# the index of the frames they were sliced from.
# The imputer is fitted on the training split only and then applied to the test split.
X_train_imputed = pd.DataFrame(
    num_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_imputed = pd.DataFrame(
    num_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# 3. Train the Model
model = RandomForestClassifier(random_state=SEED)
model.fit(X_train_imputed, y_train)

# 4. Test the Model and Display Results
y_pred = model.predict(X_test_imputed)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9761904761904762
Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       0.50      1.00      0.67         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      

In [None]:
# Model Evaluation and Optimization:

## a. Cross-Validation:

In [2]:
# Cross-validate the whole preprocessing + model pipeline on the un-imputed
# features instead of scoring the model on X_train_imputed. X_train_imputed was
# produced by an imputer fitted on the entire training set, so every validation
# fold had already contributed to the column means used to fill its own
# training folds. Inside a pipeline the imputer is re-fitted on the training
# part of each fold only.
# clone() yields unfitted copies with the same settings, leaving the already
# fitted `num_imputer` and `model` from the training cell untouched.
cv_pipeline = make_pipeline(clone(num_imputer), clone(model))
scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)
print("Average cross-validation score: {:.2f}".format(scores.mean()))


Average cross-validation score: 1.00
