In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load dataset
data = pd.read_csv('C:/Users/adhar/Jupiter notebook/Python/Data Analsis/Data-analysis/PCA/ilpd_data.csv')

# Preprocess data (convert categorical variables, handle missing values)
data['GENDER'] = data['GENDER'].map({'Male': 0, 'Female': 1})
data.fillna(data.mean(), inplace=True)

# Separate features and target variable
X = data.drop(columns=['SELECTOR'])
y = data['SELECTOR'].map({1: 1, 2: 0})  # Convert to binary classification

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model performance
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Feature importance
importances = rf_model.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)


[[ 7 23]
 [15 70]]
              precision    recall  f1-score   support

           0       0.32      0.23      0.27        30
           1       0.75      0.82      0.79        85

    accuracy                           0.67       115
   macro avg       0.54      0.53      0.53       115
weighted avg       0.64      0.67      0.65       115

     Feature  Importance
4    ALKPHOS    0.142484
6       SGOT    0.141762
0        AGE    0.139777
5       SGPT    0.125022
2         TB    0.097638
7         TP    0.087892
8        ALB    0.086558
9  A/G RATIO    0.079805
3         DB    0.079717
1     GENDER    0.019346
