In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the separate train and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('Hepatitis-Train.csv', 'Hepatitis-Test.csv')
)

In [3]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,Age,Sex,Steroid,Antivirals,Fatigue,Malaise,Anorexia,Liver Big,Liver Firm,Spleen Palpable,Spiders,Ascites,Varices,Bilirubin,ALK Phosphate,SGOT,Albumin,PROTIME,Histology,TARGET
0,30,Female,no,yes,yes,yes,yes,no,yes,yes,yes,yes,yes,1.0,85,18,4.0,62.16,no,2
1,50,Male,no,yes,no,yes,yes,no,yes,yes,yes,yes,yes,0.9,135,42,3.5,62.16,no,2
2,78,Male,yes,yes,no,yes,yes,yes,yes,yes,yes,yes,yes,0.7,96,32,4.0,62.16,no,2
3,31,Male,no,no,yes,yes,yes,yes,yes,yes,yes,yes,yes,0.7,46,52,4.0,80.0,no,2
4,34,Male,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,1.0,104,200,4.0,62.16,no,2


In [4]:
# Print the per-column dtype and non-null count summary for the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              134 non-null    int64  
 1   Sex              134 non-null    object 
 2   Steroid          134 non-null    object 
 3   Antivirals       134 non-null    object 
 4   Fatigue          134 non-null    object 
 5   Malaise          134 non-null    object 
 6   Anorexia         134 non-null    object 
 7   Liver Big        134 non-null    object 
 8   Liver Firm       134 non-null    object 
 9   Spleen Palpable  134 non-null    object 
 10  Spiders          134 non-null    object 
 11  Ascites          134 non-null    object 
 12  Varices          134 non-null    object 
 13  Bilirubin        134 non-null    float64
 14  ALK Phosphate    134 non-null    int64  
 15  SGOT             134 non-null    int64  
 16  Albumin          134 non-null    float64
 17  PROTIME         

In [5]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0,Age,Sex,Steroid,Antivirals,Fatigue,Malaise,Anorexia,Liver Big,Liver Firm,Spleen Palpable,Spiders,Ascites,Varices,Bilirubin,ALK Phosphate,SGOT,Albumin,PROTIME,Histology,TARGET
0,54,Male,no,yes,no,no,yes,yes,yes,no,yes,no,yes,3.9,120,28,3.5,43.0,yes,1
1,49,Male,no,yes,no,no,yes,yes,yes,no,no,yes,yes,1.4,85,70,3.5,35.0,yes,1
2,45,Male,yes,yes,no,no,no,yes,yes,yes,no,no,yes,1.9,104,114,2.4,62.16,yes,1
3,41,Male,yes,yes,no,yes,yes,yes,no,no,no,yes,no,4.2,65,120,3.4,62.16,yes,1
4,46,Male,yes,yes,no,no,no,yes,yes,yes,no,no,no,7.6,104,242,3.3,50.0,yes,1


In [6]:
# Print the per-column dtype and non-null count summary for the test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              11 non-null     int64  
 1   Sex              11 non-null     object 
 2   Steroid          11 non-null     object 
 3   Antivirals       11 non-null     object 
 4   Fatigue          11 non-null     object 
 5   Malaise          11 non-null     object 
 6   Anorexia         11 non-null     object 
 7   Liver Big        11 non-null     object 
 8   Liver Firm       11 non-null     object 
 9   Spleen Palpable  11 non-null     object 
 10  Spiders          11 non-null     object 
 11  Ascites          11 non-null     object 
 12  Varices          11 non-null     object 
 13  Bilirubin        11 non-null     float64
 14  ALK Phosphate    11 non-null     int64  
 15  SGOT             11 non-null     int64  
 16  Albumin          11 non-null     float64
 17  PROTIME          1

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric training columns.
train_data.describe()

Unnamed: 0,Age,Bilirubin,ALK Phosphate,SGOT,Albumin,PROTIME,TARGET
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,40.567164,1.358209,103.985075,81.253731,3.8645,62.80209,1.828358
std,12.382651,1.106461,46.328832,82.963321,0.617837,17.983733,0.378484
min,7.0,0.3,26.0,14.0,2.1,0.0,1.0
25%,31.25,0.7,76.5,31.25,3.625,57.0,2.0
50%,38.5,1.0,95.0,55.0,4.0,62.16,2.0
75%,50.0,1.5,124.5,96.5,4.2,67.0,2.0
max,78.0,8.0,280.0,648.0,6.4,100.0,2.0


In [8]:
# Separate the TARGET label from the feature columns in both frames.
X_train, y_train = train_data.drop(columns=['TARGET']), train_data['TARGET']
X_test, y_test = test_data.drop(columns=['TARGET']), test_data['TARGET']

In [9]:
# Integer-encode every object-typed feature column.
# Each encoder learns its classes from the training split only and is then
# reused on the test split, so both splits share the same code for each value.
categorical_cols = X_train.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col].astype(str))
    X_test[col] = encoder.transform(X_test[col].astype(str))
    label_encoders[col] = encoder

In [10]:
# Inspect the encoded test features.
X_test.head(n=5)

Unnamed: 0,Age,Sex,Steroid,Antivirals,Fatigue,Malaise,Anorexia,Liver Big,Liver Firm,Spleen Palpable,Spiders,Ascites,Varices,Bilirubin,ALK Phosphate,SGOT,Albumin,PROTIME,Histology
0,54,1,0,1,0,0,1,1,1,0,1,0,1,3.9,120,28,3.5,43.0,1
1,49,1,0,1,0,0,1,1,1,0,0,1,1,1.4,85,70,3.5,35.0,1
2,45,1,1,1,0,0,0,1,1,1,0,0,1,1.9,104,114,2.4,62.16,1
3,41,1,1,1,0,1,1,1,0,0,0,1,0,4.2,65,120,3.4,62.16,1
4,46,1,1,1,0,0,0,1,1,1,0,0,0,7.6,104,242,3.3,50.0,1


In [11]:
# Inspect the encoded training features.
X_train.head(n=5)

Unnamed: 0,Age,Sex,Steroid,Antivirals,Fatigue,Malaise,Anorexia,Liver Big,Liver Firm,Spleen Palpable,Spiders,Ascites,Varices,Bilirubin,ALK Phosphate,SGOT,Albumin,PROTIME,Histology
0,30,0,0,1,1,1,1,0,1,1,1,1,1,1.0,85,18,4.0,62.16,0
1,50,1,0,1,0,1,1,0,1,1,1,1,1,0.9,135,42,3.5,62.16,0
2,78,1,1,1,0,1,1,1,1,1,1,1,1,0.7,96,32,4.0,62.16,0
3,31,1,0,0,1,1,1,1,1,1,1,1,1,0.7,46,52,4.0,80.0,0
4,34,1,1,1,1,1,1,1,1,1,1,1,1,1.0,104,200,4.0,62.16,0


In [12]:
# Count missing values per column in the raw training frame.
train_data.isna().sum()

Unnamed: 0,0
Age,0
Sex,0
Steroid,0
Antivirals,0
Fatigue,0
Malaise,0
Anorexia,0
Liver Big,0
Liver Firm,0
Spleen Palpable,0


In [13]:
# Count missing values per column in the raw test frame.
test_data.isna().sum()

Unnamed: 0,0
Age,0
Sex,0
Steroid,0
Antivirals,0
Fatigue,0
Malaise,0
Anorexia,0
Liver Big,0
Liver Firm,0
Spleen Palpable,0


In [14]:
# Standardize every numeric column using statistics learned from the training split only.
# NOTE(review): once the encoding cell has run, the encoded categorical columns are
# integer-typed and therefore selected here as well - confirm that scaling them is intended.
numerical_cols = X_train.select_dtypes(include=[np.number]).columns
scaler = StandardScaler().fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [15]:
# Peek at the first training labels.
y_train.head(n=5)

Unnamed: 0,TARGET
0,2
1,2
2,2
3,2
4,2


In [16]:
# Peek at the first test labels.
y_test.head(n=5)

Unnamed: 0,TARGET
0,1
1,1
2,1
3,1
4,1


In [17]:
# Baseline classifiers to compare, keyed by the display name used in the results tables.
models = dict([
    ("Random Forest", RandomForestClassifier(random_state=42)),
    # Raised iteration cap so the linear SVM solver has room to converge.
    ("LinearSVC", LinearSVC(random_state=42, max_iter=10000)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
])

In [18]:
# Fit each baseline on the training split and score it on the test split.
# NOTE(review): the outputs above show TARGET values 1 and 2; precision/recall/F1
# are computed with the scorers' default positive label - confirm that is the class of interest.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1 Score': f1_score(y_test, y_pred, zero_division=0),
    }


In [19]:
# Show the raw metrics dictionary collected for each model.
results

{'Random Forest': {'Accuracy': 0.7272727272727273,
  'Precision': 0.8,
  'Recall': 0.6666666666666666,
  'F1 Score': 0.7272727272727273},
 'LinearSVC': {'Accuracy': 0.7272727272727273,
  'Precision': 0.7142857142857143,
  'Recall': 0.8333333333333334,
  'F1 Score': 0.7692307692307693},
 'Decision Tree': {'Accuracy': 0.8181818181818182,
  'Precision': 0.8333333333333334,
  'Recall': 0.8333333333333334,
  'F1 Score': 0.8333333333333334},
 'K-Nearest Neighbors': {'Accuracy': 0.7272727272727273,
  'Precision': 0.8,
  'Recall': 0.6666666666666666,
  'F1 Score': 0.7272727272727273}}

In [20]:
# Tabulate the metrics: one row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Random Forest,0.727273,0.8,0.666667,0.727273
LinearSVC,0.727273,0.714286,0.833333,0.769231
Decision Tree,0.818182,0.833333,0.833333,0.833333
K-Nearest Neighbors,0.727273,0.8,0.666667,0.727273


In [21]:
# Hyperparameter search space for the Random Forest.
# Each key is a RandomForestClassifier constructor argument and each list holds
# the candidate values the randomized search may sample from.
# (RandomizedSearchCV is imported in the notebook's top import cell.)
param_grid_rf = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

In [22]:
# Randomized, cross-validated search over the Random Forest grid:
# 50 sampled parameter combinations, 5 folds each, ranked by F1.
rf_clf = RandomForestClassifier(random_state=42)
search_settings = dict(
    param_distributions=param_grid_rf,
    n_iter=50,
    cv=5,
    scoring='f1',
    random_state=42,
    n_jobs=-1,
)
random_search_rf = RandomizedSearchCV(estimator=rf_clf, **search_settings)
random_search_rf.fit(X_train, y_train)

# Keep the winning parameter combination for later inspection.
best_params_rf = random_search_rf.best_params_

  _data = np.array(data, dtype=dtype, copy=copy,


In [25]:
# Show the parameter combination selected by the randomized search.
best_params_rf

{'n_estimators': 50,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_depth': 30,
 'bootstrap': False}

In [29]:
# Score the best estimator found by the search on the held-out test split.
best_rf_clf = random_search_rf.best_estimator_
y_pred_rf_tuned = best_rf_clf.predict(X_test)

tuned_rf_metrics = {'Accuracy': accuracy_score(y_test, y_pred_rf_tuned)}
# The remaining scorers share the same call shape, including zero_division=0.
for tuned_metric_name, tuned_metric_fn in (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    tuned_rf_metrics[tuned_metric_name] = tuned_metric_fn(y_test, y_pred_rf_tuned, zero_division=0)

best_params_rf, tuned_rf_metrics

({'n_estimators': 50,
  'min_samples_split': 10,
  'min_samples_leaf': 2,
  'max_depth': 30,
  'bootstrap': False},
 {'Accuracy': 0.6363636363636364,
  'Precision': 0.75,
  'Recall': 0.5,
  'F1 Score': 0.6})

In [32]:
# Identify the top 5 features using Random Forest feature_importances_.
# (MLPClassifier and StackingClassifier are imported in the notebook's top import cell.)
if "best_rf_clf" in locals():
    # Prefer the tuned forest when the tuning cell has been run.
    feature_importances = best_rf_clf.feature_importances_
else:
    # Best-effort fallback: fit a default Random Forest if the tuned model is unavailable.
    rf_default = RandomForestClassifier(random_state=42)
    rf_default.fit(X_train, y_train)
    feature_importances = rf_default.feature_importances_

# Pair each feature name with its importance and rank from most to least important.
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Keep only the five highest-ranked features.
top_5_features = feature_importance_df.head(5)

In [33]:
# Show the five highest-ranked features and their importances.
top_5_features

Unnamed: 0,Feature,Importance
16,Albumin,0.233185
13,Bilirubin,0.159112
17,PROTIME,0.129479
12,Varices,0.079912
15,SGOT,0.075768


In [36]:
# Ensemble predictions with stacking
base_learners = [
    ('dt', models["Decision Tree"]),
    ('rf', models["Random Forest"]),
    ('knn', models["K-Nearest Neighbors"]),
    ('svc', models["LinearSVC"])
]

In [37]:
# Stack the base learners and let an MLP act as the final estimator.
meta_learner = MLPClassifier(random_state=42, max_iter=1000)
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
)

In [38]:
# Train the stacked ensemble, then score its predictions on the held-out test split.
stacking_clf.fit(X_train, y_train)
y_pred_stacking = stacking_clf.predict(X_test)

stacking_metrics = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score'),
    (
        accuracy_score(y_test, y_pred_stacking),
        precision_score(y_test, y_pred_stacking, zero_division=0),
        recall_score(y_test, y_pred_stacking, zero_division=0),
        f1_score(y_test, y_pred_stacking, zero_division=0),
    ),
))


In [39]:
# Turn the stacking metrics dict into a one-column table (metric name -> Value).
# The guard keeps this cell idempotent: the name is rebound from dict to DataFrame,
# so re-running an unguarded conversion would reindex the existing frame to
# ['Value'] and replace every metric with NaN.
if isinstance(stacking_metrics, dict):
    stacking_metrics = pd.DataFrame(stacking_metrics, index=['Value']).T
stacking_metrics

Unnamed: 0,Value
Accuracy,0.909091
Precision,1.0
Recall,0.833333
F1 Score,0.909091
