In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/indian-liver-patients/indian_liver_patient.csv


In [2]:
import pandas as pd
df=pd.read_csv("/kaggle/input/indian-liver-patients/indian_liver_patient.csv")
df.head(2)

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1


In [3]:
#Single classifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

#handling missing and null values
print(df.isnull().sum())

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64


In [4]:
df.fillna(df.mean(numeric_only=True), inplace=True)
print(df.isnull().sum())

#converting gender to numerical values - males-1 , females=0
le = LabelEncoder()
df['Gender']=le.fit_transform(df['Gender'])

print('after converting gender to categorical:')
print(df.head(2))

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64
after converting gender to categorical:
   Age  Gender  Total_Bilirubin  Direct_Bilirubin  Alkaline_Phosphotase  \
0   65       0              0.7               0.1                   187   
1   62       1             10.9               5.5                   699   

   Alamine_Aminotransferase  Aspartate_Aminotransferase  Total_Protiens  \
0                        16                          18             6.8   
1                        64                         100             7.5   

   Albumin  Albumin_and_Globulin_Ratio  Dataset  
0      3.3                        0.90        1  
1      3.2                        0.74     

In [7]:
#Evaluation metrics
def evaluate_model(name, y_true, y_pred):
    print(f"\n{name}")
    print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall   : {recall_score(y_true, y_pred):.4f}")
    print(f"F1-score : {f1_score(y_true, y_pred):.4f}")

In [8]:
X=df.drop('Dataset', axis=1)
y=df['Dataset']
#converting target to binary classes
y=np.where(y==2,1,0) # changes 2--> condition if true 0 else 1
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.3, random_state=42)
#preprocessing
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

evaluate_model("Decision Tree", y_test, y_pred_dt)


Decision Tree
Accuracy : 0.6971
Precision: 0.4423
Recall   : 0.4894
F1-score : 0.4646


In [9]:
#ensemble classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=5, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
evaluate_model("Random Forest", y_test, y_pred_rf)
     


Random Forest
Accuracy : 0.7029
Precision: 0.4510
Recall   : 0.4894
F1-score : 0.4694


In [10]:
for n in [1,5,10, 50, 100, 200, 300]:
    rf_temp = RandomForestClassifier(n_estimators=n, random_state=42)
    rf_temp.fit(X_train, y_train)
    y_pred_temp = rf_temp.predict(X_test)
    acc = accuracy_score(y_test, y_pred_temp)
    print(f"Estimators: {n:3d} | Accuracy: {acc:.4f}")
    print(f"Precision: {precision_score(y_test, y_pred_temp ):.4f}")
    print(f"Recall   : {recall_score(y_test, y_pred_temp ):.4f}")
    print(f"F1-score : {f1_score(y_test, y_pred_temp ):.4f}")

Estimators:   1 | Accuracy: 0.6686
Precision: 0.3830
Recall   : 0.3830
F1-score : 0.3830
Estimators:   5 | Accuracy: 0.7029
Precision: 0.4510
Recall   : 0.4894
F1-score : 0.4694
Estimators:  10 | Accuracy: 0.6971
Precision: 0.4118
Recall   : 0.2979
F1-score : 0.3457
Estimators:  50 | Accuracy: 0.7029
Precision: 0.4138
Recall   : 0.2553
F1-score : 0.3158
Estimators: 100 | Accuracy: 0.7314
Precision: 0.5000
Recall   : 0.3404
F1-score : 0.4051
Estimators: 200 | Accuracy: 0.7143
Precision: 0.4615
Recall   : 0.3830
F1-score : 0.4186
Estimators: 300 | Accuracy: 0.7086
Precision: 0.4444
Recall   : 0.3404
F1-score : 0.3855


In [13]:
from sklearn.linear_model import LogisticRegression
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=5, random_state=42)
lr = LogisticRegression(random_state=42, max_iter=1000)

dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_lr = lr.predict(X_test)

#majority voting
predictions = np.array([y_pred_dt, y_pred_rf, y_pred_lr])
y_pred_max = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)
     

#Average Voting (Probabilities)
y_prob_dt = dt.predict_proba(X_test)[:,1]
y_prob_rf = rf.predict_proba(X_test)[:,1]
y_prob_lr = lr.predict_proba(X_test)[:,1]
y_prob_avg = (y_prob_dt + y_prob_rf + y_prob_lr) / 3
y_pred_avg = (y_prob_avg >= 0.5).astype(int)

#Weighted Average Voting
# Assign weights based on model performance (accuracy)
acc_dt = accuracy_score(y_test, y_pred_dt)
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_lr = accuracy_score(y_test, y_pred_lr)

weights = np.array([acc_dt, acc_rf, acc_lr])
y_prob_weighted = (y_prob_dt*weights[0] + y_prob_rf*weights[1] + y_prob_lr*weights[2]) / weights.sum()
y_pred_weighted = (y_prob_weighted >= 0.5).astype(int)

#Evaluation function
def evaluate_model(name, y_true, y_pred):
    print(f"\n{name}")
    print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall   : {recall_score(y_true, y_pred):.4f}")
    print(f"F1-score : {f1_score(y_true, y_pred):.4f}")

Majority voting

Works with class predictions (not probabilities) from multiple models. Each base classifier “votes” for a class. The class with the most votes wins → becomes the final prediction.

Average Voting

Take the average of predicted probabilities from all models ,if threshold 0.5 → final prediction

Weighted average voting

Take a weighted average of probabilities based on model performance → threshold 0.5 → final prediction

In [15]:
evaluate_model("Decision Tree", y_test, y_pred_dt)
evaluate_model("Random Forest", y_test, y_pred_rf)
evaluate_model("Logistic Regression", y_test, y_pred_lr)
evaluate_model("Max Voting Ensemble", y_test, y_pred_max)
evaluate_model("Average Voting Ensemble", y_test, y_pred_avg)
evaluate_model("Weighted Average Voting Ensemble", y_test, y_pred_weighted)


Decision Tree
Accuracy : 0.6971
Precision: 0.4423
Recall   : 0.4894
F1-score : 0.4646

Random Forest
Accuracy : 0.7029
Precision: 0.4510
Recall   : 0.4894
F1-score : 0.4694

Logistic Regression
Accuracy : 0.7200
Precision: 0.4500
Recall   : 0.1915
F1-score : 0.2687

Max Voting Ensemble
Accuracy : 0.6914
Precision: 0.4000
Recall   : 0.2979
F1-score : 0.3415

Average Voting Ensemble
Accuracy : 0.7200
Precision: 0.4773
Recall   : 0.4468
F1-score : 0.4615

Weighted Average Voting Ensemble
Accuracy : 0.7200
Precision: 0.4773
Recall   : 0.4468
F1-score : 0.4615


In [17]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

dt = DecisionTreeClassifier(random_state=42)
lr = LogisticRegression(random_state=42, max_iter=1000)
knn = KNeighborsClassifier(n_neighbors=5)


hard_voting_clf = VotingClassifier(
    estimators=[('dt', dt), ('lr', lr), ('knn', knn)],
    voting='hard'
)

#Define Soft Voting Classifier
soft_voting_clf = VotingClassifier(
    estimators=[('dt', dt), ('lr', lr), ('knn', knn)],
    voting='soft'  # uses predicted probabilities
)

#Train classifiers
hard_voting_clf.fit(X_train, y_train)
soft_voting_clf.fit(X_train, y_train)

#Make predictions
y_pred_hard = hard_voting_clf.predict(X_test)
y_pred_soft = soft_voting_clf.predict(X_test)

#Evaluation function
def evaluate_model(name, y_true, y_pred):
    print(f"\n=== {name} ===")
    print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall   : {recall_score(y_true, y_pred):.4f}")
    print(f"F1-score : {f1_score(y_true, y_pred):.4f}")

# Step 14: Evaluate both voting classifiers
evaluate_model("Hard Voting Classifier", y_test, y_pred_hard)
evaluate_model("Soft Voting Classifier", y_test, y_pred_soft)


=== Hard Voting Classifier ===
Accuracy : 0.6914
Precision: 0.4000
Recall   : 0.2979
F1-score : 0.3415

=== Soft Voting Classifier ===
Accuracy : 0.7029
Precision: 0.4419
Recall   : 0.4043
F1-score : 0.4222


In [21]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Experiment with different parameters
params = [
    {"n_estimators": 10, "max_depth": None, "random_state": 42},
    {"n_estimators": 50, "max_depth": None, "random_state": 42},
    {"n_estimators": 100, "max_depth": None, "random_state": 42},
    {"n_estimators": 100, "max_depth": 3, "random_state": 42},
    {"n_estimators": 100, "max_depth": 5, "random_state": 42},
]

results = []

# Train and evaluate multiple Random Forests
for p in params:
    rf = RandomForestClassifier(
        n_estimators=p["n_estimators"],
        max_depth=p["max_depth"],   
        random_state=p["random_state"]
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append({
        "n_estimators": p["n_estimators"],
        "max_depth": p["max_depth"],
        "accuracy": round(acc, 4)
    })

# Display results
df_results = pd.DataFrame(results)
print("=== Random Forest Results ===")
print(df_results)
print("\nBest Model Parameters:")
best_model = df_results.loc[df_results['accuracy'].idxmax()]
print(best_model)

# Final model evaluation
final_rf = RandomForestClassifier(
    n_estimators=int(best_model["n_estimators"]),
    max_depth=None if pd.isna(best_model["max_depth"]) else int(best_model["max_depth"]), 
    random_state=42
)
final_rf.fit(X_train, y_train)
y_pred_final = final_rf.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_final, target_names=iris.target_names))


=== Random Forest Results ===
   n_estimators  max_depth  accuracy
0            10        NaN    0.9111
1            50        NaN    0.8889
2           100        NaN    0.8889
3           100        3.0    0.9111
4           100        5.0    0.8889

Best Model Parameters:
n_estimators    10.0000
max_depth           NaN
accuracy         0.9111
Name: 0, dtype: float64

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.82      0.93      0.87        15
   virginica       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45



  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Bagging Technique

It is an ensemble learning technique that improves model stability and accuracy by training multiple models on different random subsets of the original dataset.

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
     

X=df.drop('Dataset', axis=1)
y=df['Dataset']
#converting target to binary classes
y=np.where(y==2,1,0) # changes 2--> condition if true 0 else 1
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.3, random_state=42)
#preprocessing
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
     

n_estimators_list = [10, 50, 100, 200]
max_depth_list = [None, 3, 5, 10]

results = []

print("Exploring Random Forest (Bagging) Performance:\n")
for n in n_estimators_list:
    for depth in max_depth_list:
        rf = RandomForestClassifier(n_estimators=n, max_depth=depth, random_state=42)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results.append((n, depth, acc))
        print(f"n_estimators={n}, max_depth={depth}, Accuracy={acc:.4f}")

Exploring Random Forest (Bagging) Performance:

n_estimators=10, max_depth=None, Accuracy=0.6971
n_estimators=10, max_depth=3, Accuracy=0.7143
n_estimators=10, max_depth=5, Accuracy=0.7257
n_estimators=10, max_depth=10, Accuracy=0.7029
n_estimators=50, max_depth=None, Accuracy=0.7029
n_estimators=50, max_depth=3, Accuracy=0.7143
n_estimators=50, max_depth=5, Accuracy=0.7029
n_estimators=50, max_depth=10, Accuracy=0.7257
n_estimators=100, max_depth=None, Accuracy=0.7314
n_estimators=100, max_depth=3, Accuracy=0.7086
n_estimators=100, max_depth=5, Accuracy=0.6971
n_estimators=100, max_depth=10, Accuracy=0.7200
n_estimators=200, max_depth=None, Accuracy=0.7143
n_estimators=200, max_depth=3, Accuracy=0.7029
n_estimators=200, max_depth=5, Accuracy=0.7086
n_estimators=200, max_depth=10, Accuracy=0.6971


In [24]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    oob_score=True,
    random_state=42
)

# Train
rf_clf.fit(X_train, y_train)

# Predictions
y_pred = rf_clf.predict(X_test)

# Performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("OOB Score:", rf_clf.oob_score_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
     

Accuracy: 0.6971428571428572
OOB Score: 0.6740196078431373

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.87      0.81       128
           1       0.39      0.23      0.29        47

    accuracy                           0.70       175
   macro avg       0.57      0.55      0.55       175
weighted avg       0.66      0.70      0.67       175



In [28]:
def print_metrics(model_name, y_true, y_pred):
    print(f"--- {model_name} ---")
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
    print("Precision:", round(precision_score(y_true, y_pred), 4))
    print("Recall:", round(recall_score(y_true, y_pred), 4))
    print("F1-Score:", round(f1_score(y_true, y_pred), 4))
    print()
     

# 1. AdaBoost
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=42)
ada.fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)
print_metrics("AdaBoost", y_test, y_pred_ada)

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
print_metrics("Gradient Boosting", y_test, y_pred_gb)


xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3,
                    use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print_metrics("XGBoost", y_test, y_pred_xgb)

cat = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=3, verbose=0, random_state=42)
cat.fit(X_train, y_train)
y_pred_cat = cat.predict(X_test)
print_metrics("CatBoost", y_test, y_pred_cat)
     

--- AdaBoost ---
Accuracy: 0.7086
Precision: 0.4565
Recall: 0.4468
F1-Score: 0.4516

--- Gradient Boosting ---
Accuracy: 0.7029
Precision: 0.4359
Recall: 0.3617
F1-Score: 0.3953

--- XGBoost ---
Accuracy: 0.7086
Precision: 0.4333
Recall: 0.2766
F1-Score: 0.3377

--- CatBoost ---
Accuracy: 0.7029
Precision: 0.4074
Recall: 0.234
F1-Score: 0.2973



Boosting Techniques

AdaBoost: Focuses on misclassified samples, improves gradually, but less powerful on complex datasets.

Gradient Boosting: More flexible than AdaBoost, sequentially reduces errors, slightly slower.

XGBoost & CatBoost: Best trade-off between accuracy, speed, and robustness.

Best Overall: XGBoost or CatBoost. They provide high accuracy, are fast to train, and handle overfitting well.

XGBoost or CatBoost provide the best trade-off between accuracy, speed, and robustness.

How does AdaBoost focus on misclassified samples?

AdaBoost increases the weights of misclassified samples so that the next weak learner pays more attention to them, correcting earlier mistakes.

What makes Gradient Boosting more flexible than AdaBoost?

Gradient Boosting can optimize different loss functions (not just classification error), allowing it to adapt better to various datasets and tasks.

How do XGBoost and CatBoost optimize training speed and accuracy?

XGBoost: Uses parallel computation, handles missing values efficiently, and applies regularization to reduce overfitting.

CatBoost: Uses ordered boosting, handles categorical features natively, and prevents overfitting, making training fast and stable.