In [None]:
                                                       Ensemble Learning 

In [None]:
#Ensemble Learning is a machine learning technique that combines multiple models to improve overall performance, reduce errors, and increase robustness.
#Instead of relying on a single model, ensemble methods merge predictions from multiple models to get a better final prediction.

#How Does Ensemble Learning Work?
#Ensemble learning combines multiple models (weak or strong) to make better predictions than a single model. 
#It does this by using different strategies like Bagging, Boosting, and Stacking.


#Bagging (Bootstrap Aggregating)
How it Works?
Takes multiple samples from the training data (with replacement).
Trains multiple independent models on these samples.
Final prediction = Average (Regression) or Majority Voting (Classification)

Example: Random Forest
1) Trains multiple decision trees on different bootstrap samples of the data.
2) Combines the trees' predictions for the final output (averaging for regression, majority voting for classification).

Benefits:
Reduces variance (helps limit overfitting).
Works well with high-variance models (e.g., decision trees).

                                  

#Boosting (Sequential Learning)                  
How it Works?
Models train sequentially, each trying to fix the mistakes of the previous one.
Increases weights of misclassified points to improve learning.
Final prediction = Weighted sum of weak learners

Examples:
🔹 AdaBoost (Adaptive Boosting) → Focuses on misclassified data points.
🔹 Gradient Boosting → Corrects previous errors by minimizing a loss function.

Benefits:
Reduces bias (makes weak learners stronger).
Works well for complex problems (text classification, fraud detection).


    
# Stacking (Stacked Generalization)
How it Works?

Trains multiple diverse models on the same data.
A meta-model (like Logistic Regression) learns from their outputs to make final predictions.

Example:
1) Train Random Forest, AdaBoost, and Gradient Boosting separately.
2) Get their predictions and pass them as input to a Logistic Regression meta-model.
3) The meta-model combines their predictions for the final output.


Benefits:
Leverages multiple learning algorithms for better accuracy.
Can be more accurate than Bagging or Boosting alone, but is computationally expensive.

In [105]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [65]:
import pandas as pd

In [67]:
# Load the Titanic passenger data into `df`.
# The CSV location can be overridden with the TITANIC_CSV environment variable so the
# notebook is not tied to one machine; the original absolute path stays the default.
TITANIC_CSV_PATH = os.environ.get("TITANIC_CSV", r"C:\Users\vaibh\OneDrive\Desktop\File\titanic.csv")
df = pd.read_csv(TITANIC_CSV_PATH)

In [69]:
# Preview the first rows to check that the file loaded with the expected columns
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [71]:
# Drop columns that are not used as model features.
# Rebind `df` to the returned frame instead of mutating it with inplace=True, and
# name the axis explicitly via `columns=`.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

In [73]:
# Inspect the frame after the column drop (pandas truncates the display of long frames)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [75]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

In [77]:
# Step 1: Handle Missing Values
# Assign the filled column back to `df` instead of calling fillna(inplace=True) on a
# column selection. `df["col"]` may be a temporary object, so an in-place fill on it is
# not guaranteed to reach `df`: pandas 2.x warns about this pattern, and with
# copy-on-write enabled it leaves `df` unchanged.
df["Age"] = df["Age"].fillna(df["Age"].median())  # numeric column: fill with the median
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])  # categorical column: fill with the most frequent value

In [79]:
# Step 2: One-hot encode the categorical columns.
# drop_first=True drops the first level of each encoded column.
df = pd.get_dummies(
    data=df,
    columns=["Sex", "Embarked"],
    drop_first=True,
)

In [81]:
# Step 3: Standardise the continuous features "Age" and "Fare".
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# performed later in this notebook, so test-set statistics influence the transform.
scaler = StandardScaler()
scaler.fit(df[["Age", "Fare"]])
df[["Age", "Fare"]] = scaler.transform(df[["Age", "Fare"]])

In [85]:
# Step 4: Separate the target (y) from the feature matrix (X).
# The train/test split itself happens in a later cell.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [87]:
# Inspect the feature matrix (every column of df except "Survived")
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,-0.565736,1,0,-0.502445,True,False,True
1,1,0.663861,1,0,0.786845,False,False,False
2,3,-0.258337,0,0,-0.488854,False,False,True
3,1,0.433312,1,0,0.420730,False,False,True
4,3,0.433312,0,0,-0.486337,True,False,True
...,...,...,...,...,...,...,...,...
886,2,-0.181487,0,0,-0.386671,True,False,True
887,1,-0.796286,0,0,-0.044381,False,False,True
888,3,-0.104637,1,2,-0.176263,False,False,True
889,1,-0.258337,0,0,-0.044381,True,False,False


In [89]:
# Inspect the target vector (the "Survived" column)
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [93]:
from sklearn.model_selection import train_test_split

In [97]:
# Hold out 20% of the rows for testing; stratify on the target and fix the seed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [99]:
# Sanity-check the (rows, columns) of the training and test feature matrices
(X_train.shape, X_test.shape)

((712, 8), (179, 8))

In [190]:
# Bagging: fit a 100-tree Random Forest (fixed seed) on the training split.
# fit() is left as the last expression so the fitted estimator is displayed.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train, y_train)


In [192]:
# Predict labels for the test split with the fitted Random Forest
y_pred_rf = rf_classifier.predict(X=X_test)


In [194]:
# Fraction of test rows the Random Forest classified correctly
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

In [198]:
# Display the Random Forest test accuracy
rf_accuracy

0.8156424581005587

In [200]:
!pip install xgboost



In [202]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [204]:
# Initialize Boosting Models
# All three use 100 estimators and the same seed.
adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
gradient_boost = GradientBoostingClassifier(n_estimators=100, random_state=42)
# use_label_encoder is no longer passed: it is deprecated in XGBoost's scikit-learn
# wrapper and only produces an "unused parameter" warning on current releases.
xgboost = XGBClassifier(n_estimators=100, eval_metric="logloss", random_state=42)


In [206]:
# AdaBoost: fit on the training split, predict the test split, then score the predictions
y_pred_ada = adaboost.fit(X_train, y_train).predict(X_test)
ada_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_ada)

In [208]:
# Gradient Boosting: fit on the training split, then measure accuracy on the test split
gradient_boost.fit(X=X_train, y=y_train)
y_pred_gb = gradient_boost.predict(X=X_test)
gb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_gb)


In [210]:
# XGBoost: fit on the training split, predict the test split, then score the predictions
y_pred_xgb = xgboost.fit(X_train, y_train).predict(X_test)
xgb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)

In [212]:
# Show the test accuracies in the order AdaBoost, Gradient Boosting, XGBoost
(ada_accuracy, gb_accuracy, xgb_accuracy)

(0.7597765363128491, 0.7988826815642458, 0.8268156424581006)

In [214]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


In [216]:
# Level-0 learners for the stack, as (name, estimator) pairs.
# These are fresh, unfitted estimators; the stacking classifier fits them itself.
base_models = [
    ("rf", RandomForestClassifier(random_state=42, n_estimators=100)),
    ("ada", AdaBoostClassifier(random_state=42, n_estimators=100)),
    ("gb", GradientBoostingClassifier(random_state=42, n_estimators=100)),
]

In [218]:
# Stack the base models and let a Logistic Regression meta-model combine their outputs
stacking_classifier = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
)

In [220]:
# Fit the base models and the meta-model on the training split.
# fit() is left as the last expression so the fitted estimator is displayed.
stacking_classifier.fit(X=X_train, y=y_train)


In [222]:
# Predict labels for the test split with the fitted stack
y_pred_stack = stacking_classifier.predict(X=X_test)

In [224]:
# Score the stacked model on the test split and display the result
stacking_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_stack)
stacking_accuracy

0.8100558659217877

In [226]:
# Collect the test accuracies into one table, one row per model.
# NOTE(review): xgb_accuracy, computed in an earlier cell, is not included here.
model_results = pd.DataFrame(
    [
        ("Random Forest", rf_accuracy),
        ("AdaBoost", ada_accuracy),
        ("Gradient Boosting", gb_accuracy),
        ("Stacking", stacking_accuracy),
    ],
    columns=["Model", "Accuracy"],
)

In [228]:
# Rank the models from most to least accurate (returns a sorted copy; model_results is unchanged)
model_results.sort_values("Accuracy", ascending=False)

Unnamed: 0,Model,Accuracy
0,Random Forest,0.815642
3,Stacking,0.810056
2,Gradient Boosting,0.798883
1,AdaBoost,0.759777


In [230]:
# Print the comparison table as plain text, in its original (unsorted) row order
print(model_results)

               Model  Accuracy
0      Random Forest  0.815642
1           AdaBoost  0.759777
2  Gradient Boosting  0.798883
3           Stacking  0.810056


In [None]:
Random Forest performed best among the four models in the comparison table (81.56%) → Strong bagging technique!
Stacking performed close to Random Forest (81.01%) → Combining models helps!
Gradient Boosting (79.89%) showed better performance than AdaBoost.
AdaBoost (75.98%) had the lowest accuracy but may improve with tuning.
Note: XGBoost, evaluated earlier, scored 82.68% — higher than every model listed above.