<a href="https://colab.research.google.com/github/Sumitjh26997/CSS581-ML/blob/main/CSS588Boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bagging, Boosting and Ensemble
> In this notebook we will explore some of the concepts we covered in terms of improving weak learners. In particular this is an opportunity to try Adaboost, Gradiant Boost. Compare its performance with Bagging.





Recap on the UCI census

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier

# Load the UCI adult census data
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None,
                   names=["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
                          "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
                          "Hours per week", "Country", "Target"])

# Pre-processing to convert categorical variables to numerical
data = pd.get_dummies(data, columns=["Workclass", "Education", "Martial Status", "Occupation",
                                     "Relationship", "Race", "Sex", "Country"])


# Split data into training and testing sets
X = data.drop("Target", axis=1)
y=data["Target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
y

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: Target, Length: 32561, dtype: object

## Baseline

In [3]:
rnd = RandomForestClassifier()
rnd.fit(X_train, y_train)
y_pred = rnd.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8550516941345071


In [4]:
y.value_counts()

 <=50K    24720
 >50K      7841
Name: Target, dtype: int64

## Oversampling

In [27]:
from imblearn.over_sampling import SMOTE
X_oversampled, y_oversampled = SMOTE().fit_resample(X, y)
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_oversampled, y_oversampled, test_size=0.3, random_state=42)

In [28]:
y_oversampled.value_counts()

 <=50K    24720
 >50K     24720
Name: Target, dtype: int64

In [29]:
y_pred_over = RandomForestClassifier().fit(X_train_smote, y_train_smote).predict(X_test_smote)
print(accuracy_score(y_test_smote, y_pred_over))

0.9005528586839266


In [30]:
from imblearn.over_sampling import ADASYN
X_ADASYN, y_ADASYN = ADASYN().fit_resample(X, y)
X_train_ad, X_test_ad, y_train_ad, y_test_ad = train_test_split(X_ADASYN, y_ADASYN, test_size=0.3, random_state=42)
y_ADASYN.value_counts()

 >50K     24857
 <=50K    24720
Name: Target, dtype: int64

In [31]:
y_pred_ADASYN = RandomForestClassifier().fit(X_train_ad, y_train_ad).predict(X_test_ad)
print(accuracy_score(y_test_ad, y_pred_ADASYN))

0.8976065617856662


## Undersampling

In [32]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
renn = RepeatedEditedNearestNeighbours()
X_undersampled, y_undersampled = renn.fit_resample(X, y)
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(X_undersampled, y_undersampled, test_size=0.3, random_state=42)

In [33]:
y_undersampled.value_counts()

 <=50K    10037
 >50K      7841
Name: Target, dtype: int64

In [34]:
y_pred_under = RandomForestClassifier().fit(X_train_under, y_train_under).predict(X_test_under)
print(accuracy_score(y_test_under, y_pred_under))

0.8480611483967189


## Combined

In [35]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
X_train_comb, X_test_comb, y_train_comb, y_test_comb = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

y_resampled.value_counts()

 >50K     13999
 <=50K    10276
Name: Target, dtype: int64

In [36]:
y_pred_combined = RandomForestClassifier().fit(X_train_comb, y_train_comb).predict(X_test_comb)
print(accuracy_score(y_test_comb, y_pred_combined))

0.935740766167788
