<a href="https://colab.research.google.com/github/RifatMuhtasim/Data_Science_Workflow/blob/main/6.1.Ensemble_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Ensemble methods fall into four techniques:
1. Voting
2. Stacking
3. Bagging
4. Boosting

# 1. Voting

In [None]:
# Ensemble method for classification: voting
#
# NOTE: X_train, X_test, y_train and y_test are not created in this cell;
# they must already exist in the notebook session.

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Initialize individual models
model1 = DecisionTreeClassifier()
model2 = SVC(probability=True)  # soft voting needs class probabilities, so enable them on SVC
model3 = LogisticRegression()

# voting='soft' combines the predicted class probabilities.
# For majority-rule (hard) voting, pass voting='hard' or omit the argument.
ensemble_model = VotingClassifier(estimators=[('dt', model1), ('svm', model2), ('lr', model3)], voting='soft')

# You can also set weights for individual models if some are more important than others
# ensemble_model = VotingClassifier(estimators=[('dt', model1), ('svm', model2), ('lr', model3)], voting='soft', weights=[2,1,1])

# Evaluate the ensemble model with 5-fold cross-validation on the training data
scores = cross_val_score(ensemble_model, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

# Fit on the full training set and predict the held-out test set
ensemble_model.fit(X_train, y_train)

predictions = ensemble_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: ", accuracy)

In [None]:
# Voting ensemble for regression
#
# NOTE: the feature matrix X and target y are expected to exist already.

from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base regressors that make up the ensemble
model1 = LinearRegression()
model2 = RandomForestRegressor(n_estimators=50, random_state=42)
model3 = SVR(kernel='linear')

# Combine the base regressors into a single voting regressor
voting_regressor = VotingRegressor(
    estimators=[('lr', model1), ('rf', model2), ('svr', model3)]
)

# Fit on the training split, then predict the held-out rows
voting_regressor.fit(X_train, y_train)
y_pred = voting_regressor.predict(X_test)

# Score the ensemble with mean squared error on the test split
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# 2. Stacking

In [None]:
# Stacking ensemble for classification
#
# NOTE: relies on X_train, X_test, y_train and y_test from the notebook session.

from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Level-0 (base) learners, each given as a (name, estimator) pair
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
    ('gbdt', GradientBoostingClassifier()),
]

# Logistic regression is the final (meta) estimator; cv=5 is passed to the stacker
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5)

# Fit the stack and predict the test set
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report test-set accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

In [None]:
# Stacking ensemble for regression
#
# NOTE: relies on X_train, X_test, y_train and y_test from the notebook session.

from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

# Level-0 (base) learners, each given as a (name, estimator) pair
estimators = [
    ('rf', RandomForestRegressor(n_estimators=10, random_state=42)),
    ('knn', KNeighborsRegressor(n_neighbors=10)),
    ('gbdt', GradientBoostingRegressor()),
]

# Linear regression is the final (meta) estimator
reg = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

# Fit the stack and predict the test set
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Report test-set mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)


# 3. Bagging

In [None]:
# Bagging technique for classification (SVM)
#
# NOTE: relies on X_train, X_test, y_train and y_test from the notebook session.

from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Initialize base classifier
base_classifier = SVC(kernel='rbf', random_state=42)

# Initialize bagging classifier.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, so the
# positional form works on both sides of the rename.
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10)

# Evaluate bagging classifier using 5-fold cross-validation
scores = cross_val_score(bagging_classifier, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

# Train bagging classifier on the full training set
bagging_classifier.fit(X_train, y_train)

# Make predictions
predictions = bagging_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: ", accuracy)

In [None]:
# Bagging technique for regression (linear regression)
#
# NOTE: relies on X_train, X_test, y_train and y_test from the notebook session.

from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Initialize base regressor
base_regressor = LinearRegression()

# Initialize bagging regressor.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, so the
# positional form works on both sides of the rename.
bagging_regressor = BaggingRegressor(base_regressor, n_estimators=10)

# Evaluate bagging regressor using 5-fold cross-validation.
# No `scoring` is given, so the regressor's default score (R^2) is reported,
# not accuracy.
scores = cross_val_score(bagging_regressor, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean R^2 score:", scores.mean())

# Train bagging regressor on the full training set
bagging_regressor.fit(X_train, y_train)

# Make predictions
predictions = bagging_regressor.predict(X_test)


# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error: ", mse)

# 4. Boosting

In [None]:
# Boosting for classification (gradient boosting)
#
# NOTE: relies on X_train, X_test, y_train and y_test from the notebook session.

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Initialize Gradient Boosting classifier.
# GradientBoostingClassifier does not accept a `base_estimator` argument
# (passing one raises TypeError); it builds its ensemble from decision trees.
# To boost a custom base learner such as SVC, AdaBoostClassifier is the
# scikit-learn estimator that takes one.
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=50, random_state=42)

# Evaluate Gradient Boosting classifier using 5-fold cross-validation
scores = cross_val_score(gradient_boosting_classifier, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

# Train Gradient Boosting classifier on the full training set
gradient_boosting_classifier.fit(X_train, y_train)

# Make predictions
predictions = gradient_boosting_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: ", accuracy)

In [None]:
# Boosting for regression (gradient boosting)
#
# NOTE: relies on X_train, X_test, y_train and y_test from the notebook session.

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Initialize Gradient Boosting regressor.
# GradientBoostingRegressor does not accept a `base_estimator` argument
# (passing one raises TypeError); it builds its ensemble from decision trees.
# To boost a custom base learner such as LinearRegression or SVR,
# AdaBoostRegressor is the scikit-learn estimator that takes one.
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=50, random_state=42)

# Evaluate Gradient Boosting regressor using 5-fold cross-validation
# (no `scoring` is given, so the regressor's default R^2 score is reported)
scores = cross_val_score(gradient_boosting_regressor, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean R^2 score:", scores.mean())

# Train Gradient Boosting regressor on the full training set
gradient_boosting_regressor.fit(X_train, y_train)

# Make predictions
predictions = gradient_boosting_regressor.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error: ", mse)

# 5. Gradient Boosting ML Algorithms

## Light GBM

In [None]:
# Light GBM Regressor
#
# NOTE: expects a DataFrame `df` with a 'labels' target column to exist already.

import math

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


X = df.drop(['labels'], axis="columns")
y = df['labels']

# Convert object (string) columns to pandas "category" dtype so LightGBM can
# consume them as categorical features.
obj_feat = list(X.select_dtypes(include="object").columns)
for feature in obj_feat:
    X[feature] = X[feature].astype("category")


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# This notebook never creates a separate validation split, so the 20% hold-out
# doubles as the validation set used below.
X_val, y_val = X_test, y_test

lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)

y_train_pred = lgbm.predict(X_train)
y_val_pred = lgbm.predict(X_val)

# Calculate Mean Squared Error (MSE)
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)

# Calculate R-Squared Score
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)

# Calculate RMSE from the MSE values computed above
train_rmse = math.sqrt(train_mse)
val_rmse = math.sqrt(val_mse)

print("Light GBM: ")
print("Train R-squared:", train_r2)
print("Validation R-squared:", val_r2)
print("Train RMSE:", train_rmse)
print("Validation RMSE:", val_rmse)

In [None]:
# Light GBM Classifier
#
# NOTE: expects a DataFrame `df` with a 'labels' target column to exist already.

from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X = df.drop(['labels'], axis="columns")
y = df['labels']

# Convert object (string) columns to pandas "category" dtype so LightGBM can
# consume them as categorical features.
obj_feat = list(X.select_dtypes(include="object").columns)
for feature in obj_feat:
    X[feature] = X[feature].astype("category")


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# This notebook never creates a separate validation split, so the 20% hold-out
# doubles as the validation set used below.
X_val, y_val = X_test, y_test

lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)

y_train_pred = lgbm.predict(X_train)
y_val_pred = lgbm.predict(X_val)

# Calculate Accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("LightGBM: ")
print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", val_accuracy)

## CatBoost

In [None]:
# CatBoost Regressor
#
# NOTE: relies on X_train, X_test, y_train and y_test from the notebook session,
# and on the data containing the 'Country', 'Status' and 'Year' columns.

import math

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import r2_score, mean_squared_error


# Columns CatBoost should treat as categorical (shared by both pools)
cat_features = ['Country', 'Status', 'Year']

# This notebook never creates a separate validation split, so the hold-out
# test split doubles as the validation set used below.
X_val, y_val = X_test, y_test

# Constructing the pools
pool_train = Pool(X_train, y_train, cat_features=cat_features)
pool_val = Pool(X_val, y_val, cat_features=cat_features)

# Model training
cbr = CatBoostRegressor()
cbr.fit(pool_train)

# Prediction
y_train_pred = cbr.predict(pool_train)
y_val_pred = cbr.predict(pool_val)

# Metrics Calculation
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)

train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)

train_rmse = math.sqrt(train_mse)
val_rmse = math.sqrt(val_mse)

print("CatBoost Regressor: ")
print("Train R-squared:", train_r2)
print("Validation R-squared:", val_r2)
print("Train RMSE:", train_rmse)
print("Validation RMSE:", val_rmse)

In [None]:
# CatBoost classifier
#
# NOTE: relies on X_train, X_test, y_train and y_test from the notebook session,
# and on the data containing the 'Country', 'Status' and 'Year' columns.

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score


# Wrap the splits in CatBoost pools; the test pool carries no labels
pool_train = Pool(
    X_train,
    y_train,
    cat_features=['Country', 'Status', 'Year'],
)
pool_test = Pool(
    X_test,
    cat_features=['Country', 'Status', 'Year'],
)

# Fit a classifier with default settings
cbc = CatBoostClassifier()
cbc.fit(pool_train)

# Predict labels for both splits
y_train_pred = cbc.predict(pool_train)
y_test_pred = cbc.predict(pool_test)

# Compare predictions against the true labels
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("CatBoost Classifier: ")
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

def Hyperparameter_tuning(X_train, y_train, model_params, cv=5, scoring=None):
    """Grid-search every model in ``model_params`` and tabulate the winners.

    Args:
        X_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training targets passed to ``GridSearchCV.fit``.
        model_params: Mapping of a display name to a dict with the keys
            ``'model'`` (an estimator instance) and ``'params'`` (its
            parameter grid).
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (defaults to 5, the value that was previously hard-coded).
        scoring: Scoring setting forwarded to ``GridSearchCV``; ``None``
            keeps the estimator's default scorer.

    Returns:
        A pandas DataFrame with one row per model and the columns
        ``Model``, ``Best_score`` and ``Best_params``. It is empty (but
        still has those columns) when ``model_params`` is empty.
    """
    # Imported locally because the notebook never imports pandas globally.
    import pandas as pd

    scores = []

    for name, spec in model_params.items():
        search = GridSearchCV(
            spec['model'],
            spec['params'],
            cv=cv,
            scoring=scoring,
            return_train_score=False,
        )
        search.fit(X_train, y_train)
        scores.append({
            "Model": name,
            "Best_score": search.best_score_,
            "Best_params": search.best_params_,
        })

    # Explicit columns keep the frame's layout stable even with no rows.
    return pd.DataFrame(scores, columns=['Model', 'Best_score', 'Best_params'])

In [None]:
import pandas as pd

# Search spaces for Hyperparameter_tuning: display name -> estimator + parameter grid.
#
# WARNING: these are exhaustive grids. The LightGBM grid has 97,200 parameter
# combinations and the CatBoost grid 6,561; each combination is fitted once per
# cross-validation fold. Trim the lists (or switch to a randomized search)
# before running this on anything but a tiny dataset.
model_params = {
    'light_gbm': {
        'model': LGBMClassifier(),
        'params': {
            'learning_rate': [0.01, 0.05, 0.1],
            'n_estimators': [100, 200, 300],
            'num_leaves': [20, 30, 40],
            'max_depth': [5, 10, 15],
            'min_child_samples': [20, 30, 40],
            'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
            # Row subsampling is only applied when subsample_freq > 0;
            # without it every `subsample` value above trains the same model.
            'subsample_freq': [1],
            'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
            'reg_alpha': [0.0, 0.1, 0.5, 1.0],
            'reg_lambda': [0.0, 0.1, 0.5, 1.0]
        }
    },
    # Labeled 'catboost' (was 'decision_tree') so the results table names the
    # model that is actually being tuned.
    'catboost': {
        # verbose=0 silences CatBoost's per-iteration log for every grid fit
        'model': CatBoostClassifier(verbose=0),
        'params': {
            'iterations': [100, 200, 300],                 # Number of trees in the model
            'learning_rate': [0.01, 0.05, 0.1],            # Step size shrinkage used in update to prevent overfitting
            'depth': [4, 6, 8],                            # Depth of the trees
            'l2_leaf_reg': [1, 3, 5],                      # L2 regularization coefficient
            'random_strength': [0.1, 0.5, 1],              # Random strength parameter to make the model more robust to overfitting
            'bagging_temperature': [0.1, 0.5, 1],          # Controls intensity of the Bayesian bootstrap method
            'border_count': [32, 64, 128],                 # Number of splits for numerical features
            'scale_pos_weight': [1, 2, 5]                  # Weight of the positive class (binary classification)
        }
    }
}


results = Hyperparameter_tuning(X_train, y_train, model_params)
# Show the full Best_params dict instead of a truncated cell
pd.set_option('display.max_colwidth', None)
results