<a href="https://colab.research.google.com/github/OmarIraqy/Circus-of-Plates/blob/master/Assignment3_Bagging_Boosting/Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [109]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from scipy.stats import pointbiserialr
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
import pandas as pd

In [84]:
data = pd.read_csv('/content/drive/MyDrive/colab_files/ML Assignment 3/output.csv')
# del data['Unnamed: 0']
data.head(5)

Unnamed: 0,systolic,fasting blood sugar,AST,dental caries,Gtp,hemoglobin,serum creatinine,age,height(cm),triglyceride,smoking
0,-0.353802,-1.262423,0.1567,0,0.53787,1.818767,-0.517239,-2.052535,0.536694,1.048274,1
1,0.667465,-0.479628,-0.582878,1,-0.199202,0.770702,0.597927,-0.785883,1.670617,1.138923,0
2,-0.118125,-0.479628,-0.688532,0,-0.615808,0.421347,-0.517239,-1.2081,-0.030268,-0.613639,1
3,2.160088,-0.936258,-0.89984,0,-0.647854,-1.255558,-1.074823,0.058551,-0.597229,0.383508,0
4,1.13882,0.107468,-1.005494,0,-0.647854,-0.207493,-1.074823,0.902985,-1.164191,0.564808,0


In [85]:
# Assuming df is your DataFrame and 'target' is the column name of the target variable
correlation_matrix = data.corr()
correlation_with_target = correlation_matrix['smoking'].sort_values(ascending=False)

# Display the correlation with the target variable
print(correlation_with_target)

smoking                1.000000
hemoglobin             0.441425
height(cm)             0.430535
Gtp                    0.367657
triglyceride           0.321262
serum creatinine       0.272032
dental caries          0.108755
fasting blood sugar    0.092788
systolic               0.052680
AST                    0.034115
age                   -0.193191
Name: smoking, dtype: float64


In [86]:
threshold = 0.3
# Get the features with correlation lower than the threshold
low_correlation_features = correlation_matrix[correlation_matrix['smoking'].abs() < threshold].index

# Remove the low-correlation features from the DataFrame
df_filtered = data.drop(low_correlation_features, axis=1)

# Display the updated DataFrame
df_filtered.head(5)

Unnamed: 0,Gtp,hemoglobin,height(cm),triglyceride,smoking
0,0.53787,1.818767,0.536694,1.048274,1
1,-0.199202,0.770702,1.670617,1.138923,0
2,-0.615808,0.421347,-0.030268,-0.613639,1
3,-0.647854,-1.255558,-0.597229,0.383508,0
4,-0.647854,-0.207493,-1.164191,0.564808,0


In [87]:
x = df_filtered.drop(columns = 'smoking')
y = df_filtered['smoking']
x.head(5)

Unnamed: 0,Gtp,hemoglobin,height(cm),triglyceride
0,0.53787,1.818767,0.536694,1.048274
1,-0.199202,0.770702,1.670617,1.138923
2,-0.615808,0.421347,-0.030268,-0.613639
3,-0.647854,-1.255558,-0.597229,0.383508
4,-0.647854,-0.207493,-1.164191,0.564808


In [88]:
# Split the data into training and testing sets
x_train, x_test_val, y_train, y_test_val = train_test_split(x, y, test_size=0.3, random_state=42)

x_val, x_test, y_val, y_test = train_test_split(x_test_val, y_test_val, test_size=0.5, random_state=42)

len(x_train),len(x_val),len(x_test)


(88613, 18989, 18989)

#Bagging Model

##Bagging using decision stump

In [105]:
# Create a Decision Tree stump
tree_stump = DecisionTreeClassifier(max_depth=1)

# Create a Bagging classifier with DecisionTree stump as base model
bagging_classifier = BaggingClassifier(estimator=tree_stump, random_state=42)

# Define a parameter grid with different values for n_estimators
param_grid = {'n_estimators': [50, 100, 200, 300, 400]}

# Perform a grid search with cross-validation
grid_search = GridSearchCV(bagging_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Print the best parameters and corresponding performance
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Parameters: {'n_estimators': 50}
Best Accuracy: 0.6889508482245204


In [113]:
# Create a decision tree stump (a tree with max_depth=1)
base_model = DecisionTreeClassifier(max_depth=3)

# Set the number of base models in the ensemble
n_estimators = 50

# Create the BaggingClassifier
bagging_model = BaggingClassifier(base_model, n_estimators=n_estimators, random_state=42 , max_samples=10000)

# Train the Bagging ensemble
bagging_model.fit(x_train, y_train)

# Make predictions on the test set
y_pred = bagging_model.predict(x_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
acc1 = accuracy
print(f'Accuracy: {accuracy:.2f}')

# Get feature importances
feature_importances = bagging_model.estimators_[0].feature_importances_

# Create a DataFrame to display feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance
print(feature_importance_df)

Accuracy: 0.74
        Feature  Importance
2    height(cm)    0.662639
0           Gtp    0.174129
1    hemoglobin    0.132141
3  triglyceride    0.031092


#Boosting Model

In [103]:
# Create a Decision Tree stump
tree_stump = DecisionTreeClassifier(max_depth=1)

# Create an AdaBoost classifier with DecisionTree stump as base model
ada_classifier = AdaBoostClassifier(estimator=tree_stump, random_state=42)

# Define a parameter grid with different values for n_estimators
param_grid = {'n_estimators': [50, 100, 200, 300, 400]}

# Perform a grid search with cross-validation
grid_search = GridSearchCV(ada_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Print the best parameters and corresponding performance
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Parameters: {'n_estimators': 300}
Best Accuracy: 0.7384130631625047


In [112]:
# Create a decision tree stump (a tree with max_depth=1)
base_model = DecisionTreeClassifier(max_depth=1)

# Set the number of base models in the ensemble (n_estimators)
n_estimators = 300

# Create the AdaBoostClassifier
boosting_model = AdaBoostClassifier(base_model, n_estimators=n_estimators, random_state=42)

# Train the AdaBoost ensemble
boosting_model.fit(x_train, y_train)

# Make predictions on the validation set
y_pred = boosting_model.predict(x_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
acc2 = accuracy
print(f'Accuracy: {accuracy:.2f}')

feature_importances = boosting_model.feature_importances_

# Create a DataFrame to display feature importances

feature_importance_df = pd.DataFrame({'Feature': x_train.columns , 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display feature importances
print("Feature Importances:")
print(feature_importance_df)

Accuracy: 0.74
Feature Importances:
        Feature  Importance
3  triglyceride    0.403333
0           Gtp    0.393333
1    hemoglobin    0.150000
2    height(cm)    0.053333


#Random Forests

In [94]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define a parameter grid with different values for n_estimators
param_grid = {'n_estimators': [50, 100, 200, 300, 400]}

# Perform a grid search with cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Print the best parameters and corresponding performance
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)



Best Parameters: {'n_estimators': 300}
Best Accuracy: 0.7146694473568894


In [111]:
rf_classifier = RandomForestClassifier(n_estimators=300, random_state=42)

# Train the model on the training data
rf_classifier.fit(x_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(x_val)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_val, y_pred)
acc3 = accuracy
print("Accuracy:", accuracy)

Accuracy: 0.7140976354731686


In [108]:
feature_importances = rf_classifier.feature_importances_

# Create a DataFrame to display feature importances

feature_importance_df = pd.DataFrame({'Feature': x_train.columns , 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display feature importances
print("Feature Importances:")
print(feature_importance_df)

Feature Importances:
        Feature  Importance
3  triglyceride    0.306608
1    hemoglobin    0.287745
0           Gtp    0.211819
2    height(cm)    0.193828


#Final Model

In [114]:
# Make predictions on the test data
pred1 = bagging_model.predict(x_test)
pred2 = boosting_model.predict(x_test)
pred3 = rf_classifier.predict(x_test)

# Weight the predictions based on the accuracy of each model
weighted_pred = (acc1 * pred1 + acc2 * pred2 + acc3 * pred3) / (acc1 + acc2 + acc3)

# Round the weighted predictions to get the final result
final_predictions = [round(value) for value in weighted_pred]

# Evaluate the final predictions
accuracy = accuracy_score(y_test, final_predictions)
print("Weighted Ensemble Accuracy:", accuracy)

Weighted Ensemble Accuracy: 0.7399020485544262
