<a href="https://colab.research.google.com/github/SnehaDharne/GDM-diagnosis/blob/main/Ensemble_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
path='/content/drive/MyDrive/datasets for colab/pima-indians-diabetes.csv'

This notebook includes Classifiers without wrapper method for:
1. Voting Ensemble with LR, RF, NB and SVM
2. Weighted Average Ensemble with LR, RF and SVM
3. Stacking Ensemble with Xgboost, Ngboost and AdaBoost
4. Bagging Decision Tree

In [None]:
pip install vecstack

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vecstack
  Downloading vecstack-0.4.0.tar.gz (18 kB)
Building wheels for collected packages: vecstack
  Building wheel for vecstack (setup.py) ... [?25l[?25hdone
  Created wheel for vecstack: filename=vecstack-0.4.0-py3-none-any.whl size=19879 sha256=e172aeca5b255f32524ee23ac7cc6e6826681838a3db6b642c44e5ef0ca5a83a
  Stored in directory: /root/.cache/pip/wheels/17/89/0b/21d5484cbf713c95b641ec1bdc40dd7ae798cbdea2337e3535
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0


In [None]:
import pandas as pd
import numpy as np

from scipy import stats



from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')

from vecstack import stacking
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, accuracy_score

In [None]:
df=pd.read_csv(path)
# make age category column which will help age-appropriate in filling values which are 0
df['age_category'] = pd.cut(df['Age'], bins=[0, 21, 31, 41, 51, 61, 71, 81, np.inf], labels=[1,2,3,4,5,6,7,8])
df['age_category'] = df['age_category'].astype(int)
agecat = [1,2,3,4,5,6,7,8]


features = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI','Insulin']
for feature in features:
  for agec in agecat:
        if df[feature].dtype == 'float64':
            df[feature].replace(0, round(df[df['age_category'] == agec][feature].mean(), 1), inplace=True)
        else:
            df[feature].replace(0, round(df[df['age_category'] == agec][feature].mean(), 0), inplace=True)





z= np.abs(stats.zscore(df))

threshold = 3

df = df[(z < 3).all(axis=1)]

q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr= q3-q1


df = df[~((df <(q1 - 1.5 * iqr))|(df>(q3+1.5*iqr))).any(axis=1)]

X = df.drop(['Outcome'], axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)




In [None]:
# parameter grid
parameters = {
    'penalty' : ['l1','l2','elastinet'],    #‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear','sag','saga'],
}

logreg = LogisticRegression()
clf = GridSearchCV(logreg,                    # model
                   param_grid = parameters,   # hyperparameters
                   scoring='accuracy',        # metric for scoring
                   cv=10)

clf.fit(X_train,y_train)
#lr=(solver='liblinear', C=10.0, penalty='l1')
print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :",clf.best_score_)

Tuned Hyperparameters : {'C': 10.0, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy : 0.8058080808080808


In [None]:
lr=LogisticRegression(solver='liblinear', C=10.0, penalty='l1')
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.97      0.88        79
           1       0.87      0.41      0.55        32

    accuracy                           0.81       111
   macro avg       0.83      0.69      0.72       111
weighted avg       0.82      0.81      0.79       111



In [None]:
print(clf.best_params_)
grid_predictions = clf.predict(X_test)
print(classification_report(y_test, grid_predictions))
mean_squared_error(y_test,grid_predictions)

{'C': 10.0, 'penalty': 'l1', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.80      0.97      0.88        79
           1       0.87      0.41      0.55        32

    accuracy                           0.81       111
   macro avg       0.83      0.69      0.72       111
weighted avg       0.82      0.81      0.79       111



0.1891891891891892

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
# fitting the model for grid search

clf_svm=svm.SVC()

param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(clf_svm, param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.708 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.708 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.708 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.705 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.705 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.708 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.708 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.708 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.705 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.705 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.708 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [None]:
print(grid.best_params_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
mean_squared_error(y_test,grid_predictions)

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.81      0.97      0.89        79
           1       0.88      0.44      0.58        32

    accuracy                           0.82       111
   macro avg       0.84      0.71      0.73       111
weighted avg       0.83      0.82      0.80       111



0.18018018018018017

In [None]:
from sklearn import svm
from sklearn.svm import SVC
svc=svm.SVC(gamma=0.0001, C=10, kernel='rbf')
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.97      0.89        79
           1       0.88      0.44      0.58        32

    accuracy                           0.82       111
   macro avg       0.84      0.71      0.73       111
weighted avg       0.83      0.82      0.80       111



In [None]:
rf = RandomForestClassifier(n_jobs=-1 ,max_features= 'sqrt', oob_score = True)
param_grid = {
    'n_estimators': [200, 700]
}

rfc = GridSearchCV(estimator=rf, param_grid=param_grid, cv= 5)
rfc.fit(X_train,y_train)
print(classification_report(y_test,rfc.predict(X_test)))

print(rfc.best_params_)


              precision    recall  f1-score   support

           0       0.85      0.90      0.87        79
           1       0.70      0.59      0.64        32

    accuracy                           0.81       111
   macro avg       0.77      0.75      0.76       111
weighted avg       0.80      0.81      0.81       111

{'n_estimators': 700}
              precision    recall  f1-score   support

           0       0.85      0.90      0.87        79
           1       0.70      0.59      0.64        32

    accuracy                           0.81       111
   macro avg       0.77      0.75      0.76       111
weighted avg       0.80      0.81      0.81       111



0.1891891891891892

In [None]:
rf = RandomForestClassifier(n_jobs=-1 ,n_estimators=700, oob_score = True, max_features='sqrt')
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87        79
           1       0.70      0.59      0.64        32

    accuracy                           0.81       111
   macro avg       0.77      0.75      0.76       111
weighted avg       0.80      0.81      0.81       111



In [None]:
gcf = GaussianNB()
gcf.fit(X_train,y_train)


gcf_predictions = gcf.predict(X_test)
print(classification_report(y_test, gcf_predictions))
mean_squared_error(y_test,gcf_predictions)


              precision    recall  f1-score   support

           0       0.80      0.90      0.85        79
           1       0.64      0.44      0.52        32

    accuracy                           0.77       111
   macro avg       0.72      0.67      0.68       111
weighted avg       0.75      0.77      0.75       111



0.23423423423423423

VOTING CLASSIFIER

In [None]:
#If we use max_iter > 1100 then accuracy degrades

voting = VotingClassifier(estimators = [('lr', lr), ('rf', rf), ('svc',svc), ('gnb',gcf)],voting= "hard")
voting.fit(X_train,y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=10.0, penalty='l1',
                                                 solver='liblinear')),
                             ('rf',
                              RandomForestClassifier(max_features='sqrt',
                                                     n_estimators=700,
                                                     n_jobs=-1,
                                                     oob_score=True)),
                             ('svc', SVC(C=10, gamma=0.0001)),
                             ('gnb', GaussianNB())])

In [None]:
print(classification_report(voting.predict(X_test),y_test))

              precision    recall  f1-score   support

           0       0.97      0.80      0.88        96
           1       0.41      0.87      0.55        15

    accuracy                           0.81       111
   macro avg       0.69      0.83      0.72       111
weighted avg       0.90      0.81      0.84       111



In [None]:
vote_pred=voting.predict(X_test)
mean_squared_error(y_test,vote_pred)

0.1891891891891892

Weighted Average

In [None]:
lr=LogisticRegression(solver='liblinear', C=10.0, penalty='l1')
rf = RandomForestClassifier(n_jobs=-1 ,n_estimators=700, oob_score = True, max_features='sqrt')
svc=svm.SVC(gamma=0.0001, C=10, kernel='rbf')
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
svc.fit(X_train, y_train)
final_predict = 0.5*lr.predict(X_test) + 0.2*rf.predict(X_test) +  0.3*svc.predict(X_test)
#print(final_predict)
for i in range(len(final_predict)):
    if final_predict[i] >= 0.5 :
        final_predict[i] = 1
    else:
        final_predict[i] = 0
#final_predict
print(classification_report(y_test,final_predict))

              precision    recall  f1-score   support

           0       0.81      0.96      0.88        79
           1       0.82      0.44      0.57        32

    accuracy                           0.81       111
   macro avg       0.82      0.70      0.73       111
weighted avg       0.81      0.81      0.79       111



Stacking

In [None]:
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli

In [None]:
models = [AdaBoostClassifier(n_estimators=50,learning_rate=1),
         NGBClassifier(Dist=Bernoulli),
         xgb.XGBClassifier(learning_rate = 0.01,n_estimators= 2000,
                           max_depth= 9,
                            min_child_weight= 2,
                            #gamma=1,
                             gamma=0.4,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             objective= 'binary:logistic',
                              nthread= -1,
                              scale_pos_weight=1)]
         #KNeighborsClassifier(n_neighbors=5,n_jobs=-1)]
level1_train,level1_test = stacking(models,X_train,y_train,X_test,
                                   regression=False,
                                   mode='oof_pred_bag',
                                   needs_proba=False,
                                   save_dir=None,
                                   metric=accuracy_score,
                                   n_folds=4,
                                   stratified=True,
                                   shuffle=True,
                                   random_state=0,
                                   verbose=2)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [AdaBoostClassifier]
    fold  0:  [0.64864865]
    fold  1:  [0.77477477]
    fold  2:  [0.74774775]
    fold  3:  [0.70909091]
    ----
    MEAN:     [0.72006552] + [0.04738201]
    FULL:     [0.72009029]

model  1:     [NGBClassifier]
[iter 0] loss=0.6067 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.3243 val_loss=0.0000 scale=2.0000 norm=2.9798
[iter 200] loss=0.2989 val_loss=0.0000 scale=1.0000 norm=1.4621
[iter 300] loss=0.2926 val_loss=0.0000 scale=0.5000 norm=0.7278
[iter 400] loss=0.2903 val_loss=0.0000 scale=0.5000 norm=0.7270
    fold  0:  [0.76576577]
[iter 0] loss=0.6041 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.3438 val_loss=0.0000 scale=2.0000 norm=3.0533
[iter 200] loss=0.3068 val_loss=0.0000 scale=1.0000 norm=1.4878
[iter 300] loss=0.2937 val_loss=0.0000 scale=1.0000 norm=1.4739
[iter 400] loss=0.287

In [None]:
model = xgb.XGBClassifier(learning_rate = 0.1,n_estimators= 2000,max_depth= 3)
model.fit(level1_train,y_train)
predicted = model.predict(level1_test)
print(classification_report(predicted,y_test))

              precision    recall  f1-score   support

           0       0.91      0.84      0.87        86
           1       0.56      0.72      0.63        25

    accuracy                           0.81       111
   macro avg       0.74      0.78      0.75       111
weighted avg       0.83      0.81      0.82       111



In [None]:
X = df.drop(['Outcome'], axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

xgbc = xgb.XGBClassifier(
 learning_rate = 0.01,
 n_estimators= 2000,
 max_depth= 9,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.4,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(X_train, y_train)

In [None]:

print(classification_report(y_test,xgbc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87        79
           1       0.71      0.53      0.61        32

    accuracy                           0.80       111
   macro avg       0.77      0.72      0.74       111
weighted avg       0.79      0.80      0.79       111



Bagging Decision Tree

In [None]:
dt = DecisionTreeClassifier(random_state=42)
params = {
    'max_depth': [2, 3, 5, 10, 20, 30],
    'min_samples_leaf': [5, 10, 20, 50, 100, 200, 400],
    'criterion': ["gini", "entropy","log_loss"],

}
X = df.drop(['Outcome'], axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
dtcv = GridSearchCV(estimator=dt, param_grid=params, cv= 5, n_jobs=-1, verbose=1, scoring = "accuracy")
dtcv.fit(X_train,y_train)
print(classification_report(y_test,dtcv.predict(X_test)))

#rf = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=100, oob_score = True, max_features)
print(dtcv.best_params_)
dtcv_predictions = dtcv.predict(X_test)
mean_squared_error(y_test,dtcv_predictions)

Fitting 5 folds for each of 126 candidates, totalling 630 fits
              precision    recall  f1-score   support

           0       0.81      0.92      0.86        79
           1       0.71      0.47      0.57        32

    accuracy                           0.79       111
   macro avg       0.76      0.70      0.71       111
weighted avg       0.78      0.79      0.78       111

{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 20}


0.2072072072072072

In [None]:
X = df.drop(['Outcome'], axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
bagging = BaggingClassifier(DecisionTreeClassifier(random_state=42, criterion = 'entropy',max_depth=10, min_samples_leaf=20 ))
bagging.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                        max_depth=10,
                                                        min_samples_leaf=20,
                                                        random_state=42))

In [None]:
y_pred=bagging.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88        79
           1       0.80      0.50      0.62        32

    accuracy                           0.82       111
   macro avg       0.81      0.72      0.75       111
weighted avg       0.82      0.82      0.81       111

