In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier,VotingRegressor,BaggingClassifier, BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,ElasticNet, LogisticRegression
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, OrdinalEncoder
from sklearn.compose import make_column_selector,make_column_transformer
from sklearn.metrics import classification_report,f1_score,log_loss,accuracy_score,r2_score,log_loss

<h1 style = color:orange>Voting Classifier</h1>

In [36]:
sonar = pd.read_csv('../Datasets/Sonar.csv')

le = LabelEncoder()

X = sonar.drop('Class',axis = 1)
y = sonar['Class']

y_le = le.fit_transform(y)

X_train,X_test,y_train,y_test = train_test_split(X,y_le,test_size=0.3,stratify=y,random_state=25)

dtc1 = DecisionTreeClassifier(random_state=25)
dtc2 = DecisionTreeClassifier(random_state=25,max_depth=3)

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)

nb = GaussianNB()

voting = VotingClassifier(estimators=[('tree1',dtc1),('tree2',dtc2),('knn1',knn1),('knn2',knn2),('nb',nb)],voting='soft')  # model esembling

voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print(classification_report(y_test,y_pred))

#evaluating individual estimators 
for i in range(len(voting.estimators_)):
    print('Estimator : ',voting.estimators_[i])
    print('Accuracy Score = ',accuracy_score(y_test,voting.estimators_[i].predict(X_test)))

              precision    recall  f1-score   support

           0       0.68      0.88      0.77        34
           1       0.79      0.52      0.62        29

    accuracy                           0.71        63
   macro avg       0.74      0.70      0.70        63
weighted avg       0.73      0.71      0.70        63

Estimator :  DecisionTreeClassifier(random_state=25)
Accuracy Score =  0.6984126984126984
Estimator :  DecisionTreeClassifier(max_depth=3, random_state=25)
Accuracy Score =  0.6666666666666666
Estimator :  KNeighborsClassifier()
Accuracy Score =  0.746031746031746
Estimator :  KNeighborsClassifier(n_neighbors=3)
Accuracy Score =  0.8095238095238095
Estimator :  GaussianNB()
Accuracy Score =  0.6349206349206349


<h1 style = color:orange>Soft Voting</h1>

In [34]:
sonar = pd.read_csv('../Datasets/Sonar.csv')

# le = LabelEncoder()

X = sonar.drop('Class',axis = 1)
y = sonar['Class']

y_le = le.fit_transform(y)

X_train,X_test,y_train,y_test = train_test_split(X,y_le,test_size=0.3,stratify=y,random_state=25)

dtc1 = DecisionTreeClassifier(random_state=25)
dtc2 = DecisionTreeClassifier(random_state=25,max_depth=3)

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)

nb = GaussianNB()


voting = VotingClassifier(estimators=[('tree1',dtc1),('tree2',dtc2),('knn1',knn1),('knn2',knn2),('nb',nb)],voting='soft')  # model esembling

voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print(classification_report(y_test,y_pred))



              precision    recall  f1-score   support

           0       0.68      0.88      0.77        34
           1       0.79      0.52      0.62        29

    accuracy                           0.71        63
   macro avg       0.74      0.70      0.70        63
weighted avg       0.73      0.71      0.70        63



<h1 style = color:orange>Weighted Average</h1>

In [38]:
sonar = pd.read_csv('../Datasets/Sonar.csv')

# le = LabelEncoder()

X = sonar.drop('Class',axis = 1)
y = sonar['Class']

y_le = le.fit_transform(y)

X_train,X_test,y_train,y_test = train_test_split(X,y_le,test_size=0.3,stratify=y,random_state=25)

dtc1 = DecisionTreeClassifier(random_state=25)
dtc2 = DecisionTreeClassifier(random_state=25,max_depth=3)

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)

nb = GaussianNB()


voting = VotingClassifier(estimators=[('tree1',dtc1),('tree2',dtc2),('knn1',knn1),('knn2',knn2),('nb',nb)],voting='soft',weights=[2,1,3,4,1])  # model esembling

voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.73      0.97      0.84        34
           1       0.94      0.59      0.72        29

    accuracy                           0.79        63
   macro avg       0.84      0.78      0.78        63
weighted avg       0.83      0.79      0.78        63



<h1>HR Dataset</h1>

In [45]:
ohe = OneHotEncoder(sparse_output=False,drop='first',).set_output(transform='pandas')
oe = OrdinalEncoder(categories=[['low','medium','high']]).set_output(transform='pandas')

hr = pd.read_csv('../Cases/HRAnalytics/HR_comma_sep.csv')

column_transform = make_column_transformer((ohe,['Department']),(oe,['salary']),remainder='passthrough',verbose_feature_names_out=False).set_output(transform='pandas')

X = hr.drop('left',axis = 1)
y = hr['left']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=25,stratify=y)

X_train_ct = column_transform.fit_transform(X_train)
X_test_ct =column_transform.transform(X_test)


dtc1 = DecisionTreeClassifier(random_state=25)
dtc2 = DecisionTreeClassifier(random_state=25,max_depth=3)

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)

nb = GaussianNB()


voting = VotingClassifier(estimators=[('tree1',dtc1),('tree2',dtc2),('knn1',knn1),('knn2',knn2),('nb',nb)])  # model esembling

voting.fit(X_train_ct,y_train)
y_pred = voting.predict(X_test_ct)
print(classification_report(y_test,y_pred))
print(log_loss(y_test, voting.predict(X_test_ct)))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      3429
           1       0.90      0.95      0.93      1070

    accuracy                           0.96      4499
   macro avg       0.94      0.96      0.95      4499
weighted avg       0.97      0.96      0.96      4499

1.2978599353271791


In [44]:
voting = VotingClassifier(estimators=[('tree1',dtc1),('tree2',dtc2),('knn1',knn1),('knn2',knn2),('nb',nb)], voting='soft')  # model esembling

voting.fit(X_train_ct,y_train)
y_pred = voting.predict(X_test_ct)
print(classification_report(y_test,y_pred))
print(log_loss(y_test, voting.predict(X_test_ct)))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3429
           1       0.93      0.95      0.94      1070

    accuracy                           0.97      4499
   macro avg       0.96      0.97      0.96      4499
weighted avg       0.97      0.97      0.97      4499

0.9934236542010508


<h1 style = color:orange>Voting Regressor</h1>

In [82]:
concrete = pd.read_csv('../Cases/Concrete_Strength/Concrete_Data.csv')

X = concrete.drop('Strength', axis =1)
y = concrete['Strength']

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=25,test_size=0.3)

dtr = DecisionTreeRegressor(random_state = 25)
dtr2 = DecisionTreeRegressor(random_state = 25,max_depth=5)

lr  = LinearRegression()
en = ElasticNet()

voting = VotingRegressor(estimators=[('Tree',dtr),('Tree2',dtr2),('LR',lr),("EL",en)],weights=[10,5,1,2])
voting.fit(X_train,y_train)

y_pred = voting.predict(X_test)
print(r2_score(y_test,y_pred))

for i in range(len(voting.estimators_)):
    print(voting.estimators_[i], ':',end = ' ')
    print(r2_score(y_test,voting.estimators_[i].predict(X_test)))
    


0.8550749354114082
DecisionTreeRegressor(random_state=25) : 0.8127760533837747
DecisionTreeRegressor(max_depth=5, random_state=25) : 0.7311101169515943
LinearRegression() : 0.6351839142464111
ElasticNet() : 0.6345321364921961


<h1 style = 'color: orange'>Bagging</h1>

In [102]:
sonar = pd.read_csv('../Datasets/Sonar.csv')

le = LabelEncoder()

X = sonar.drop('Class',axis = 1)
y = sonar['Class']

# y_le = le.fit_transform(y)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,stratify=y,random_state=25)

dtc1 = DecisionTreeClassifier(random_state=25)
dtc2 = DecisionTreeClassifier(random_state=25,max_depth=3)

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)

nb = GaussianNB()

est_list = (dtc1,dtc2,knn1,knn2,nb)
n_est = [10,15,25,50]
scores = []

bagging = BaggingClassifier(estimator=nb,n_estimators=10)  # model esembling

bagging.fit(X_train,y_train)
y_pred = bagging.predict(X_test)
print(classification_report(y_test,y_pred))



for e in est_list:
    for n in n_est:
        bagging = BaggingClassifier(estimator=e,n_estimators=n)
        bagging.fit(X_train,y_train)
        y_pred = bagging.predict_proba(X_test)
        scores.append([e,n,log_loss(y_test,y_pred)])

scores = pd.DataFrame(scores,columns=['Estimators','Samples','Score'])
scores.sort_values('Score')



              precision    recall  f1-score   support

           M       0.62      0.74      0.68        34
           R       0.61      0.48      0.54        29

    accuracy                           0.62        63
   macro avg       0.62      0.61      0.61        63
weighted avg       0.62      0.62      0.61        63



Unnamed: 0,Estimators,Samples,Score
14,KNeighborsClassifier(n_neighbors=3),25,0.411369
15,KNeighborsClassifier(n_neighbors=3),50,0.426945
10,KNeighborsClassifier(),25,0.43548
12,KNeighborsClassifier(n_neighbors=3),10,0.437166
13,KNeighborsClassifier(n_neighbors=3),15,0.450139
8,KNeighborsClassifier(),10,0.45353
11,KNeighborsClassifier(),50,0.466235
9,KNeighborsClassifier(),15,0.468036
1,DecisionTreeClassifier(random_state=25),15,0.487108
3,DecisionTreeClassifier(random_state=25),50,0.496331


In [118]:
ohe = OneHotEncoder(sparse_output=False,drop='first',).set_output(transform='pandas')
oe = OrdinalEncoder(categories=[['low','medium','high']]).set_output(transform='pandas')

hr = pd.read_csv('../Cases/HRAnalytics/HR_comma_sep.csv')

column_transform = make_column_transformer((ohe,['Department']),(oe,['salary']),remainder='passthrough',verbose_feature_names_out=False).set_output(transform='pandas')

X = hr.drop('left',axis = 1)
y = hr['left']

X = column_transform.fit_transform(X)


X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=25,stratify=y)


dtc1 = DecisionTreeClassifier(random_state=25)
dtc2 = DecisionTreeClassifier(random_state=25,max_depth=7)

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)

nb = GaussianNB()

est_list = (dtc1,dtc2,knn1,knn2,nb)
n_est = [10,15,25,50]
scores = []

bagging = BaggingClassifier(estimator=dtc2,n_estimators=10)  # model esembling

bagging.fit(X_train,y_train)
y_pred = bagging.predict(X_test)
print(classification_report(y_test,y_pred))



for e in est_list:
    for n in n_est:
        bagging = BaggingClassifier(estimator=e,n_estimators=n)
        bagging.fit(X_train,y_train)
        y_pred = bagging.predict_proba(X_test)
        scores.append([e,n,log_loss(y_test,y_pred)])

scores = pd.DataFrame(scores,columns=['Estimators','Samples','Score'])
scores.sort_values('Score')
scores

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3429
           1       0.99      0.93      0.96      1070

    accuracy                           0.98      4499
   macro avg       0.98      0.97      0.97      4499
weighted avg       0.98      0.98      0.98      4499



Unnamed: 0,Estimators,Samples,Score
0,DecisionTreeClassifier(random_state=25),10,0.194617
1,DecisionTreeClassifier(random_state=25),15,0.171108
2,DecisionTreeClassifier(random_state=25),25,0.156765
3,DecisionTreeClassifier(random_state=25),50,0.134554
4,"DecisionTreeClassifier(max_depth=7, random_sta...",10,0.08899
5,"DecisionTreeClassifier(max_depth=7, random_sta...",15,0.08023
6,"DecisionTreeClassifier(max_depth=7, random_sta...",25,0.079041
7,"DecisionTreeClassifier(max_depth=7, random_sta...",50,0.07974
8,KNeighborsClassifier(),10,0.486537
9,KNeighborsClassifier(),15,0.446389


In [116]:
depths = [None, 3, 5, 7]
scores = []

for d in depths:
    dtc = DecisionTreeClassifier(random_state=25, max_depth=d)
    bagg = BaggingClassifier(random_state=25, n_estimators=50, estimator=dtc)
    bagg.fit(X_train, y_train)
    y_pred = bagg.predict_proba(X_test)
    scores.append(['dtc',d,log_loss(y_test,y_pred)])

scores = pd.DataFrame(scores,columns=['Estimators','depth','Score'])
scores.sort_values('Score')
scores

dtc = DecisionTreeClassifier(random_state=25, max_depth=7)
y_pred = bagg.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3429
           1       0.99      0.94      0.96      1070

    accuracy                           0.98      4499
   macro avg       0.98      0.97      0.97      4499
weighted avg       0.98      0.98      0.98      4499



In [139]:
concrete = pd.read_csv('../Cases/Concrete_Strength/Concrete_Data.csv')

X = concrete.drop('Strength', axis =1)
y = concrete['Strength']

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=25,test_size=0.3)

dtc = DecisionTreeRegressor(random_state=25)
knn = KNeighborsRegressor()
lr = LinearRegression()

est_list = [dtc, knn, lr]
n_est = [10, 15, 25, 50]
scores = []
for e in est_list:
    for n in n_est:
        bagging = BaggingRegressor(random_state=25, estimator=e,n_estimators=n)
        bagging.fit(X_train,y_train)
        y_pred = bagging.predict(X_test)
        scores.append([e,n,r2_score(y_test,y_pred)])

scores = pd.DataFrame(scores,columns=['Estimators','Samples','Score'])
scores.sort_values('Score', ascending=False)

Unnamed: 0,Estimators,Samples,Score
2,DecisionTreeRegressor(random_state=25),25,0.881644
3,DecisionTreeRegressor(random_state=25),50,0.880861
1,DecisionTreeRegressor(random_state=25),15,0.877123
0,DecisionTreeRegressor(random_state=25),10,0.876503
7,KNeighborsRegressor(),50,0.71214
6,KNeighborsRegressor(),25,0.708045
5,KNeighborsRegressor(),15,0.702541
4,KNeighborsRegressor(),10,0.696449
8,LinearRegression(),10,0.634298
9,LinearRegression(),15,0.634216


In [136]:
depths = []
scores = []

for d in depths:
    dtc = DecisionTreeRegressor(random_state=25, max_depth=d)
    bagg = BaggingRegressor(random_state=25, n_estimators=50, estimator=dtc)
    bagg.fit(X_train, y_train)
    y_pred = bagg.predict(X_test)
    scores.append(['dtc',d,r2_score(y_test,y_pred)])

scores = pd.DataFrame(scores,columns=['Estimators','depth','Score'])
scores.sort_values('Score', ascending=False)

Unnamed: 0,Estimators,depth,Score
14,dtc,15,0.884802
13,dtc,14,0.884309
10,dtc,11,0.8833
11,dtc,12,0.882118
12,dtc,13,0.882047
16,dtc,17,0.881662
15,dtc,16,0.881413
18,dtc,19,0.881394
9,dtc,10,0.880934
17,dtc,18,0.880266


<h1 style = 'color: orange'>OOB(Out Of Bag) score</h1>

In [140]:
concrete = pd.read_csv('../Cases/Concrete_Strength/Concrete_Data.csv')

X = concrete.drop('Strength', axis =1)
y = concrete['Strength']

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=25,test_size=0.3)

dtr = DecisionTreeRegressor(random_state = 25,max_depth=None)
dtr2 = DecisionTreeRegressor(random_state = 25,max_depth=5)

lr  = LinearRegression()
en = ElasticNet()

bagg = BaggingRegressor(random_state=25,n_estimators=25,estimator= dtr,oob_score=True)
bagg.fit(X_train,y_train)
print('OOB score :',bagg.oob_score_)

OOB score : 0.894564632485158
