## Using different Ensemble techniques

## Bagging Ensemble

In [9]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import KFold # for sub samples
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read Company_Data.csv (path is relative to the notebook's working directory)
# into a DataFrame, then display it as the cell output.
data = pd.read_csv('Company_Data.csv')
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [3]:
# One-hot encode the string-valued columns (ShelveLoc, Urban, US); numeric
# columns are passed through unchanged.
# dtype is pinned to uint8 because pandas >= 2.0 changed the get_dummies default
# from uint8 to bool, which would render the indicator columns as True/False
# instead of the 0/1 values recorded in this notebook.
data = pd.get_dummies(data, dtype='uint8')

In [4]:
# Display the frame after pd.get_dummies to inspect the new indicator columns.
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes,US_No,US_Yes
0,9.50,138,73,11,276,120,42,17,1,0,0,0,1,0,1
1,11.22,111,48,16,260,83,65,10,0,1,0,0,1,0,1
2,10.06,113,35,10,269,80,59,12,0,0,1,0,1,0,1
3,7.40,117,100,4,466,97,55,14,0,0,1,0,1,0,1
4,4.15,141,64,3,340,128,38,13,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,33,14,0,1,0,0,1,0,1
396,6.14,139,23,3,37,120,55,11,0,0,1,1,0,0,1
397,7.41,162,26,12,368,159,40,18,0,0,1,0,1,0,1
398,5.94,100,79,7,284,95,50,12,1,0,0,0,1,0,1


In [5]:
# Turn the numeric Sales column into a two-class target:
# 'High Sales' when Sales >= threshold, otherwise 'Low Sales'.
# NOTE(review): 7.49 is kept from the original cell; presumably it sits near the
# centre of the Sales distribution -- confirm the intended cut-off.
SALES_THRESHOLD = 7.49

# Convert only while Sales is still numeric, so re-running this cell is a no-op
# instead of a TypeError from comparing the string labels with a float.
if pd.api.types.is_numeric_dtype(data['Sales']):
    # Vectorised comparison replaces the per-row .iloc lookup; a missing value
    # compares False and is therefore labelled 'Low Sales', as before.
    data['Sales'] = (
        data['Sales']
        .ge(SALES_THRESHOLD)
        .map({True: 'High Sales', False: 'Low Sales'})
    )
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes,US_No,US_Yes
0,High Sales,138,73,11,276,120,42,17,1,0,0,0,1,0,1
1,High Sales,111,48,16,260,83,65,10,0,1,0,0,1,0,1
2,High Sales,113,35,10,269,80,59,12,0,0,1,0,1,0,1
3,Low Sales,117,100,4,466,97,55,14,0,0,1,0,1,0,1
4,Low Sales,141,64,3,340,128,38,13,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,High Sales,138,108,17,203,128,33,14,0,1,0,0,1,0,1
396,Low Sales,139,23,3,37,120,55,11,0,0,1,1,0,0,1
397,Low Sales,162,26,12,368,159,40,18,0,0,1,0,1,0,1
398,Low Sales,100,79,7,284,95,50,12,1,0,0,0,1,0,1


In [6]:
# Separate the class label from the predictors.
# Columns are selected by name rather than by position so the split stays
# correct even if the column order of `data` changes.
y = data['Sales']                 # target: 'High Sales' / 'Low Sales'
x = data.drop(columns='Sales')    # features: every remaining column

In [7]:
# Bagging: 500 decision trees, each fitted on a bootstrap sample, evaluated with
# shuffled 10-fold cross-validation.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
cart = DecisionTreeClassifier()
num_trees = 500
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the positional form works on both sides of that change.
# random_state is fixed (it was None) so the bootstrap draws, and therefore the
# reported score, are reproducible from a fresh kernel.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=42)
results = cross_val_score(model, x, y, cv=kfold)
# Mean of the per-fold scores (accuracy is the default metric for classifiers).
print(results.mean())

0.8150000000000001


## Random Forest Ensemble

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 500 trees, each split choosing among 3 randomly drawn features,
# evaluated with shuffled 10-fold cross-validation.
num_trees = 500
max_features = 3
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
# random_state is set so that bootstrap sampling and feature sub-sampling are
# repeatable; without it the printed score changes on every run.
model = RandomForestClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=42,
)
results = cross_val_score(model, x, y, cv=kfold)
# Mean of the per-fold scores (accuracy is the default metric for classifiers).
print(results.mean())

0.805


## AdaBoost Ensemble

In [17]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with up to 500 boosting rounds, scored by shuffled 10-fold
# cross-validation; both the splitter and the booster are seeded.
num_trees = 500
kfold = KFold(shuffle=True, n_splits=10, random_state=42)
model = AdaBoostClassifier(random_state=42, n_estimators=num_trees)
results = cross_val_score(estimator=model, X=x, y=y, cv=kfold)
# Report the score averaged over the ten folds.
print(results.mean())

0.85


## Voting Ensemble with Logistic Regression, Decision Tree, and SVC

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

kfold = KFold(n_splits=10, random_state=42, shuffle=True)

# Sub-models combined by the ensemble. Note that this is a voting ensemble
# (VotingClassifier), not stacking: no meta-learner is trained on the
# sub-model outputs.
model1 = LogisticRegression(max_iter=500)
# Seeded so the tree, and with it the ensemble score, is reproducible.
model2 = DecisionTreeClassifier(random_state=42)
model3 = SVC()
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]
# NOTE(review): the features are not scaled before LogisticRegression / SVC;
# consider wrapping those two in a scaling pipeline -- confirm whether the
# unscaled behaviour is intended.

# Majority ("hard") voting over the predicted class labels; this is the
# library default, spelled out here for clarity.
ensemble = VotingClassifier(estimators, voting='hard')
results = cross_val_score(ensemble, x, y, cv=kfold)
# Mean of the per-fold scores (accuracy is the default metric for classifiers).
print(results.mean())

0.8475000000000001
