In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
# import numpy as np
import time

### Q2.) Ensamble Approachs Algorithms

In [2]:
## Decision Tree Algorithm

from sklearn.tree import DecisionTreeClassifier

begin_time = time.time()
tree = DecisionTreeClassifier(criterion = 'gini', max_depth = None, random_state = 1)

In [3]:
## Bagging Algorithm

from sklearn.ensemble import BaggingClassifier

begin_time = time.time()
bag = BaggingClassifier(base_estimator = tree, n_estimators = 500, max_samples = 1.0, max_features = 1.0, bootstrap = True, bootstrap_features = False, n_jobs = 1, random_state = 1)


In [4]:
## Random Forest Algorithm

from sklearn.ensemble import RandomForestClassifier

begin_time = time.time()
forest = RandomForestClassifier(criterion = 'gini', n_estimators = 100, bootstrap = True, random_state = 1, n_jobs = 2)

In [5]:
## AdaBoost Algorithm

from sklearn.ensemble import AdaBoostClassifier

begin_time = time.time()
ada = AdaBoostClassifier(base_estimator = tree, n_estimators = 500, learning_rate = 0.1, random_state = 1)

### Q3 & Q4.) Testing the Algorithms with the datasets

#### Digits Dataset

In [6]:
digits = datasets.load_digits()
X = digits.data
y = digits.target
df = pd.DataFrame(X)
df.shape
df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)


## Decision Tree Algorithm

tree_train = tree.fit(X_train_std, y_train)
y_pred = tree.predict(X_test_std)
tree_test = accuracy_score(y_test, y_pred)
end_time = time.time()
print("Decision Tree Test Accuracy: ", tree_test)
print("Time taken: ", end_time - begin_time)


## Bagging Algorithm

bag_train = bag.fit(X_train_std, y_train)
y_pred = bag.predict(X_test_std)
bag_test = accuracy_score(y_test, y_pred)
end_time = time.time()
print("Bagging Test Accuracy: ", bag_test)
print("Time taken: ", end_time - begin_time)


## Random Forest Algorithm

forest_train = forest.fit(X_train_std, y_train)
y_pred = forest.predict(X_test_std)
forest_test = accuracy_score(y_test, y_pred)
end_time = time.time()
print("Random Forest Test Accuracy: ", forest_test)
print("Time taken: ", end_time - begin_time)


## AdaBoost Algorithm

ada_train = ada.fit(X_train_std, y_train)
y_pred = ada.predict(X_test_std)
ada_test = accuracy_score(y_test, y_pred)
end_time = time.time()
print("AdaBoost Test Accuracy: ", ada_test)
print("Time taken: ", end_time - begin_time)

Decision Tree Test Accuracy:  0.8666666666666667
Time taken:  0.17541933059692383
Bagging Test Accuracy:  0.9611111111111111
Time taken:  6.901275396347046
Random Forest Test Accuracy:  0.9638888888888889
Time taken:  7.177224636077881
AdaBoost Test Accuracy:  0.8416666666666667
Time taken:  7.200253009796143


#### Mammographic Mass Dataset

In [7]:
## Pre-Processing the data

mammo = pd.read_csv('mammographic_masses.data', header = None, na_values = '?')
df = pd.DataFrame(mammo)
df.columns = ['BI-RADS', 'age', 'shape', 'margin', 'density', 'severity']
df.info()
df.isnull().sum()
df.dropna(inplace = True)
df.shape
df.head()
df.info()
df
# df.to_csv('mammographic_masses_cleaned.csv', index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961 entries, 0 to 960
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   BI-RADS   959 non-null    float64
 1   age       956 non-null    float64
 2   shape     930 non-null    float64
 3   margin    913 non-null    float64
 4   density   885 non-null    float64
 5   severity  961 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 45.2 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 830 entries, 0 to 960
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   BI-RADS   830 non-null    float64
 1   age       830 non-null    float64
 2   shape     830 non-null    float64
 3   margin    830 non-null    float64
 4   density   830 non-null    float64
 5   severity  830 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 45.4 KB


Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
8,5.0,57.0,1.0,5.0,3.0,1
10,5.0,76.0,1.0,4.0,3.0,1
...,...,...,...,...,...,...
956,4.0,47.0,2.0,1.0,3.0,0
957,4.0,56.0,4.0,5.0,3.0,1
958,4.0,64.0,4.0,5.0,3.0,0
959,5.0,66.0,4.0,5.0,3.0,1


In [8]:
## Standardizing the data

df2 = df[df['BI-RADS'] == 55.0]
df = df[df['BI-RADS'] != 55.0]
X1 = df.iloc[:, 1:6]
y1 = df.iloc[:, 0]
y1 = y1.astype('object')
# print(y1.unique())
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size = 0.4, random_state = 1, stratify = y1)
X_train = X_train.append(df2.iloc[:, 1:6])
y_train = y_train.append(df2.iloc[:, 0])
y_train = y_train.astype('int')
y_test = y_test.astype('int')
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)


## Decision Tree Algorithm

tree_train = tree.fit(X_train_std, y_train)
y_pred = tree.predict(X_test_std)
tree_test = accuracy_score(y_test, y_pred)
end_time = time.time()
print("Decision Tree Test Accuracy: ", tree_test)
print("Time taken: ", end_time - begin_time)


## Bagging Algorithm

bag_train = bag.fit(X_train_std, y_train)
y_pred = bag.predict(X_test_std)
bag_test = accuracy_score(y_test, y_pred)
end_time = time.time()
print("Bagging Test Accuracy: ", bag_test)
print("Time taken: ", end_time - begin_time)


## Random Forest Algorithm

forest_train = forest.fit(X_train_std, y_train)
y_pred = forest.predict(X_test_std)
forest_test = accuracy_score(y_test, y_pred)
end_time = time.time()
print("Random Forest Test Accuracy: ", forest_test)
print("Time taken: ", end_time - begin_time)


## AdaBoost Algorithm

ada_train = ada.fit(X_train_std, y_train)
y_pred = ada.predict(X_test_std)
ada_test = accuracy_score(y_test, y_pred)
end_time = time.time()
print("AdaBoost Test Accuracy: ", ada_test)
print("Time taken: ", end_time - begin_time)

Decision Tree Test Accuracy:  0.677710843373494
Time taken:  7.399904012680054
Bagging Test Accuracy:  0.7259036144578314
Time taken:  8.31338882446289
Random Forest Test Accuracy:  0.7349397590361446
Time taken:  8.545458316802979
AdaBoost Test Accuracy:  0.6897590361445783
Time taken:  9.780546188354492
