# Import Relevant Packages, Classes and Functions

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Import Dataset

In [3]:
raw_data=pd.read_csv("../Data Preprocessing/Outliers Removed Dataset.csv",sep=",")

In [4]:
raw_data

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,5.971861,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.05786,6.635246,252.848888,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,252.848888,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
2881,6.069616,186.659040,26138.78019,7.747547,345.700257,415.886955,12.067620,60.419921,3.669712,1
2882,7.808856,193.553212,17329.80216,8.061362,256.321194,392.449580,19.903225,63.676082,2.798243,1
2883,9.419510,175.762646,33155.57822,7.350233,256.321194,432.044783,11.039070,69.845400,3.298875,1
2884,5.126763,230.603758,11983.86938,6.303357,256.321194,402.883113,11.168946,77.488213,4.708658,1


# Exploring the Dataset

In [5]:
print(raw_data.shape)

(2886, 10)


In [6]:
#the first 5 raws of data frame
raw_data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,5.971861,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.05786,6.635246,252.848888,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,252.848888,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [7]:
# data types of each column
raw_data.dtypes

ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object

In [8]:
#checking whether there are null values
raw_data.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

# Data Transformation

## 01). Split of Input(x) and Output(y) 

In [9]:
#converting dataframe into numpy array
raw_data=raw_data.to_numpy()

In [10]:
x,y=raw_data[:,:-1],raw_data[:,-1]

In [11]:
print(x)
print(x.shape)

[[5.97186050e+00 2.04890456e+02 2.07913190e+04 ... 1.03797831e+01
  8.69909705e+01 2.96313538e+00]
 [3.71608007e+00 1.29422921e+02 1.86300579e+04 ... 1.51800131e+01
  5.63290763e+01 4.50065627e+00]
 [8.09912419e+00 2.24236259e+02 1.99095417e+04 ... 1.68686369e+01
  6.64200925e+01 3.05593375e+00]
 ...
 [9.41951032e+00 1.75762646e+02 3.31555782e+04 ... 1.10390697e+01
  6.98454003e+01 3.29887550e+00]
 [5.12676292e+00 2.30603758e+02 1.19838694e+04 ... 1.11689462e+01
  7.74882131e+01 4.70865847e+00]
 [7.87467136e+00 1.95102299e+02 1.74041771e+04 ... 1.61403676e+01
  7.86984463e+01 2.30914906e+00]]
(2886, 9)


In [12]:
print(y)
print(y.shape)

[0. 0. 0. ... 1. 1. 1.]
(2886,)


## 02). Train Set and Test Set Splitting

In [13]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30)

In [14]:
print(x_test.shape)
print(x_train.shape)
x_train

(866, 9)
(2020, 9)


array([[5.97186050e+00, 2.22980630e+02, 3.22372454e+04, ...,
        9.84243325e+00, 7.72419842e+01, 4.00663157e+00],
       [8.04014417e+00, 2.24863482e+02, 6.87976465e+03, ...,
        1.24059131e+01, 7.32179902e+01, 3.98645981e+00],
       [9.79455870e+00, 1.29870958e+02, 2.06827317e+04, ...,
        1.18532281e+01, 7.87114694e+01, 2.94541874e+00],
       ...,
       [8.55107775e+00, 2.16415932e+02, 2.89092903e+04, ...,
        1.62023714e+01, 7.98593229e+01, 4.04637969e+00],
       [5.97186050e+00, 2.37677383e+02, 2.60090753e+04, ...,
        1.44422383e+01, 7.26962405e+01, 3.70660179e+00],
       [5.97186050e+00, 1.90929256e+02, 2.06614111e+04, ...,
        9.22598527e+00, 6.00235228e+01, 4.96914973e+00]])

In [15]:
x_test

array([[6.39656993e+00, 2.43724707e+02, 1.22789853e+04, ...,
        1.76380395e+01, 6.27527643e+01, 4.18824971e+00],
       [7.41328834e+00, 1.96452303e+02, 1.74653456e+04, ...,
        1.31780689e+01, 8.06047763e+01, 3.29010591e+00],
       [6.14918517e+00, 1.50563593e+02, 2.05963912e+04, ...,
        1.28293802e+01, 6.43949066e+01, 4.27561487e+00],
       ...,
       [7.02550433e+00, 2.05400529e+02, 2.18845385e+04, ...,
        1.58437437e+01, 6.55803377e+01, 2.94155512e+00],
       [7.83398048e+00, 2.03169408e+02, 1.11742748e+04, ...,
        1.62765919e+01, 4.79316388e+01, 4.25447718e+00],
       [7.70810999e+00, 1.88997850e+02, 1.54562201e+04, ...,
        1.60901935e+01, 5.72857632e+01, 4.89662591e+00]])

In [16]:
print(y_train.shape)
print(y_test.shape)


(2020,)
(866,)


## 03). Normalization (Min-Max Scalling)

In [17]:
# rescale all values between 0-1
scaler=MinMaxScaler(feature_range=(0,1))
x_train_scaled=scaler.fit_transform(x_train)
print(x_train_scaled.shape)
print(x_train)
print(x_train_scaled)

(2020, 9)
[[5.97186050e+00 2.22980630e+02 3.22372454e+04 ... 9.84243325e+00
  7.72419842e+01 4.00663157e+00]
 [8.04014417e+00 2.24863482e+02 6.87976465e+03 ... 1.24059131e+01
  7.32179902e+01 3.98645981e+00]
 [9.79455870e+00 1.29870958e+02 2.06827317e+04 ... 1.18532281e+01
  7.87114694e+01 2.94541874e+00]
 ...
 [8.55107775e+00 2.16415932e+02 2.89092903e+04 ... 1.62023714e+01
  7.98593229e+01 4.04637969e+00]
 [5.97186050e+00 2.37677383e+02 2.60090753e+04 ... 1.44422383e+01
  7.26962405e+01 3.70660179e+00]
 [5.97186050e+00 1.90929256e+02 2.06614111e+04 ... 9.22598527e+00
  6.00235228e+01 4.96914973e+00]]
[[0.3731538  0.67368526 0.73139228 ... 0.24434734 0.62910106 0.50356204]
 [0.65792634 0.68631424 0.15030161 ... 0.38899459 0.57726974 0.49874098]
 [0.89948367 0.04916446 0.46660966 ... 0.35780871 0.64802886 0.2499315 ]
 ...
 [0.72827445 0.62965342 0.65512904 ... 0.60321403 0.66281386 0.51306187]
 [0.3731538  0.77226179 0.58866787 ... 0.50389653 0.57054931 0.43185473]
 [0.3731538  0.45870

In [18]:
x_test_scaled=scaler.fit_transform(x_test)
print(x_test_scaled.shape)
print(x_test)
print(x_test_scaled)

(866, 9)
[[6.39656993e+00 2.43724707e+02 1.22789853e+04 ... 1.76380395e+01
  6.27527643e+01 4.18824971e+00]
 [7.41328834e+00 1.96452303e+02 1.74653456e+04 ... 1.31780689e+01
  8.06047763e+01 3.29010591e+00]
 [6.14918517e+00 1.50563593e+02 2.05963912e+04 ... 1.28293802e+01
  6.43949066e+01 4.27561487e+00]
 ...
 [7.02550433e+00 2.05400529e+02 2.18845385e+04 ... 1.58437437e+01
  6.55803377e+01 2.94155512e+00]
 [7.83398048e+00 2.03169408e+02 1.11742748e+04 ... 1.62765919e+01
  4.79316388e+01 4.25447718e+00]
 [7.70810999e+00 1.88997850e+02 1.54562201e+04 ... 1.60901935e+01
  5.72857632e+01 4.89662591e+00]]
[[0.43039653 0.80682936 0.25815056 ... 0.74853559 0.46092911 0.54782225]
 [0.57584944 0.48514537 0.38067753 ... 0.47195308 0.69473744 0.33534739]
 [0.39500538 0.17287729 0.45464801 ... 0.45032936 0.4824363  0.56849032]
 ...
 [0.52037261 0.54603716 0.48508029 ... 0.63726337 0.49796192 0.25289036]
 [0.63603415 0.5308546  0.23205194 ... 0.6641062  0.2668164  0.56348975]
 [0.61802697 0.434418

# The Model-01

## 01). Import Relevant Packages, Classes and Functions

In [110]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

## 02). Create the Model

In [186]:
#base estimators
#create decison tree object
dt=DecisionTreeClassifier(max_leaf_nodes=20,random_state=0)

In [175]:
# Create adaboost classifer object
# n_estimators=> Number of week learners
abc = AdaBoostClassifier(n_estimators=200,estimator=dt,learning_rate=0.01)

## 03). Train the Model

In [176]:
model = abc.fit(x_train_scaled, y_train)

## 04). Make Predictions

In [177]:
y_pred = model.predict(x_test_scaled)

# Evaluate the Model

In [178]:
acc=metrics.accuracy_score(y_test, y_pred)

In [179]:
print(f"Accuracy: {acc}")

Accuracy: 0.651270207852194


# The Model-02
## 01). Import Relevant Packages, Classes and Functions

In [180]:
from sklearn.ensemble import RandomForestClassifier

## 02). Create the Model

In [181]:
#base estimator
rfc=RandomForestClassifier(n_estimators=100,max_depth=10,random_state=42)

In [182]:
# Create adaboost classifer object
# n_estimators=> Number of week learners
abc = AdaBoostClassifier(n_estimators=200,estimator=rfc,learning_rate=0.01)

## 03). Train the Model

In [183]:
model = abc.fit(x_train_scaled, y_train)

## 04). Make Predictions

In [184]:
y_pred = model.predict(x_test_scaled)

# Evaluate the Model

In [185]:
acc=metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc}")

Accuracy: 0.6235565819861432
