# Import Relevant Packages, Classes and Functions

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Import Dataset

In [2]:
raw_data=pd.read_csv("Outliers Removed Dataset.csv",sep=",")

In [3]:
raw_data

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,5.971861,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.05786,6.635246,252.848888,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,252.848888,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
2881,6.069616,186.659040,26138.78019,7.747547,345.700257,415.886955,12.067620,60.419921,3.669712,1
2882,7.808856,193.553212,17329.80216,8.061362,256.321194,392.449580,19.903225,63.676082,2.798243,1
2883,9.419510,175.762646,33155.57822,7.350233,256.321194,432.044783,11.039070,69.845400,3.298875,1
2884,5.126763,230.603758,11983.86938,6.303357,256.321194,402.883113,11.168946,77.488213,4.708658,1


# Exploring the Dataset

In [4]:
print(raw_data.shape)

(2886, 10)


In [5]:
#the first 5 raws of data frame
raw_data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,5.971861,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.05786,6.635246,252.848888,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,252.848888,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [6]:
# data types of each column
raw_data.dtypes

ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object

In [7]:
#checking whether there are null values
raw_data.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

# Data Transformation

## 01). Split of Input(x) and Output(y) 

In [8]:
#converting dataframe into numpy array
raw_data=raw_data.to_numpy()

In [9]:
x,y=raw_data[:,:-1],raw_data[:,-1]

In [10]:
print(x)
print(x.shape)

[[5.97186050e+00 2.04890456e+02 2.07913190e+04 ... 1.03797831e+01
  8.69909705e+01 2.96313538e+00]
 [3.71608007e+00 1.29422921e+02 1.86300579e+04 ... 1.51800131e+01
  5.63290763e+01 4.50065627e+00]
 [8.09912419e+00 2.24236259e+02 1.99095417e+04 ... 1.68686369e+01
  6.64200925e+01 3.05593375e+00]
 ...
 [9.41951032e+00 1.75762646e+02 3.31555782e+04 ... 1.10390697e+01
  6.98454003e+01 3.29887550e+00]
 [5.12676292e+00 2.30603758e+02 1.19838694e+04 ... 1.11689462e+01
  7.74882131e+01 4.70865847e+00]
 [7.87467136e+00 1.95102299e+02 1.74041771e+04 ... 1.61403676e+01
  7.86984463e+01 2.30914906e+00]]
(2886, 9)


In [11]:
print(y)
print(y.shape)

[0. 0. 0. ... 1. 1. 1.]
(2886,)


## 02). Train Set and Test Set Splitting

In [12]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30)

In [13]:
print(x_test.shape)
print(x_train.shape)
x_train

(866, 9)
(2020, 9)


array([[5.74406754e+00, 1.73850882e+02, 2.83919982e+04, ...,
        1.68951797e+01, 6.54473615e+01, 3.17165521e+00],
       [8.96724411e+00, 2.66478743e+02, 3.44467823e+04, ...,
        1.42633964e+01, 6.49596701e+01, 3.74225594e+00],
       [7.07836107e+00, 1.98412447e+02, 1.66486006e+04, ...,
        1.72496164e+01, 5.96308939e+01, 3.26346473e+00],
       ...,
       [5.57320116e+00, 2.29174986e+02, 1.56131605e+04, ...,
        1.07519037e+01, 6.37585161e+01, 3.59962466e+00],
       [9.14655756e+00, 2.26093425e+02, 6.78748466e+03, ...,
        1.41104762e+01, 8.05711462e+01, 3.21088194e+00],
       [6.53209727e+00, 1.63396241e+02, 8.24081271e+03, ...,
        1.78961280e+01, 5.50224072e+01, 4.59932404e+00]])

In [14]:
x_test

array([[1.02893796e+01, 1.62780252e+02, 1.88692121e+04, ...,
        1.88571411e+01, 7.42937787e+01, 3.50016698e+00],
       [7.70036230e+00, 2.20571796e+02, 2.94322082e+04, ...,
        1.30274393e+01, 7.78298016e+01, 3.78648699e+00],
       [6.09408091e+00, 2.35240587e+02, 1.04437965e+04, ...,
        8.28655037e+00, 8.05403515e+01, 3.90050852e+00],
       ...,
       [5.97186050e+00, 1.72111514e+02, 2.65953735e+04, ...,
        7.87773851e+00, 9.33028160e+01, 3.55031114e+00],
       [4.66349219e+00, 1.70951830e+02, 4.14526256e+04, ...,
        1.62740510e+01, 6.06187542e+01, 3.74229345e+00],
       [7.49384386e+00, 1.97331291e+02, 2.66783929e+04, ...,
        1.31349166e+01, 6.95908059e+01, 3.81994656e+00]])

In [15]:
print(y_train.shape)
print(y_test.shape)


(2020,)
(866,)


## 03). Normalization (Min-Max Scalling)

In [16]:
# rescale all values between 0-1
scaler=MinMaxScaler(feature_range=(0,1))
x_train_scaled=scaler.fit_transform(x_train)
print(x_train_scaled.shape)
print(x_train)
print(x_train_scaled)

(2020, 9)
[[5.74406754e+00 1.73850882e+02 2.83919982e+04 ... 1.68951797e+01
  6.54473615e+01 3.17165521e+00]
 [8.96724411e+00 2.66478743e+02 3.44467823e+04 ... 1.42633964e+01
  6.49596701e+01 3.74225594e+00]
 [7.07836107e+00 1.98412447e+02 1.66486006e+04 ... 1.72496164e+01
  5.96308939e+01 3.26346473e+00]
 ...
 [5.57320116e+00 2.29174986e+02 1.56131605e+04 ... 1.07519037e+01
  6.37585161e+01 3.59962466e+00]
 [9.14655756e+00 2.26093425e+02 6.78748466e+03 ... 1.41104762e+01
  8.05711462e+01 3.21088194e+00]
 [6.53209727e+00 1.63396241e+02 8.24081271e+03 ... 1.78961280e+01
  5.50224072e+01 4.59932404e+00]]
[[0.33576036 0.33953432 0.64158154 ... 0.64538835 0.48210075 0.30989665]
 [0.78547678 0.96213777 0.78200749 ... 0.4958458  0.47587814 0.44601367]
 [0.52192878 0.50462628 0.36922208 ... 0.66552806 0.40788655 0.33179784]
 ...
 [0.31192008 0.71139845 0.34520757 ... 0.29631662 0.46055222 0.41198892]
 [0.81049564 0.69068555 0.14051755 ... 0.48715661 0.67507004 0.3192542 ]
 [0.44571087 0.26926

In [17]:
x_test_scaled=scaler.fit_transform(x_test)
print(x_test_scaled.shape)
print(x_test)
print(x_test_scaled)

(866, 9)
[[1.02893796e+01 1.62780252e+02 1.88692121e+04 ... 1.88571411e+01
  7.42937787e+01 3.50016698e+00]
 [7.70036230e+00 2.20571796e+02 2.94322082e+04 ... 1.30274393e+01
  7.78298016e+01 3.78648699e+00]
 [6.09408091e+00 2.35240587e+02 1.04437965e+04 ... 8.28655037e+00
  8.05403515e+01 3.90050852e+00]
 ...
 [5.97186050e+00 1.72111514e+02 2.65953735e+04 ... 7.87773851e+00
  9.33028160e+01 3.55031114e+00]
 [4.66349219e+00 1.70951830e+02 4.14526256e+04 ... 1.62740510e+01
  6.06187542e+01 3.74229345e+00]
 [7.49384386e+00 1.97331291e+02 2.66783929e+04 ... 1.31349166e+01
  6.95908059e+01 3.81994656e+00]]
[[0.9676133  0.26944485 0.42505115 ... 0.75301241 0.60038232 0.38107225]
 [0.61114332 0.65642147 0.6671122  ... 0.4240649  0.64580849 0.44924451]
 [0.38998177 0.75464482 0.23197478 ... 0.1565549  0.68063006 0.47639283]
 ...
 [0.3731538  0.3319277  0.60210345 ... 0.13348722 0.8445854  0.39301148]
 [0.19301053 0.32416236 0.94257143 ... 0.60725863 0.42470362 0.43872212]
 [0.58270874 0.500801

# The Model

## 01). Import Relevant Packages, Classes and Functions

In [18]:
from sklearn.linear_model import LogisticRegression

## 02). Create the Model

In [19]:
model=LogisticRegression()

## 03). Train the Model

In [22]:
model.fit(x_train_scaled,y_train)

LogisticRegression()

# Evaluate the Model

In [23]:
test_accuracy=model.score(x_test_scaled,y_test)

print(f"Test Accuarcy: {test_accuracy}")

Test Accuarcy: 0.6154734411085451
