In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
dataset = pd.read_csv("water_potability.csv")

In [4]:
dataset.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [5]:
X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,-1].values

In [6]:
X

array([[           nan, 2.04890455e+02, 2.07913190e+04, ...,
        1.03797831e+01, 8.69909705e+01, 2.96313538e+00],
       [3.71608008e+00, 1.29422921e+02, 1.86300579e+04, ...,
        1.51800131e+01, 5.63290763e+01, 4.50065627e+00],
       [8.09912419e+00, 2.24236259e+02, 1.99095417e+04, ...,
        1.68686369e+01, 6.64200925e+01, 3.05593375e+00],
       ...,
       [9.41951032e+00, 1.75762646e+02, 3.31555782e+04, ...,
        1.10390697e+01, 6.98454003e+01, 3.29887550e+00],
       [5.12676292e+00, 2.30603758e+02, 1.19838694e+04, ...,
        1.11689462e+01, 7.74882131e+01, 4.70865847e+00],
       [7.87467136e+00, 1.95102299e+02, 1.74041771e+04, ...,
        1.61403676e+01, 7.86984463e+01, 2.30914906e+00]])

In [7]:
Y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [9]:
mv_columns = dataset.columns[dataset.isnull().any()].tolist()
mv_columns

['ph', 'Sulfate', 'Trihalomethanes']

In [15]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
imputer.fit(X[:,:])
X[:,:] = imputer.transform(X[:,:])


In [40]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X ,Y,test_size=0.35,random_state=0)

In [41]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [42]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5,algorithm='ball_tree')
classifier.fit(X_train,Y_train)

In [43]:
Y_pred = classifier.predict(X_test)

In [44]:
from sklearn.metrics import confusion_matrix , accuracy_score
cm = confusion_matrix(Y_test,Y_pred)
print(cm)
accuracy_score(Y_test,Y_pred)

[[540 179]
 [261 167]]


0.6163905841325196

In [47]:
from sklearn.model_selection import  cross_val_score

scores = cross_val_score(classifier,X_test,Y_test,cv=10)
print("Accuracy: {:.2f} %".format(scores.mean()*100))
print("Standard Deviation: {:.2f} %".format(scores.std()*100))

Accuracy: 64.00 %
Standard Deviation: 2.75 %


In [54]:
from sklearn.model_selection import GridSearchCV
parameters = [{
    'n_neighbors': [ 5,8, 10,13,17 ,19,20,],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [5,8,20,25,30, 40]
}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, Y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 63.36 %
Best Parameters: {'algorithm': 'ball_tree', 'leaf_size': 5, 'n_neighbors': 20, 'weights': 'distance'}
