In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import iplot
from scipy.stats import ttest_ind
from statistics import mean,variance
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the water-potability CSV from the Kaggle input directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/water-potability/water_potability.csv')
data.head(n=5)

In [None]:
# Number of missing values in each column.
data.isna().sum(axis=0)

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')

In [None]:
# Sanity check: every per-column count should now be zero.
data.isna().sum(axis=0)

In [None]:
# Count the rows of each class once and take BOTH labels and values from that
# single result. unique() returns classes in order of first appearance while
# value_counts() sorts by frequency, so mixing the two can mislabel the slices.
potability_counts = data["Potability"].value_counts()
labels = potability_counts.index.tolist()
values = potability_counts.tolist()

dades = [go.Pie(labels=labels, values=values, textinfo='label+percent', hole=0.3, marker_colors=['blue', 'purple'])]

figure = go.Figure(dades)
figure.update_layout(title="Potability proportion", width=900, height=400)

iplot(figure)

In [None]:
# Split the rows by target value: 1 -> potability_group, 0 -> non_potability_group.
non_potability_group = data.loc[data["Potability"] == 0]
potability_group = data.loc[data["Potability"] == 1]

In [None]:
# Overlay the histogram + KDE of "ph" for the potable and non-potable samples.
sns.distplot(potability_group["ph"], hist=True, kde=True,
             bins=int(180/5), color='darkblue',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="ph potability group")

sns.distplot(non_potability_group["ph"], hist=True, kde=True,
             bins=int(180/5), color='orange',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="ph non_potability group")

plt.legend(prop={'size': 8}, title='group')
plt.title('pH distribution for potability and non-potability group')
# The x-axis carries the plotted column, not a delay in minutes.
plt.xlabel('pH')
plt.ylabel('Density')

In [None]:
# Overlay the histogram + KDE of "Hardness" for the potable and non-potable samples.
sns.distplot(potability_group["Hardness"], hist=True, kde=True,
             bins=int(180/5), color='darkblue',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Hardness potability group")

sns.distplot(non_potability_group["Hardness"], hist=True, kde=True,
             bins=int(180/5), color='orange',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Hardness non_potability group")

plt.legend(prop={'size': 8}, title='group')
plt.title('Hardness distribution for potability and non-potability group')
# The x-axis carries the plotted column, not a delay in minutes.
plt.xlabel('Hardness')
plt.ylabel('Density')


In [None]:
# Overlay the histogram + KDE of "Solids" for the potable and non-potable samples.
sns.distplot(potability_group["Solids"], hist=True, kde=True,
             bins=int(180/5), color='darkblue',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Solids potability group")

sns.distplot(non_potability_group["Solids"], hist=True, kde=True,
             bins=int(180/5), color='orange',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Solids non_potability group")

plt.legend(prop={'size': 8}, title='group')
plt.title('Solids distribution for potability and non-potability group')
# The x-axis carries the plotted column, not a delay in minutes.
plt.xlabel('Solids')
plt.ylabel('Density')


In [None]:
# Overlay the histogram + KDE of "Chloramines" for the potable and non-potable samples.
sns.distplot(potability_group["Chloramines"], hist=True, kde=True,
             bins=int(180/5), color='darkblue',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Chloramines potability group")

sns.distplot(non_potability_group["Chloramines"], hist=True, kde=True,
             bins=int(180/5), color='orange',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Chloramines non_potability group")

plt.legend(prop={'size': 8}, title='group')
plt.title('Chloramines distribution for potability and non-potability group')
# The x-axis carries the plotted column, not a delay in minutes.
plt.xlabel('Chloramines')
plt.ylabel('Density')

In [None]:
# Overlay the histogram + KDE of "Sulfate" for the potable and non-potable samples.
sns.distplot(potability_group["Sulfate"], hist=True, kde=True,
             bins=int(180/5), color='darkblue',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Sulfate potability group")

sns.distplot(non_potability_group["Sulfate"], hist=True, kde=True,
             bins=int(180/5), color='orange',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Sulfate non_potability group")

plt.legend(prop={'size': 8}, title='group')
plt.title('Sulfate distribution for potability and non-potability group')
# The x-axis carries the plotted column, not a delay in minutes.
plt.xlabel('Sulfate')
plt.ylabel('Density')

In [None]:
# Overlay the histogram + KDE of "Conductivity" for the potable and non-potable samples.
sns.distplot(potability_group["Conductivity"], hist=True, kde=True,
             bins=int(180/5), color='darkblue',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Conductivity potability group")

sns.distplot(non_potability_group["Conductivity"], hist=True, kde=True,
             bins=int(180/5), color='orange',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Conductivity non_potability group")

plt.legend(prop={'size': 8}, title='group')
plt.title('Conductivity distribution for potability and non-potability group')
# The x-axis carries the plotted column, not a delay in minutes.
plt.xlabel('Conductivity')
plt.ylabel('Density')

In [None]:
# Overlay the histogram + KDE of "Organic_carbon" for the potable and non-potable samples.
sns.distplot(potability_group["Organic_carbon"], hist=True, kde=True,
             bins=int(180/5), color='darkblue',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Organic_carbon potability group")

sns.distplot(non_potability_group["Organic_carbon"], hist=True, kde=True,
             bins=int(180/5), color='orange',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Organic_carbon non_potability group")

plt.legend(prop={'size': 7}, title='group')
plt.title('Organic_carbon distribution for potability and non-potability group')
# The x-axis carries the plotted column, not a delay in minutes.
plt.xlabel('Organic_carbon')
plt.ylabel('Density')


In [None]:
# Overlay the histogram + KDE of "Trihalomethanes" for the potable and non-potable samples.
sns.distplot(potability_group["Trihalomethanes"], hist=True, kde=True,
             bins=int(180/5), color='darkblue',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Trihalomethanes potability group")

sns.distplot(non_potability_group["Trihalomethanes"], hist=True, kde=True,
             bins=int(180/5), color='orange',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Trihalomethanes non_potability group")

plt.legend(prop={'size': 7}, title='group')
plt.title('Trihalomethanes distribution for potability and non-potability group')
# The x-axis carries the plotted column, not a delay in minutes.
plt.xlabel('Trihalomethanes')
plt.ylabel('Density')


In [None]:
# Overlay the histogram + KDE of "Turbidity" for the potable and non-potable samples.
sns.distplot(potability_group["Turbidity"], hist=True, kde=True,
             bins=int(180/5), color='darkblue',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Turbidity potability group")

sns.distplot(non_potability_group["Turbidity"], hist=True, kde=True,
             bins=int(180/5), color='orange',
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label="Turbidity non_potability group")

plt.legend(prop={'size': 7}, title='group')
plt.title('Turbidity distribution for potability and non-potability group')
# The x-axis carries the plotted column, not a delay in minutes.
plt.xlabel('Turbidity')
plt.ylabel('Density')



In [None]:
# Open a large canvas, then draw the annotated pairwise-correlation matrix on it.
plt.figure(figsize=(35, 35))

correlation = data.corr()
ax = sns.heatmap(data=correlation, annot=True, linewidths=.5)


In [None]:
# Correlation of every column with the target, listed in ascending order.
data.corr().loc[:, 'Potability'].sort_values(ascending=True)

In [None]:
# Names of the nine feature columns (kept for reference; the cells visible
# here select the features positionally instead).
var = ["Organic_carbon","Conductivity","Sulfate","Hardness","Trihalomethanes","ph","Chloramines",
"Turbidity","Solids"]

# Features are every column but the last; the last column is the target.
X = data.iloc[:, 0:-1]
Y = data.iloc[:, -1]

# A fixed seed keeps the train/test partition - and therefore every accuracy
# reported below - identical across Restart & Run All.
X_train,X_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)

# Two-feature subset. It is standardized with its own scaler (fitted on the
# training rows only) so that the scale-sensitive SVC/KNN models trained on it
# see inputs on the same footing as the full feature matrix.
scaler_2v = StandardScaler()
X_train_2v = scaler_2v.fit_transform(X_train[["Sulfate", "Chloramines"]])
X_test_2v = scaler_2v.transform(X_test[["Sulfate", "Chloramines"]])

# Standardize the full feature matrix; statistics come from the training rows
# only and are then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Logistic regression on the standardized feature matrix; report test accuracy.
model = LogisticRegression().fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# Support vector classifier with a linear kernel; report test accuracy.
model = SVC(kernel='linear').fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# Support vector classifier with an RBF kernel; report test accuracy.
model = SVC(kernel='rbf').fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# Support vector classifier with a sigmoid kernel; report test accuracy.
model = SVC(kernel='sigmoid').fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# Support vector classifier with a degree-1 polynomial kernel; report test accuracy.
model = SVC(kernel='poly', degree=1).fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# Support vector classifier with a degree-2 polynomial kernel; report test accuracy.
model = SVC(kernel='poly', degree=2).fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# k-nearest neighbours with k = 1; report test accuracy.
model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# k-nearest neighbours with k = 3; report test accuracy.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# k-nearest neighbours with k = 5; report test accuracy.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# k-nearest neighbours with k = 7; report test accuracy.
model = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# k-nearest neighbours with k = 10; report test accuracy.
model = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# k-nearest neighbours with k = 12; report test accuracy.
model = KNeighborsClassifier(n_neighbors=12).fit(X_train, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(acc)

In [None]:
# Random forest (Gini impurity, trees capped at depth 12).
# random_state pins the bootstrap samples and feature draws so the reported
# accuracy is the same on every run.
model = RandomForestClassifier(max_depth=12,criterion = "gini", random_state=42)
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

In [None]:
# Random forest (entropy criterion, trees capped at depth 12).
# random_state pins the bootstrap samples and feature draws so the reported
# accuracy is the same on every run.
model = RandomForestClassifier(max_depth=12,criterion = "entropy", random_state=42)
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree (Gini impurity, capped at depth 12).
# random_state makes tie-breaking between equally good splits deterministic,
# so the reported accuracy is the same on every run.
model = DecisionTreeClassifier( max_depth=12, criterion = "gini", random_state=42)
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

In [None]:
# Single decision tree (entropy criterion, capped at depth 12).
# random_state makes tie-breaking between equally good splits deterministic,
# so the reported accuracy is the same on every run.
model = DecisionTreeClassifier( max_depth=12, criterion = "entropy", random_state=42)
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

In [None]:
# From here on only two variables are used: Sulfate and Chloramines.
# Degree-2 polynomial SVC on that two-feature subset; report test accuracy.
model = SVC(kernel='poly', degree=2).fit(X_train_2v, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test_2v))
print(acc)

In [None]:
# Logistic regression on the two-feature subset; report test accuracy.
model = LogisticRegression().fit(X_train_2v, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test_2v))
print(acc)

In [None]:
# Linear-kernel SVC on the two-feature subset; report test accuracy.
model = SVC(kernel='linear').fit(X_train_2v, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test_2v))
print(acc)

In [None]:
# Degree-3 polynomial SVC on the two-feature subset; report test accuracy.
model = SVC(kernel='poly', degree=3).fit(X_train_2v, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test_2v))
print(acc)

In [None]:
# k-nearest neighbours (k = 3) on the two-feature subset; report test accuracy.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train_2v, y_train)
acc = accuracy_score(y_true=y_test, y_pred=model.predict(X_test_2v))
print(acc)

### Neural network with PyTorch

In [None]:
# use gpu if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using: ",device)

In [None]:
batch_size = 64

# Turn the target Series into single-column DataFrames so that `.values` is a
# 2-D (n, 1) array, matching the (batch, 1) output of the network.
# The isinstance guard keeps this cell re-runnable: a DataFrame has no
# `to_frame`, so an unguarded second execution would raise AttributeError.
if isinstance(y_train, pd.Series):
    y_train = y_train.to_frame()
if isinstance(y_test, pd.Series):
    y_test = y_test.to_frame()

In [None]:
class trainData(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        X_data: indexable container of feature rows.
        y_data: indexable container of labels, aligned with ``X_data``.
    """

    def __init__(self, X_data, y_data):
        # Store references only; individual rows are fetched on demand.
        self.X_data, self.y_data = X_data, y_data

    def __len__ (self):
        """Number of samples, taken from the feature container."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.X_data[index]
        target = self.y_data[index]
        return features, target


# Wrap the scaled training features and the label column as float tensors.
train_data = trainData(X_data=torch.FloatTensor(X_train),
                       y_data=torch.FloatTensor(y_train.values))
## test data    
class testData(Dataset):
    """Map-style dataset holding feature rows only (no labels).

    Args:
        X_data: indexable container of feature rows.
    """

    def __init__(self, X_data):
        self.X_data = X_data

    def __len__ (self):
        """Number of stored feature rows."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        row = self.X_data[index]
        return row
    

# Wrap the scaled test features as a float tensor.
test_data = testData(X_data=torch.FloatTensor(X_test))

In [None]:
# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# One sample per batch at test time; the prediction cell squeezes each batch
# down to a single value.
test_loader = DataLoader(test_data, batch_size=1)

In [None]:
class SimpleMLP(nn.Module):
    """Fully connected network with three hidden layers that emits raw logits.

    Layout: fc1 -> ReLU -> fc2 -> tanh -> fc3 -> tanh -> fc4 (no activation).

    Args:
        inp_dim: number of input features.
        layer1_dim: width of the first hidden layer.
        layer2_dim: width of the second hidden layer.
        layer3_dim: width of the third hidden layer.
        output_dim: number of output units.
    """

    def __init__(self,inp_dim,layer1_dim,layer2_dim,layer3_dim,output_dim):
        super().__init__()
        self.fc1 = nn.Linear(inp_dim, layer1_dim)
        self.fc2 = nn.Linear(layer1_dim,layer2_dim)
        self.fc3 = nn.Linear(layer2_dim, layer3_dim)
        self.fc4 = nn.Linear(layer3_dim, output_dim)
        # Xavier-uniform weights and zero biases for every layer, in layer order.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return unbounded logits for ``x``.

        The last layer is deliberately left without an activation: the
        notebook trains with ``BCEWithLogitsLoss`` and applies
        ``torch.sigmoid`` at prediction time. Squashing the output through
        ``tanh`` first would confine the logits to [-1, 1] and the predicted
        probabilities to roughly [0.27, 0.73].
        """
        out = torch.relu(self.fc1(x))
        out = torch.tanh(self.fc2(out))
        out = torch.tanh(self.fc3(out))
        return self.fc4(out)

In [None]:
input_size = 9 #9 entry variables
layer1_dim = 3
layer2_dim = 2
layer3_dim = 2
output_size = 1
# Move the network to the selected device. The training and evaluation loops
# send every batch to `device`, so leaving the model on the CPU raises a
# device-mismatch error as soon as CUDA is available.
model = SimpleMLP(input_size,layer1_dim,layer2_dim,layer3_dim, output_size).to(device)
print(model)

In [None]:
def binary_acc(y_pred, y_test):
    """Percentage of predictions that match the labels, rounded to a whole number.

    Args:
        y_pred: tensor of raw model outputs; passed through a sigmoid and
            rounded to obtain hard predictions.
        y_test: tensor of reference labels compared element-wise with them.

    Returns:
        A scalar tensor holding the rounded accuracy in percent.
    """
    y_pred_tag = torch.round(torch.sigmoid(y_pred))
    n_correct = (y_pred_tag == y_test).sum().float()
    return torch.round(n_correct / y_test.shape[0] * 100)


In [None]:
# Optimisation settings for the MLP (an earlier trial used learning_rate = 0.000015).
epochs = 2000
learning_rate = 0.0001
# Binary cross-entropy computed on the model's raw outputs.
criterion = nn.BCEWithLogitsLoss()
# Adam is the active optimizer; SGD with momentum 0.9 was the alternative considered.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)


In [None]:
# Print metrics once every LOG_EVERY epochs (plus the first and the last one)
# rather than one line for each of the `epochs` iterations.
LOG_EVERY = 100

model.train()
for e in range(1, epochs+1):
    # Running sums of the per-batch loss and per-batch accuracy for this epoch.
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        acc = binary_acc(y_pred, y_batch)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    # Both figures are averages over the number of batches in the loader.
    if e == 1 or e % LOG_EVERY == 0 or e == epochs:
        print(f'Epoch {e:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
def binary_acc(y_pred, y_test):
    """Return the share of correct hard predictions as a whole-number percentage.

    NOTE: this re-declares ``binary_acc`` with the same logic as the earlier
    definition in this notebook; the later definition is the one in effect
    from this cell onwards.

    Args:
        y_pred: tensor of raw model outputs (sigmoid + rounding is applied here).
        y_test: tensor of reference labels.

    Returns:
        A scalar tensor with the rounded accuracy in percent.
    """
    hard_predictions = torch.round(torch.sigmoid(y_pred))
    matches = (hard_predictions == y_test)
    fraction_correct = matches.sum().float() / y_test.shape[0]
    return torch.round(fraction_correct * 100)

In [None]:
# Run the network over the test loader with gradients disabled and collect one
# rounded (0/1) prediction array per batch.
model.eval()
y_pred_list = []
with torch.no_grad():
    for X_batch in test_loader:
        probabilities = torch.sigmoid(model(X_batch.to(device)))
        y_pred_list.append(torch.round(probabilities).cpu().numpy())

# Collapse each per-batch array to its plain Python value.
y_pred_list = [batch.squeeze().tolist() for batch in y_pred_list]

In [None]:
# Fraction of test rows whose "Potability" label equals the network's prediction.
size = len(y_test)
true_labels = y_test["Potability"].tolist()
s = sum(1 for truth, predicted in zip(true_labels, y_pred_list) if truth == predicted)

test_accuracy = s/size
print(test_accuracy)