# Example of classification on water dataset

### Importing dependencies and loading dataset

In [24]:
# Write your code here
import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [25]:
path = "water_potability.csv"

data = pd.read_csv(path)
print(data[:][0:10])

          ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0        NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1   3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2   8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
3   8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4   9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   
5   5.584087  188.313324  28748.687739     7.544869  326.678363    280.467916   
6  10.223862  248.071735  28749.716544     7.513408  393.663396    283.651634   
7   8.635849  203.361523  13672.091764     4.563009  303.309771    474.607645   
8        NaN  118.988579  14285.583854     7.804174  268.646941    389.375566   
9  11.180284  227.231469  25484.508491     9.077200  404.041635    563.885481   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135   

In [26]:
#Handle NaN values and split into inputs and outputs
inputs, outputs = data.iloc[:, 0:9], data.iloc[:, 9]
inputs = inputs.fillna(inputs.mean())
data = data.fillna(data.mean())

In [27]:
# Normalize
mu = inputs.mean()
sigma = inputs.std()
inputs = (inputs - mu)/sigma

print("Mean and std dev after normalization:")
print("Mean: \n", np.mean(inputs))
print("\nStd dev: \n", np.std(inputs))

inputs = inputs.to_numpy()
outputs = outputs.to_numpy()

Mean and std dev after normalization:
Mean: 
 ph                -4.674052e-16
Hardness           1.025906e-15
Solids            -4.554761e-17
Chloramines        5.205441e-16
Sulfate            1.591997e-15
Conductivity       1.756836e-16
Organic_carbon     1.735147e-17
Trihalomethanes    1.713458e-16
Turbidity          5.552471e-16
dtype: float64

Std dev: 
 ph                 0.999847
Hardness           0.999847
Solids             0.999847
Chloramines        0.999847
Sulfate            0.999847
Conductivity       0.999847
Organic_carbon     0.999847
Trihalomethanes    0.999847
Turbidity          0.999847
dtype: float64


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [28]:
from sklearn.model_selection import train_test_split

# split data with train_test_split from sklearn (provides data shuffling which is very useful)
X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.25, random_state=3, shuffle=True)

train_size = X_train.shape[0]
test_size = X_test.shape[0]

### Benchmark given by gaussian bayes naive classifier

In [29]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

# fit() returns the object itself, so I can immediately call predict() on the test set
y_pred = gnb.fit(X_train, y_train).predict(X_test)

n_test = X_test.shape[0]
n_success = (y_test == y_pred).sum()
print(f"Number of correctly labeled points out of a total {n_test} points : {n_success}")
print(f"Test success rate: {n_success/n_test*100}%")

Number of correctly labeled points out of a total 819 points : 509
Test success rate: 62.14896214896215%


### Experiments with convolutional layers in tensorflow

In [52]:
a = np.array([[[0,1,2,3,4,5,6,5,4,3,2,1,0]]], dtype='float').reshape(1,13,1)
input_sh = a.shape
print("input shape: ", input_sh)

""" First number is number of output channels, second number is kernel size
IMPORTANT: the input convolutional layer of a NN  in tensorflow expects at least a 3rd order tensor where:
 - First dimension is batch size
 - Second dimension is the dimension that is convoluted
 - Third dimension is number of input channels
 
IMPORTANT2: the input convolutional layer of a NN takes an input_shape argument that provides the input size:
input_shape=(dimension that is convoluted, number of input channels)
"""
y = tf.keras.layers.Conv1D(1, 3, input_shape=input_sh[1:])(a)
print(y)

input shape:  (1, 13, 1)
tf.Tensor(
[[[-1.3594038]
  [-2.6634808]
  [-3.9675577]
  [-5.2716346]
  [-6.5757113]
  [-6.495521 ]
  [-6.4650574]
  [-5.1609807]
  [-3.8569038]
  [-2.552827 ]
  [-1.24875  ]]], shape=(1, 11, 1), dtype=float32)
