# First we need to import some essential libraries.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Features Description:

# ph: pH of water (0 to 14).
# Hardness: Capacity of water to precipitate soap in mg/L.
# Solids: Total dissolved solids in ppm.
# Chloramines: Amount of Chloramines in ppm.
# Sulfate: Amount of Sulfates dissolved in mg/L.
# Conductivity: Electrical conductivity of water in μS/cm.
# Organic_carbon: Amount of organic carbon in ppm.
# Trihalomethanes: Amount of Trihalomethanes in μg/L.
# Turbidity: Measure of the light-emitting property of water in NTU.
# *Potability*: Indicates if water is safe for human consumption. Potable - 1 and Not potable - 0

# 1.Get the Data

In [None]:
# Load the dataset into a DataFrame; the path is relative to the current
# working directory.
d=pd.read_csv('water_potability.csv')

# 2.Take a Look at the Data

In [None]:
# Preview the first rows (head() defaults to 5) as a quick sanity check.
d.head()

# Print information about the DataFrame: the number of columns and their labels, the column data types, and the number of non-null cells in each column:

In [None]:
# Column labels, dtypes and non-null counts for every column.
d.info()

# Get a descriptive statistics summary of our dataframe:

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
d.describe()

# Number of missing values in each column:

In [None]:
# isnull() flags missing cells; summing the booleans counts them per column.
d.isnull().sum()

We can see that there are some missing values, so we must handle them.

# Checking the number of duplicated rows, so that any duplicates can be dropped:

In [None]:
# duplicated() flags rows that repeat an earlier row; the sum is how many exist.
d.duplicated().sum()

# Checking the shape of the data:

In [None]:
# (number of rows, number of columns)
d.shape

# Missing Values:

In [None]:
# Visualise where the missing cells sit: one heatmap row per sample and one
# column per feature, coloured by whether the value is missing.
missing_mask = d.isnull()
plt.figure(figsize=(12, 12))
sns.heatmap(data=missing_mask, cbar=False, yticklabels=False)
plt.show()

In [None]:
# Bar chart of how many samples fall into each Potability class.
sns.countplot(
    data=d,
    x="Potability",
    palette="Paired",
)
plt.show()

In [None]:
# Class balance as fractions: normalize=True returns proportions, not counts.
d['Potability'].value_counts(normalize=True)

In [None]:
#To identify outlier data points, we used the box plot.
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the theme once, before any figure is created. It is global state, so
# repeating the call on every iteration added nothing, and calling it only
# after the first plt.figure() meant that figure was created before the theme
# was in effect.
sns.set_theme(style="whitegrid")

# One horizontal box plot per column, to spot outliers. This includes the
# binary Potability target, exactly as before.
columns = d.columns
for column_name in columns:
    plt.figure(figsize=(12, 4))
    # The result of set_title() is the title Text object, not an Axes, so it is
    # no longer bound to a misleading (and unused) `ax` variable.
    sns.boxplot(x=d[column_name]).set_title(column_name)

In our dataset the outliers are not abnormal values, so we did not make any changes to them.

# We used corr() to find the pairwise correlation of all columns in our dataframe.

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
# This runs before the missing values are imputed further down.
cor = d.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(data=cor, annot=True)
plt.show()

# 3. Handling Missing Features

# Filling the missing data in each column with its mean:

In [None]:
# Fill the gaps in each incomplete column with that column's own mean
# (mean() skips missing values by default).
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on d[col]. That form is chained assignment on a
# column selection: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it modifies a temporary copy and leaves `d` unchanged.
for column_name in ('Sulfate', 'ph', 'Trihalomethanes'):
    d[column_name] = d[column_name].fillna(d[column_name].mean())

Now there is no missing value in our dataset.

In [None]:
# Re-count missing cells per column to confirm the imputation above left none.
d.isnull().sum()

The correlation matrix (corr()) after handling the missing values:

In [None]:
# Redraw the annotated correlation heatmap from the current state of `d`.
correlations = d.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(data=correlations, annot=True)
plt.show()

### StandardScaler :is useful for the features that follow a Normal distribution.
### MinMaxScaler : may be used when the upper and lower boundaries are well known from domain knowledge .

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# One 50-bin histogram per feature (the Potability target is dropped first),
# to eyeball each distribution before choosing a scaler.
d.drop('Potability', axis=1).hist(bins=50, figsize=(20,15))
plt.show()

# Dropping the target label:

In [None]:
# Separate the target column from the feature columns.
y = d["Potability"]
x = d.drop(columns="Potability")

# Applying the standard scaler to the numerical data (the data is approximately normally distributed):

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder ,MinMaxScaler
# Standardise the features with a one-step pipeline. StandardScaler is the only
# step: no one-hot encoding is applied in this pipeline.
# NOTE(review): the scaler is fitted on every row of `x`, i.e. before the
# train/test split further down, so test rows contribute to the mean and
# variance used for scaling. Consider fitting on the training split only.
scaling_steps = [("scaler", StandardScaler())]
numeric_transformer = Pipeline(steps=scaling_steps)
a = numeric_transformer.fit_transform(x)

In [None]:
# Wrap the scaled values in a DataFrame again, reusing the original feature
# names as column labels, then preview the result.
feature_names = x.columns
d_x = pd.DataFrame(data=a, columns=feature_names)
d_x.head()

# 4.Create Test Set and Train Set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Rebuild the unscaled feature frame and the target from `d`.
# The split below is given the scaled frame d_x rather than `x`.
y = d["Potability"]
x = d.drop(columns="Potability")

In [None]:
# Hold out 30% of the scaled samples for testing. random_state fixes the
# shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    d_x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.metrics import accuracy_score

def evaluation(x, y, model):
    """Score a fitted classifier on one dataset and print its accuracy.

    Parameters
    ----------
    x
        Features to predict on, in whatever form ``model.predict`` accepts.
    y
        True labels for ``x``.
    model
        Fitted classifier exposing ``predict``. ``predict_proba`` is optional.

    Returns
    -------
    tuple
        ``(accuracy, prob, y_hat)``: the accuracy score, the output of
        ``model.predict_proba(x)`` (``None`` when the model does not provide
        that method), and the predicted labels.
    """
    y_hat = model.predict(x)
    # Some classifiers cannot produce probabilities (for example SVC unless it
    # is created with probability=True). Return None for those instead of
    # raising AttributeError, so the helper works for every model.
    prob = model.predict_proba(x) if hasattr(model, "predict_proba") else None
    accuracy = accuracy_score(y, y_hat)
    print(f"Accuracy: {accuracy}")
    return accuracy, prob, y_hat

# Now we are ready to train a model and predict the required solution.

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so creation and training are chained.
logR = LogisticRegression().fit(x_train, y_train)

# evaluation() prints the accuracy and returns
# (accuracy, probabilities, predictions) for the given split.
print("Training Accurcay: ")
accuracyTr, probTr, predictTr = evaluation(x=x_train, y=y_train, model=logR)
print("Testing Accurcay: ")
accuracyTs, probTs, predictTs = evaluation(x=x_test, y=y_test, model=logR)


In [None]:
# confusion_matrix was used without ever being imported, which raised a
# NameError; import it here next to cross_val_predict.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions on the training set (3-fold cross-validation): each
# sample is predicted by a model that did not see it during fitting.
y_train_pred = cross_val_predict(logR, x_train, y_train, cv=3)
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_train, y_train_pred)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier using 30 neighbours. fit() returns the
# estimator, so it is trained in the same statement.
KNC = KNeighborsClassifier(n_neighbors=30).fit(x_train, y_train)

# score() reports mean accuracy on the data it is given.
print("Training Accurcay: ", KNC.score(X=x_train, y=y_train))
print("Testing Accurcay: ", KNC.score(X=x_test, y=y_test))


In [None]:
# Out-of-fold predictions for the k-NN model (3-fold CV on the training set),
# summarised as a confusion matrix of true versus predicted classes.
y_train_pred = cross_val_predict(
    estimator=KNC,
    X=x_train,
    y=y_train,
    cv=3,
)
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

In [None]:
from sklearn.svm import SVC
# Support vector classifier with scikit-learn's default hyperparameters,
# trained in the same statement (fit() returns the estimator).
clf = SVC().fit(x_train, y_train)

# score() reports mean accuracy on the data it is given.
print("Training Accurcay: ", clf.score(X=x_train, y=y_train))
print("Testing Accurcay: ", clf.score(X=x_test, y=y_test))

In [None]:
# Out-of-fold predictions for the SVC (3-fold CV on the training set),
# summarised as a confusion matrix of true versus predicted classes.
y_train_pred = cross_val_predict(
    estimator=clf,
    X=x_train,
    y=y_train,
    cv=3,
)
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 10 trees. No random_state is set, so results can differ
# from run to run.
RF = RandomForestClassifier(n_estimators=10).fit(x_train, y_train)

# evaluation() prints the accuracy and returns
# (accuracy, probabilities, predictions) for the given split.
print("Training Accurcay: ")
accuracyTr, probTr, predictTr = evaluation(x=x_train, y=y_train, model=RF)
print("Testing Accurcay: ")
accuracyTs, probTs, predictTs = evaluation(x=x_test, y=y_test, model=RF)

In [None]:
# Out-of-fold predictions for the random forest (3-fold CV on the training
# set), summarised as a confusion matrix of true versus predicted classes.
y_train_pred = cross_val_predict(
    estimator=RF,
    X=x_train,
    y=y_train,
    cv=3,
)
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

In [None]:
import  tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Dense

In [None]:
# Take `models` and `layers` from tensorflow.keras, the same package that
# provides the optimizer and callbacks below, instead of the standalone `keras`
# package. Mixing the two can pair objects from different Keras installations,
# which are not guaranteed to interoperate. Both names stay bound for the
# later cells that use them.
from tensorflow.keras import layers, models

# Fully connected binary classifier: four ReLU hidden layers (100, 50, 10 and
# 5 units), each followed by batch normalisation, then one sigmoid output unit.
model = models.Sequential([
    layers.Dense(100, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(50, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(5, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(1, activation='sigmoid'),
])

# Plain stochastic gradient descent (no momentum).
opt = tf.keras.optimizers.SGD(learning_rate=0.05)

# Stop after 5 epochs without improvement and roll back to the best weights.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
# Keep only the best model seen so far on disk.
# NOTE: the later experiments reuse this file name, so each run overwrites the
# checkpoint written by the previous one.
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_keras_model.h5", save_best_only=True)

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as an additional metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [None]:
# Train for at most 30 epochs in mini-batches of 54 samples.
# NOTE(review): the test split doubles as the validation set, so early
# stopping and checkpoint selection see the same data that is later passed to
# model.evaluate(). Consider carving a validation set out of the training data.
history = model.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=54,
    validation_data=(x_test, y_test),
    callbacks=[checkpoint_cb, early_stopping_cb],
)

In [None]:
# Loss and accuracy (the metric configured in compile) on the held-out test split.
model.evaluate(x_test, y_test)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Learning curves: one line per metric recorded in history.history, plotted
# against the epoch index, with the y-axis clipped to [0, 1].
history_frame = pd.DataFrame(history.history)
history_frame.plot(figsize=(8, 5))
current_axes = plt.gca()
current_axes.grid(True)
current_axes.set_ylim(0, 1)
plt.show()

# SGD with momentum

In [None]:
# Wider network for the momentum experiment: three ReLU hidden layers (300,
# 150 and 30 units), each followed by batch normalisation, then one sigmoid
# output unit.
model = models.Sequential([
    layers.Dense(300, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(150, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(30, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(1, activation='sigmoid'),
])

# SGD with momentum 0.95 at the same learning rate as the first experiment.
opt = tf.keras.optimizers.SGD(learning_rate=0.05, momentum=0.95)

# Fresh callbacks for this run: stop after 5 epochs without improvement
# (restoring the best weights) and keep only the best model on disk.
early_stopping_cb = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=5)
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_keras_model.h5", save_best_only=True)

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as an additional metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

# Train for at most 30 epochs in mini-batches of 54 samples.
# NOTE(review): the test split doubles as the validation set, so early
# stopping and checkpoint selection see the same data that is later passed to
# model.evaluate().
history = model.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=54,
    validation_data=(x_test, y_test),
    callbacks=[checkpoint_cb, early_stopping_cb],
)

In [None]:
# Loss and accuracy (the metric configured in compile) on the held-out test split.
model.evaluate(x_test,y_test)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Learning curves: one line per metric recorded in history.history, plotted
# against the epoch index, with the y-axis clipped to [0, 1].
history_frame = pd.DataFrame(history.history)
history_frame.plot(figsize=(8, 5))
current_axes = plt.gca()
current_axes.grid(True)
current_axes.set_ylim(0, 1)
plt.show()

# Adam optimizer

In [None]:

# Same architecture as the momentum experiment: three ReLU hidden layers (300,
# 150 and 30 units), each followed by batch normalisation, then one sigmoid
# output unit.
model = models.Sequential([
    layers.Dense(300, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(150, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(30, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(1, activation='sigmoid'),
])

# NOTE(review): 0.05 is far above Adam's default learning rate (0.001).
# Confirm this value is intentional.
opt = tf.keras.optimizers.Adam(learning_rate=0.05)

# Fresh callbacks for this run: stop after 5 epochs without improvement
# (restoring the best weights) and keep only the best model on disk.
early_stopping_cb = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=5)
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_keras_model.h5", save_best_only=True)

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as an additional metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

# Train for at most 30 epochs in mini-batches of 54 samples.
# NOTE(review): the test split doubles as the validation set, so early
# stopping and checkpoint selection see the same data that is later passed to
# model.evaluate().
history = model.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=54,
    validation_data=(x_test, y_test),
    callbacks=[checkpoint_cb, early_stopping_cb],
)

In [None]:
# Loss and accuracy (the metric configured in compile) on the held-out test split.
model.evaluate(x_test,y_test)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Learning curves: one line per metric recorded in history.history, plotted
# against the epoch index, with the y-axis clipped to [0, 1].
history_frame = pd.DataFrame(history.history)
history_frame.plot(figsize=(8, 5))
current_axes = plt.gca()
current_axes.grid(True)
current_axes.set_ylim(0, 1)
plt.show()

# The highest accuracy we got is 69%, from the model trained with the SGD optimizer and momentum.