# Predict the quality of water - Intel oneAPI Hackathon 2023 - Sandy Inspires

## What's in the notebook
- Importing Packages
- Loading Dataset
- Understanding Dataset
- Preprocessing stage
- Balancing Dataset
- Model Training
- Model Evaluation
- Hyper-parameter Tuning

`Note: Many snippets and proofs of concept were written to make this work; they are not included in this notebook because they are too extensive.`

# Importing Packages

In [None]:
# Intel packages and configuration.
# Select Dask as Modin's execution engine; this happens before
# `modin.pandas` is imported at the end of this cell.
from modin.config import Engine
Engine.put("dask")
# Start a Dask distributed client with 6 workers.
# NOTE(review): the worker count is hard-coded — confirm it suits the host machine.
from dask.distributed import Client
client = Client(n_workers=6)

# From here on `pd` refers to Modin's pandas-compatible API, not pandas itself.
import modin.pandas as pd

In [None]:
# user library
import numpy as np
#import pandas_profiling
import random as rnd


In [None]:
# Patch scikit-learn through the sklearnex extension.
# This cell is placed before the scikit-learn imports in the next cell.
from sklearnex import patch_sklearn
patch_sklearn()


In [None]:
# preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import category_encoders as ce

# machine learning model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# model evaluation
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, RocCurveDisplay, PrecisionRecallDisplay, ConfusionMatrixDisplay, auc, roc_curve

# plots
import seaborn as sns


# Loading Dataset

In [None]:
# Row cap used when only a sample of the full CSV is read (passed as `nrows` below).
sample_size = 1_000_000

In [None]:
# Load only the first `sample_size` rows of the full dataset.
fresh_water_df = pd.read_csv("datasets/dataset.csv", nrows=sample_size)

In [None]:
# Alternative source: rebinds `fresh_water_df` to the encoded 20K CSV.
# NOTE(review): every loading cell assigns the same name, so only the last one
# executed takes effect — presumably just one of them is meant to be run.
fresh_water_df = pd.read_csv("datasets/encoded_20K_dataset.csv")

In [None]:
# Alternative source: rebinds `fresh_water_df` to the 20K CSV.
# NOTE(review): every loading cell assigns the same name, so only the last one
# executed takes effect — presumably just one of them is meant to be run.
fresh_water_df = pd.read_csv("datasets/20k_dataset.csv")

In [None]:
# Alternative source: rebinds `fresh_water_df` to the full CSV (no row cap).
# NOTE(review): every loading cell assigns the same name, so only the last one
# executed takes effect — presumably just one of them is meant to be run.
fresh_water_df = pd.read_csv("datasets/dataset.csv")

In [None]:
# Preview the first rows of the loaded frame.
fresh_water_df.head()

In [None]:
# dropna() is not an option: it would discard too much data (5,956,843 rows in total vs. 3,981,800 left after dropping)
#fresh_water_df = fresh_water_df.dropna()

## Understanding Dataset

In [None]:
# Class distribution of the `Target` label.
fresh_water_df["Target"].value_counts()

In [None]:
# Non-null count per column; differences between columns reveal missing values.
fresh_water_df.count()

In [None]:
# Pairwise correlation between the numeric columns only.
# Non-numeric columns (e.g. `Color`, which is one-hot encoded later) are
# filtered out explicitly: recent pandas versions raise in `corr()` instead of
# silently dropping them.
corr = fresh_water_df.select_dtypes(include="number").corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Correlation of each column in the matrix with the `Target` label.
corr["Target"]

In [None]:
# List the column names by dtype using the public `select_dtypes` API.
# The private `_get_numeric_data(axis=...)` helper used previously is not part
# of the pandas API and its `axis` argument is backend-specific.
print(f"Numeric Columns - {list(fresh_water_df.select_dtypes(include='number').columns)}")
print(f"Categorical Columns - {list(fresh_water_df.select_dtypes(include=object).columns)}")


In [None]:
# Summary statistics of the frame.
fresh_water_df.describe()

In [None]:
# Column dtypes and non-null counts.
fresh_water_df.info()

# Preprocessing Stage

In [None]:
# Drop the `Month` column.
# NOTE(review): the following cell drops a superset of these columns —
# presumably only one of the two cells is meant to run.
columns_to_drop = ["Month"]
fresh_water_df = fresh_water_df.drop(columns=columns_to_drop)


In [None]:
# Drop the columns that are not used as features.
# `errors="ignore"` keeps the cell re-runnable and lets it follow the previous
# cell, which has already removed "Month" (a plain drop would raise KeyError).
columns_to_drop = ["Color", "Month", "Source", "Index"]
fresh_water_df = fresh_water_df.drop(columns=columns_to_drop, errors="ignore")


In [None]:
# Restrict the frame to the selected features plus the `Target` label.
feature_columns_to_use = ["Iron", "Nitrate", "Chloride", "Turbidity", "Odor", "Sulfate", "Chlorine", "Target"]
fresh_water_df = fresh_water_df[feature_columns_to_use]

# Columns whose NaNs are median-filled in the next cell. `select_dtypes` is the
# public API for this; the private `_get_numeric_data(axis=...)` helper is
# backend-specific.
# NOTE(review): `Target` is part of this list when it is numeric, so missing
# labels would be imputed as well — confirm that is intended.
numerical_columns_to_fill_median = list(fresh_water_df.select_dtypes(include="number").columns)
print("Numeric Values", numerical_columns_to_fill_median)

In [None]:
# Fill NaNs with the column median, then round to 10 decimal places because the
# raw floating-point values carry more digits than are practical to handle.
# `fillna` replaces the former `replace(np.NaN, ...)`: the `np.NaN` alias was
# removed in NumPy 2.0, and `fillna` is the idiomatic call for this.
for column_name in numerical_columns_to_fill_median:
    column_median = fresh_water_df[column_name].median()
    fresh_water_df[column_name] = fresh_water_df[column_name].fillna(column_median).round(10)

In [None]:
# One-hot encode `Color` (prefix "clr") and drop the columns that are not used
# as features. Both steps are guarded so the cell does not raise KeyError when
# an earlier cell has already removed these columns.
# Alternative considered: also encode `Source` and `Month` (prefixes "src",
# "mon") and drop only `Index`.
if "Color" in fresh_water_df.columns:
    fresh_water_df = pd.get_dummies(fresh_water_df, prefix=["clr"], columns=["Color"])
fresh_water_df = fresh_water_df.drop(columns=["Index", "Source", "Month"], errors="ignore")

# Balancing Dataset

In [None]:
# Count the rows of each class so the dataset can be balanced afterwards.
is_target_1 = fresh_water_df["Target"] == 1
is_target_0 = fresh_water_df["Target"] == 0
target_1_count = fresh_water_df.loc[is_target_1, "Target"].count()
target_0_count = fresh_water_df.loc[is_target_0, "Target"].count()
print(target_1_count, target_0_count)


In [None]:
# Size of the smaller class; used as the per-class row cap when balancing.
min_target_count = min(target_1_count, target_0_count)
print(min_target_count)

In [None]:
# Balance the classes: keep the first `min_target_count` rows of each class and
# stack them (class 1 first, then class 0), preserving the original index.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
ndf = pd.concat([
    fresh_water_df.query("Target== 1")[:min_target_count],
    fresh_water_df.query("Target == 0")[:min_target_count],
])


In [None]:
# Separate the feature matrix (every column except `Target`) from the label.
X, Y = ndf.drop(columns=["Target"]), ndf["Target"]

In [None]:
# Scale the features with MinMaxScaler and wrap the result back into a
# DataFrame with the original column names.
# NOTE(review): the scaler is fit on all of `X` before the train/test split in
# the next cell, so test rows influence the scaling — consider fitting on the
# training split only.
# NOTE(review): `scaled_x` is built without `X`'s index; the later split relies
# on row position rather than index labels — confirm.
scaler = preprocessing.MinMaxScaler()
x = scaler.fit_transform(X)
scaled_x = pd.DataFrame(x, columns=X.columns)

In [None]:
# Hold out 25% of the balanced data for testing.
# A fixed `random_state` makes the split (and every score computed from it)
# reproducible across runs; `stratify=Y` keeps the class ratio identical in the
# training and test splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    scaled_x, Y, test_size=0.25, shuffle=True, random_state=42, stratify=Y)

# Model Training

In [None]:
# Random forest: 20 trees, depth capped at 13, at least 4 samples per leaf.
# Alternative configuration kept for reference:
#   RandomForestClassifier(max_depth=50, min_samples_leaf=2, min_samples_split=10, n_estimators=60)
random_forest = RandomForestClassifier(min_samples_leaf=4, max_depth=13, n_estimators=20)
random_forest.fit(X_train, Y_train)
y_pred = random_forest.predict(X_test)

# Report accuracy (in percent, two decimals) on both splits.
rf_train_accuracy = round(random_forest.score(X_train, Y_train) * 100, 2)
rf_test_accuracy = round(random_forest.score(X_test, Y_test) * 100, 2)
print(f"Training Accuracy - {rf_train_accuracy}")
print(f"Testing Accuracy - {rf_test_accuracy}")

In [None]:
# Logistic regression with at most 100 solver iterations.
logreg = LogisticRegression(max_iter=100)
logreg.fit(X_train, Y_train)

# Report accuracy (in percent, two decimals) on both splits.
logreg_train_accuracy = round(logreg.score(X_train, Y_train) * 100, 2)
logreg_test_accuracy = round(logreg.score(X_test, Y_test) * 100, 2)
print(f"Training Accuracy - {logreg_train_accuracy}")
print(f"Testing Accuracy - {logreg_test_accuracy}")


In [None]:
# Support vector classifier with default settings.
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

# Report accuracy (in percent, two decimals) on both splits.
svc_train_accuracy = round(svc.score(X_train, Y_train) * 100, 2)
svc_test_accuracy = round(svc.score(X_test, Y_test) * 100, 2)
print(f"Training Accuracy - {svc_train_accuracy}")
print(f"Testing Accuracy - {svc_test_accuracy}")

In [None]:
# Build a fully connected binary classifier with Keras.
import tensorflow as tf
print(tf.__version__)

# Convert the train/test splits to NumPy arrays for Keras.
train_x, train_y = np.asarray(X_train), np.asarray(Y_train)
test_x, test_y = np.asarray(X_test), np.asarray(Y_test)

# Seed TensorFlow before any layer is created.
tf.random.set_seed(45)

Dense = tf.keras.layers.Dense
Dropout = tf.keras.layers.Dropout

# Dense ReLU stack that narrows from 256 to 8 units, with dropout between the
# groups of layers, followed by a single sigmoid output unit.
model_3 = tf.keras.Sequential([
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dropout(0.6),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])


In [None]:
# Compile with binary cross-entropy and Adam, train for 50 epochs, then
# evaluate on the held-out split.
model_3.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
    metrics=['accuracy'],
)

# `use_multiprocessing` was dropped from this call: it only applies to
# generator / Sequence inputs (the data here are NumPy arrays) and newer Keras
# releases no longer accept it.
# NOTE(review): steps_per_epoch=10 with batch_size=64 means 640 samples are
# consumed per epoch — confirm this cap is intended.
model_3.fit(train_x, train_y, epochs=50, verbose=1, batch_size=64, shuffle=True, steps_per_epoch=10)

model_3.evaluate(test_x, test_y)

In [None]:
# Gradient-boosted trees using XGBoost's default hyper-parameters.
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, Y_train)

# Rebinds `y_pred`, which the evaluation cells further down read.
y_pred = xgb_cl.predict(X_test)

# Report accuracy (in percent, two decimals) on both splits.
xgb_train_accuracy = round(xgb_cl.score(X_train, Y_train) * 100, 2)
xgb_test_accuracy = round(xgb_cl.score(X_test, Y_test) * 100, 2)
print(f"Training Accuracy - {xgb_train_accuracy}")
print(f"Testing Accuracy - {xgb_test_accuracy}")

# NOTE(review): the Model Evaluation section passes these names in the opposite
# order (["safe", "unsafe"]) — confirm which Target value means "safe".
target_names = ["unsafe", "safe"]
xgb_report = classification_report(Y_test, y_pred, target_names=target_names)
print(xgb_report)

# Model Evaluation

In [None]:
# Per-class precision / recall / F1 for the current `y_pred`.
# NOTE(review): the XGBoost cell passes these names in the opposite order
# (["unsafe", "safe"]). Only one ordering can match the label encoding —
# confirm which Target value means "safe".
target_names = ["safe", "unsafe"]
print(classification_report(Y_test, y_pred, target_names=target_names))

In [None]:
# Confusion matrix of the current `y_pred` against the held-out labels.
# NOTE(review): `y_pred` holds the predictions of whichever model cell ran
# last, while the tick labels come from `random_forest.classes_` — confirm both
# refer to the same model.
con_mat = confusion_matrix(Y_test, y_pred)
class_labels = random_forest.classes_
disp = ConfusionMatrixDisplay(con_mat, display_labels=class_labels)
disp.plot(cmap="summer")

In [None]:
# ROC curve for the random forest (the model this section labels and persists).
# The curve is built from the predicted probability of the positive class:
# feeding hard 0/1 predictions to `roc_curve` yields a single operating point
# and a misleading AUC.
# NOTE(review): switch the estimator below if a different model is meant to be
# evaluated here.
positive_class_scores = random_forest.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, positive_class_scores)
roc_auc = auc(fpr, tpr)
# Named `roc_display` rather than `display` so IPython's built-in `display()`
# is not shadowed for the rest of the notebook.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='RandomForestClassifier')
roc_display.plot()

In [None]:
# Persist the fitted random forest to disk with joblib.
# NOTE(review): assumes a `models/` directory already exists — TODO confirm.
import joblib
joblib.dump(random_forest, "models/97_random_forest_nor_full.sav")

# Hyper-parameter Tuning

In [None]:
# Exhaustive grid search over random-forest hyper-parameters.
# `max_features` lists only 'sqrt': the former 'auto' option was an alias of
# 'sqrt' for classifiers (so it merely doubled the search) and was removed in
# scikit-learn 1.3, where it makes every such candidate fail.
# NOTE(review): the grid still holds 2 * 99 * 3 * 3 * 4 = 7,128 combinations,
# each cross-validated — expect a long run on a large training split.
param_grid = {'bootstrap': [True, False],
              'max_depth': list(range(1,100)),
              'max_features': ['sqrt'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10],
              'n_estimators': [10, 20, 30, 40]
              }
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, n_jobs=-1, return_train_score=True)
grid_search.fit(X_train, Y_train)
print(grid_search.best_estimator_)

In [None]:
# Randomised search over the same parameter space as the grid search.
random_search = RandomizedSearchCV(RandomForestClassifier(),
                                   param_grid)
random_search.fit(X_train, Y_train)
print(random_search.best_estimator_)

# With the default `refit=True`, the search has already refit the best
# estimator on the full training split, so it is used as-is; fitting it a
# second time would only repeat the training.
h_random_forest = random_search.best_estimator_
print(
      f"Training Accuracy - {round(h_random_forest.score(X_train, Y_train) * 100, 2)}")
print(
    f"Testing Accuracy - {round(h_random_forest.score(X_test, Y_test) * 100, 2)}")