In [None]:
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.models import Sequential
import pandas as pd 
import numpy as np 
import keras
from sklearn.metrics import accuracy_score, confusion_matrix
import os
import matplotlib.pyplot as plt

In [None]:
# Directory that holds the pre-split train / cross-validation / test files.
# Change this single constant to point at your local copy of the data.
DATA_DIR = r'D:\M.eng\Machine learning\ML_project\Data'

train = np.loadtxt(os.path.join(DATA_DIR, 'Water_Train_Data.txt'))
cv = np.loadtxt(os.path.join(DATA_DIR, 'Water_CV_Data.txt'))
test = np.loadtxt(os.path.join(DATA_DIR, 'Water_Test_Data.txt'))

# Columns 0-19 are used as features and column 20 as the label.
# Every split is kept in samples-by-features layout; the hand-written NN cells
# transpose explicitly where they need features-by-samples.
X = train[:, 0:20]
Y = train[:, 20]
X_val = cv[:, 0:20]
y_val = cv[:, 20]
X_test = test[:, 0:20]  # not transposed: the scikit-learn style models expect samples-by-features
y_test = test[:, 20]

<blockquote>Create model using NN</blockquote>

# Train NN

In [None]:
# Features are the first 20 columns of the training matrix and the target is column 20.
# No row filtering happens here: every training row is kept.
X, Y = train[:, :20], train[:, 20]

In [None]:
# Preview the feature matrix: its shape and the first five rows
X.shape, X[:5]

In [None]:
# Preview the target vector: its shape and the first five labels
Y.shape, Y[:5]

In [None]:
# implementing a sigmoid activation function
def sigmoid(z):
    """Element-wise logistic function ``1 / (1 + exp(-z))``.

    The value is computed from ``exp(-|z|)``, which always lies in (0, 1], so
    large negative inputs no longer overflow ``np.exp``.

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation values.

    Returns
    -------
    NumPy scalar or ndarray with the same shape as ``z`` and values in [0, 1].
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function, overflow-free
    s = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where yields a 0-d array for scalar input; [()] unwraps it to a NumPy scalar
    return s[()]

In [None]:
def network_architecture(X, Y):
    """Return the layer sizes ``(n_x, n_h, n_y)`` for the network.

    ``n_x`` is ``X.shape[0]`` and ``n_y`` is ``Y.shape[0]``; the hidden width
    returned here is always 100.
    """
    hidden_units = 100
    input_units, output_units = X.shape[0], Y.shape[0]
    return (input_units, hidden_units, output_units)

In [None]:
# Sanity check. The layer sizes are read from axis 0, so pass the data in the same
# layout used for training: features-by-samples X and a 1-by-samples label row.
network_architecture(X.T, Y.reshape(1, -1))

You can adjust the number of nodes in the hidden layer (`n_h`); the input-layer size is fixed by the number of features.

In [None]:
def define_network_parameters(n_x, n_h, n_y):
    """Create the initial weights and biases of the two-layer network.

    Weights are drawn from a standard normal distribution and scaled by 0.01;
    biases start at zero. Returns a dict with keys ``W1`` (n_h x n_x),
    ``b1`` (n_h x 1), ``W2`` (n_y x n_h) and ``b2`` (n_y x 1).
    """
    scale = 0.01
    hidden_weights = scale * np.random.randn(n_h, n_x)
    output_weights = scale * np.random.randn(n_y, n_h)
    return {
        "W1": hidden_weights,
        "b1": np.zeros((n_h, 1)),
        "W2": output_weights,
        "b2": np.zeros((n_y, 1)),
    }

In [None]:
# Sanity check with the real layer sizes (20 input features, 100 hidden units, 1 output).
# Only the array shapes are shown, to keep the cell output readable.
{name: value.shape for name, value in define_network_parameters(20, 100, 1).items()}

The hidden layer and the output layer each have a weight matrix and a bias term.

In [None]:
def forward_propagation(X, params):
    """Run one forward pass through the hidden and output layers.

    Both layers apply ``sigmoid`` to ``W . input + b``. Returns a dict holding
    the pre-activations (``Z1``, ``Z2``) and activations (``A1``, ``A2``).
    """
    cache = {}
    # hidden layer
    cache["Z1"] = params['W1'].dot(X) + params['b1']
    cache["A1"] = sigmoid(cache["Z1"])
    # output layer
    cache["Z2"] = params['W2'].dot(cache["A1"]) + params['b2']
    cache["A2"] = sigmoid(cache["Z2"])
    return cache

In [None]:
# Sanity check: one forward pass over the training features with freshly initialised
# parameters. Inputs are built explicitly (no reliance on IPython's `_`), and only
# the shapes of the returned arrays are shown.
{name: value.shape
 for name, value in forward_propagation(X.T, define_network_parameters(X.shape[1], 100, 1)).items()}

In [None]:
def compute_error(Predicted, Actual):
    """Mean binary cross-entropy between predicted probabilities and labels.

    Parameters
    ----------
    Predicted : ndarray
        Predicted probabilities, same shape as ``Actual``.
    Actual : ndarray
        Labels with the examples along the last axis, e.g. shape (1, m) or (m,).

    Returns
    -------
    The summed cross-entropy divided by the number of examples (last-axis length).
    """
    # Keep probabilities strictly inside (0, 1) so log() never receives 0,
    # which would turn the cost into NaN/inf once the network saturates.
    eps = 1e-12
    clipped = np.clip(Predicted, eps, 1 - eps)
    logprobs = np.multiply(np.log(clipped), Actual) + np.multiply(np.log(1 - clipped), 1 - Actual)
    # Examples live on the last axis; shape[0] would be 1 for a (1, m) label row.
    m = Actual.shape[-1]
    cost = -np.sum(logprobs) / m
    return np.squeeze(cost)

compute network error

In [None]:
def backward_propagation(params, activations, X, Y):
    """Gradients of the mean cross-entropy loss for the two-layer sigmoid network.

    Parameters
    ----------
    params : dict
        Current parameters; only ``W2`` is read here.
    activations : dict
        Forward-pass results; ``A1`` (hidden) and ``A2`` (output) are read.
    X : ndarray
        Inputs with one example per column; ``X.shape[1]`` is the example count.
    Y : ndarray
        Labels, broadcastable against ``A2``.

    Returns
    -------
    dict with keys ``dW1``, ``db1``, ``dW2``, ``db2``.
    """
    m = X.shape[1]
    A1 = activations['A1']
    A2 = activations['A2']

    # output layer: sigmoid + cross-entropy gives the simple error term A2 - Y
    dZ2 = A2 - Y
    dW2 = np.dot(dZ2, A1.T) / m
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m

    # hidden layer: the hidden activation is a sigmoid, whose derivative is A1 * (1 - A1)
    # (1 - A1**2 would be the derivative of tanh, which this network does not use)
    dZ1 = np.dot(params['W2'].T, dZ2) * A1 * (1 - A1)
    dW1 = np.dot(dZ1, X.T) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m

    return {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

def update_parameters(params, derivatives, alpha = 0.001):
    """Apply one gradient-descent step and return the (same) ``params`` dict.

    Every entry ``K`` in ``W1, b1, W2, b2`` becomes ``K - alpha * dK``, where
    ``alpha`` is the learning rate and ``dK`` is read from ``derivatives``.
    """
    for key in ('W1', 'b1', 'W2', 'b2'):
        step = alpha * derivatives['d' + key]
        params[key] = params[key] - step
    return params

You can adjust alpha

Implement Backward Propagation

In [None]:
def neural_network(X, Y, n_h, num_iterations=500, alpha=0.001):
    """Train the two-layer network with batch gradient descent.

    Parameters
    ----------
    X : ndarray
        Inputs with one example per column.
    Y : ndarray
        Labels with one example per column.
    n_h : int
        Number of hidden units.
    num_iterations : int, optional
        Number of gradient-descent steps.
    alpha : float, optional
        Learning rate passed to ``update_parameters``.

    Returns
    -------
    dict with the trained ``W1``, ``b1``, ``W2``, ``b2``.
    """
    # Only the input and output sizes come from the data; the hidden width is n_h.
    layer_sizes = network_architecture(X, Y)
    n_x, n_y = layer_sizes[0], layer_sizes[2]

    params = define_network_parameters(n_x, n_h, n_y)
    for iteration in range(num_iterations):
        results = forward_propagation(X, params)
        derivatives = backward_propagation(params, results, X, Y)
        params = update_parameters(params, derivatives, alpha)
    return params

Compile and Train the Model

In [None]:
# Fix the seed so the random weight initialisation (np.random.randn) is reproducible
np.random.seed(42)

# The network code expects features-by-samples inputs and a 1-by-samples label row
y = Y.reshape(1, Y.size)
x = X.T
model = neural_network(x, y, n_h = 10, num_iterations = 500)

In [None]:
def predict(parameters, X):
    """Return rounded output activations for ``X``.

    Prints the first row of the raw output activations, then returns them
    rounded with ``np.around``.
    """
    output_activation = forward_propagation(X, parameters)['A2']
    print (output_activation[0])
    return np.around(output_activation)

predictions = predict(model, x)
# Share of predictions that match the labels. np.mean returns a scalar, which avoids
# calling float() on a (1, 1) array (deprecated in recent NumPy releases).
train_accuracy = np.mean(predictions == y) * 100
print ('Training accuracy: %d' % train_accuracy + '%')

# Validation NN

In [None]:
# Evaluate the trained network on the cross-validation split.
# The network expects features-by-samples input, hence the transpose.
X_val = cv[:,0:20].T

# True labels of the validation examples
y_val = cv[:,20]

# Predict on the validation set
predictions = predict(model, X_val)

# Accuracy = share of predictions equal to the label. np.mean yields a scalar, so no
# float() conversion of a one-element array is needed.
accuracy = float(np.mean(predictions.ravel() == y_val) * 100)
print('Validation accuracy: %d%%' % accuracy)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Validation features (transposed for the network) and true labels
X_val = cv[:, 0:20].T
y_val = cv[:, 20]

# Predict, then flatten the prediction row into a 1-D vector for scikit-learn
predictions = predict(model, X_val).ravel()

# Confusion matrix of true labels vs. predictions
cm = confusion_matrix(y_val, predictions)

# Render it as a blue heat map
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

print("Confusion Matrix:")
print(cm)


[[1415 0
185 0]]
(scikit-learn layout: rows = actual class, columns = predicted class, i.e. [[TN FP], [FN TP]])
TP = 0
FP = 0
TN = 1415
FN = 185

In [None]:
# Work on flat arrays so the element-wise comparisons line up one-to-one
val_preds = np.ravel(predictions)

# Confusion matrix components
TP = np.sum((val_preds == 1) & (y_val == 1))  # True Positives
TN = np.sum((val_preds == 0) & (y_val == 0))  # True Negatives
FP = np.sum((val_preds == 1) & (y_val == 0))  # False Positives
FN = np.sum((val_preds == 0) & (y_val == 1))  # False Negatives

# Layout used here: [[TP, FP], [FN, TN]].
# Stored under its own name so the imported sklearn function `confusion_matrix`
# is not overwritten by an array.
nn_val_confusion = np.array([[TP, FP], [FN, TN]])

# Accuracy in percent
accuracy = float(
    (TP + TN) / y_val.size * 100
)

# Print confusion matrix and accuracy
print("Confusion Matrix of Validation:")
print(nn_val_confusion)
print(f'Accuracy: {accuracy}%')

# Learning curve NN

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Training data (samples-by-features) and labels
X_train_full = X
y_train_full = Y

X_val_full = X_val
y_val_full = y_val

# Fractions of the training set to train on
training_sizes = np.linspace(0.1, 1.0, 100)
train_accuracies = []
val_accuracies = []

# The validation set is the same for every training size, so prepare it once.
# Transpose only if X_val is still samples-by-features; the network needs features-by-samples.
X_val_subset = X_val_full.T if X_val_full.shape[0] == y_val_full.shape[0] else X_val_full
y_val_flat = y_val_full.ravel()

for size in training_sizes:
    subset_size = int(size * X_train_full.shape[0])

    # First `subset_size` training rows, in the layout the network expects
    X_subset = X_train_full[:subset_size].T
    y_subset = y_train_full[:subset_size].reshape(1, -1)

    # Train a fresh network for this size. It gets its own name so the loop does not
    # overwrite the `model` trained earlier in the notebook.
    curve_params = neural_network(X_subset, y_subset, n_h=100, num_iterations=500) #change the number of num_iteration

    # Round the output activations directly: same result as predict(), without
    # printing a full probability array on each of the 200 calls.
    train_preds = np.around(forward_propagation(X_subset, curve_params)['A2']).ravel()
    train_acc = float(np.mean(train_preds == y_subset.ravel()) * 100)

    val_preds = np.around(forward_propagation(X_val_subset, curve_params)['A2']).ravel()
    val_acc = float(np.mean(val_preds == y_val_flat) * 100)

    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

# Plot learning curve
plt.figure(figsize=(8, 6))
plt.plot(training_sizes * 100, train_accuracies, label='Training Accuracy')
plt.plot(training_sizes * 100, val_accuracies, label='Validation Accuracy')
plt.xlabel('Training Set Size (%)')
plt.ylabel('Accuracy (%)')
plt.title('Learning Curve')
plt.legend()
plt.grid(True)
plt.show()


<blockquote>Create model using XGBoost</blockquote>

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Feature / label splits for XGBoost, kept in samples-by-features layout
# (not transposed like the inputs of the hand-written network)
X_train, y_train = train[:, :20], train[:, 20]
X_val, y_val = cv[:, :20], cv[:, 20]

In [None]:
# Hyper-parameters of the XGBoost classifier, collected in one place
xgb_settings = dict(
    n_estimators=500,              # number of boosting rounds
    learning_rate=0.001,           # step size of each boosting round
    max_depth=4,                   # depth of each tree
    subsample=0.8,                 # fraction of samples drawn for each tree
    colsample_bytree=0.8,          # fraction of features drawn for each tree
    objective='binary:logistic',   # binary classification
    eval_metric='logloss',         # evaluation metric
)

# Build the (still unfitted) classifier
model = xgb.XGBClassifier(**xgb_settings)

# Training and Validation XGBoost

In [None]:
# Fit on the training split
model.fit(X_train, y_train)

# Class predictions for the validation split
y_pred = model.predict(X_val)

# Validation accuracy (fraction of correct predictions), printed as a percentage
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Validation confusion matrix
cm = confusion_matrix(y_val, y_pred)
print("Confusion Matrix of Validation:", cm, sep="\n")

In [None]:
# #          Predicted
#           0     1
# Actual  ---------
#    0   | TN   FP |
#    1   | FN   TP |


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map.

    This function only plots: it neither trains nor evaluates any model.

    Parameters
    ----------
    cm : ndarray
        Square confusion matrix (rows: true label, columns: predicted label).
    classes : sequence
        Tick labels, one per class.
    normalize : bool, optional
        If True, each row is divided by its sum before plotting, so both the
        colours and the cell texts show per-class fractions.
    title : str, optional
        Figure title.
    cmap : matplotlib colormap, optional
        Colormap of the heat map.
    """
    # Normalise first so that the image and the annotations show the same numbers
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # White text on dark cells, black text on light cells
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, f"{cm[i, j]:.2f}" if normalize else int(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

# Predict the test split with the fitted XGBoost model. The classifier expects
# samples-by-features input, so the test features are sliced without a transpose.
X_test = test[:, 0:20]
y_pred = model.predict(X_test)

# Visualise the test confusion matrix as a blue heat map
confusion_mtx = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(confusion_mtx, classes=range(2))

In [None]:
import joblib

# Persist the current `model` object to disk so that later cells can reload it
joblib.dump(model, filename='xgb_model.pkl')

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

# Feature / label splits in samples-by-features layout
X_train = train[:,0:20]
y_train = train[:,20]
X_val = cv[:,0:20]
y_val = cv[:,20]

# Step 3: Define XGBoost model
# eval_metric is set explicitly so that the 'logloss' history read in step 6 is
# always recorded, independent of the library's default metric.
model = xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=4,
                          eval_metric='logloss')

# Step 4: Train model with evaluation sets
# (the first entry is reported as 'validation_0', the second as 'validation_1')
eval_set = [(X_train, y_train), (X_val, y_val)]
model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=False  # Set to True to see training output
)

# Step 5: Extract training and validation loss using model.evals_result()
eval_results = model.evals_result()

# Step 6: Extract training and validation loss
train_loss = eval_results['validation_0']['logloss']  # Training loss
val_loss = eval_results['validation_1']['logloss']    # Validation loss

# Step 7: Plot the learning curve
plt.figure(figsize=(8, 5))
plt.plot(train_loss, label='Training Log Loss', color='blue')
plt.plot(val_loss, label='Validation Log Loss', color='red')
plt.xlabel('Number of Trees (Iterations)')
plt.ylabel('Log Loss')
plt.title('XGBoost Learning Curve')
plt.legend()
plt.grid()
plt.show()

<blockquote>Create model using decision trees</blockquote>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score 
from sklearn.model_selection import learning_curve, StratifiedKFold, train_test_split
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# import graphviz 


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Decision tree that uses entropy as its split criterion, fitted on the training split.
# (max_depth and random_state are left at their defaults; the trailing comment shows
# values that can be tried.)
dt = DecisionTreeClassifier(criterion = 'entropy') #max_depth= 5, random_state=0
dt.fit(X,Y)
# Data splits used in this section:
#   training   -> X, Y
#   validation -> X_val, y_val
#   test       -> X_test, y_test

# Training Decision tree

In [None]:
# Accuracy of the entropy tree on the data it was fitted on
train_accuracy_entropy = accuracy_score(Y, dt.predict(X))
print('Train Accuracy:', train_accuracy_entropy)

In [None]:
# Predict the validation split with the entropy tree
y_pred = dt.predict(X_val)

In [None]:
# Validation accuracy of the entropy tree
val_accuracy_entropy = accuracy_score(y_val, y_pred)
print('Validation Accuracy:', val_accuracy_entropy)

In [None]:
# Second decision tree, this time with the Gini index as split criterion,
# fitted on the same training split
dt_gini = DecisionTreeClassifier(criterion='gini') #max_depth= 10, random_state=0
dt_gini.fit(X,Y)

Changing `max_depth` will change the accuracy.

In [None]:
# Predict the validation split with the Gini tree and report its accuracy
y_pred_gini = dt_gini.predict(X_val)
val_accuracy_gini = accuracy_score(y_val, y_pred_gini)
print('Validation Accuracy:', val_accuracy_gini)

In [None]:
# Confusion matrix of the Gini tree on the validation split (true labels first, predictions second)
cm = confusion_matrix(y_val,y_pred_gini)

In [None]:
# Display the confusion matrix computed in the previous cell
cm

In [None]:
# scikit-learn layout (rows = actual class, columns = predicted class):
# [[TN,FP],
#  [FN,TP]]

Decision Trees with Gini index provide better accuracy! So I'll show the model created by this!

In [None]:
# How often each class appears among the Gini tree's validation predictions
unique_elements, counts_elements = np.unique(y_pred_gini, return_counts=True)
print("Frequency of unique values of the said array:")
# first row: the distinct predicted values, second row: their counts
print(np.array([unique_elements, counts_elements]))

In [None]:
# Draw the fitted Gini tree on one large canvas and save it as an image
fig, axes = plt.subplots(figsize=(50, 50), dpi=100)  # a single axes (1 x 1 grid is the default)
tree.plot_tree(dt_gini, ax=axes, fontsize=14);
fig.savefig('Decision.png')

In [None]:
# Collect the spreadsheet's column names (used as feature names) and the class names
df = pd.DataFrame(pd.read_excel(r'D:\M.eng\Machine learning\ML_project\WaterSafety_Data.xlsx'))
# iterating over a DataFrame yields its column labels; the empty row slice keeps only the header
fn = [colum for colum in df.iloc[0:0]]
print(fn)
cn = ['0_is not safe','1_is safe']

In [None]:
# Draw the Gini tree once, with feature and class names and filled nodes.
# dpi=100 matches the unlabelled plot: a 50 x 50 inch canvas at dpi=1000 would
# need a 50,000 x 50,000 pixel raster (about 10 GB as RGBA). Raise dpi or
# shrink figsize if a sharper image is required.
fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize=(50,50), dpi = 100)
tree.plot_tree(dt_gini,
               feature_names = fn,
               class_names=cn,
               filled = True,
               fontsize=14,
               ax=axes);
fig.savefig('Decision.png')

# Decision Tree and Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 500 trees; random_state is fixed so repeated runs build the same forest
rf = RandomForestClassifier(n_estimators=500,
                            random_state=0)

In [None]:
# Fit the random forest on the training split
rf.fit(X,Y)

In [None]:
# Predict the validation split with the random forest and report its accuracy
y_pred_rf = rf.predict(X_val)
val_accuracy_rf = accuracy_score(y_val, y_pred_rf)
print('Validation Accuracy:', val_accuracy_rf)

In [None]:
# Test with validation set
# score = rf.score(X_val, y_val)
# print(score)

In [None]:
# Confusion matrix of the random forest on the validation split (true labels first, predictions second)
cm = confusion_matrix(y_val,y_pred_rf)

In [None]:
# Display the confusion matrix computed in the previous cell
cm

In [None]:
# Peek at the first few fitted trees instead of dumping all 500 estimators
rf.estimators_[:5]

In [None]:
# Number of fitted trees in the forest
print(len(rf.estimators_))

In [None]:
# The first individual tree of the forest
rf.estimators_[0]

In [None]:
# Collect the spreadsheet's column names (used as feature names) and the class names
df = pd.read_excel(r'D:\M.eng\Machine learning\ML_project\WaterSafety_Data.xlsx')
df = pd.DataFrame(df)
fn = list(df.columns)
print(fn)
cn = ['0_is not safe','1_is safe']

# Draw the first tree of the forest on an explicit axes.
# dpi=100 keeps the 50 x 50 inch canvas renderable: at dpi=1000 it would need a
# 50,000 x 50,000 pixel raster (about 10 GB as RGBA).
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize=(50,50),dpi=100)
tree.plot_tree(rf.estimators_[0],
               feature_names = fn,
               class_names=cn,
               filled = True,
               ax=axes);
fig.savefig('rf_individualtree.png')

# Test Decision Tree

In [None]:
# Score the entropy tree on the test split. The predictions are computed here from
# the test features; `y_pred` still holds validation-set predictions at this point.
print('Test Accuracy:',accuracy_score(y_test,dt.predict(test[:, 0:20]))) #test using entropy

In [None]:
# Per-class precision / recall of the entropy tree, computed from test-set predictions
print(classification_report(y_test,dt.predict(test[:, 0:20])))

In [None]:
# Score the Gini tree on the test split. `y_pred_gini` holds validation-set
# predictions, so the test predictions are computed here.
print('Test Accuracy:',accuracy_score(y_test,dt_gini.predict(test[:, 0:20]))) #test using Gini index

In [None]:
# Per-class precision / recall of the Gini tree, computed from test-set predictions
print(classification_report(y_test,dt_gini.predict(test[:, 0:20])))

# Test NN

In [None]:
# Test features, transposed because the network expects features-by-samples input
X_test = test[:,0:20].T #test with test set

# True labels of the test examples
y_test = test[:,20]

# By this point `model` has been rebound to an XGBoost classifier, so the network
# parameters are trained again here with the same settings as in the training cell.
nn_params = neural_network(X.T, Y.reshape(1, -1), n_h=10, num_iterations=500)

# Predict on the test set
predictions = predict(nn_params, X_test)

# Accuracy = share of predictions equal to the label, over the number of TEST examples
accuracy = float(np.mean(predictions.ravel() == y_test) * 100)
print('Test accuracy: %d%%' % accuracy)

In [None]:
# Work on flat arrays so the element-wise comparisons line up one-to-one
test_preds = np.ravel(predictions)

# Confusion matrix components
TP = np.sum((test_preds == 1) & (y_test == 1))  # True Positives
TN = np.sum((test_preds == 0) & (y_test == 0))  # True Negatives
FP = np.sum((test_preds == 1) & (y_test == 0))  # False Positives
FN = np.sum((test_preds == 0) & (y_test == 1))  # False Negatives

# Layout used here:
# [[TP, FP],
#  [FN, TN]]
# Stored under its own name so the imported sklearn function `confusion_matrix`
# stays callable in the cells that follow.
nn_test_confusion = np.array([[TP, FP], [FN, TN]])

# Accuracy in percent
accuracy = float(
    (TP + TN) / y_test.size * 100
)

# Print confusion matrix and accuracy
print("Confusion Matrix of Testing:")
print(nn_test_confusion)
print(f'Accuracy: {accuracy}%')

Model Explanations
The confusion matrix shows a large number of false negatives (FN), which means the model is missing a lot of positive cases and incorrectly classifying them as negative. This suggests that the model is not very good at identifying the positive class (1).

The model isn't making any predictions for the positive class (1), which could mean:

The model is biased towards predicting the negative class (0) — all of its correct predictions are true negatives (TN) — and it never predicts 1.
It could be that the model is overfitting to the negative class, or that the positive class (1) is underrepresented in the training data (leading the model to favor the negative class).

Model => overfitting; this is improved by using XGBoost instead.

In [None]:
# NOTE(review): no cell visible in this notebook writes 'NN_model.pkl' - confirm the
# file exists before running, otherwise this load fails.
# NOTE(review): joblib.load unpickles the file; only load files from a trusted source.
model = joblib.load('NN_model.pkl')

# Test XGBoost

In [None]:
# Re-import the metric functions: earlier cells rebind the name `confusion_matrix`
# to a NumPy array, which would make the call below fail.
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the saved model
model = joblib.load('xgb_model.pkl')

# Predict the test set with the loaded model (samples-by-features input)
X_test = test[:,0:20]
y_test = test[:,20]
y_pred = model.predict(X_test)
new_predictions = y_pred

# Accuracy against the TEST labels (the predictions above are for the test set)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion matrix on the test set
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix of Testing:")
print(cm)

In [None]:
# Re-import the metric functions: earlier cells rebind the name `confusion_matrix`
# to a NumPy array, which would make the call below fail.
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict one hand-entered sample with the saved XGBoost model
model = joblib.load('xgb_model.pkl')
# New sample with 20 features
new_sample = [2.4864, 13.9879, 1.2767, 2.5472, 0.6646, 4.2778, 0.6198, 0.8900, 1.0629, 1.3389,
             0.5301, 0.6185, 3.2371, 2.6852, 0.0402, 23.6282, 2.8132, 0.6108, 0.1690, 0.0063]
new_sample = np.array(new_sample).reshape(1, -1)

# Check the shape
print(f"Shape of new data (single sample): {new_sample.shape}")  # Should print (1, 20)

# Predict using the trained model
prediction = model.predict(new_sample)

# Print the prediction
print("Prediction for the new sample:", prediction)

# Overall quality of the loaded model on the test split. Predictions are recomputed
# here so the numbers do not depend on variables left over from other cells.
y_test = test[:, 20]
y_pred = model.predict(test[:, 0:20])

# Accuracy against the test labels
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion matrix on the test set
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix of Testing:")
print(cm)

When both models are given the same sample, X_test = [2.4864, 13.9879, 1.2767, 2.5472, 0.6646, 4.2778, 0.6198, 0.8900, 1.0629, 1.3389,
             0.5301, 0.6185, 3.2371, 2.6852, 0.0402, 23.6282, 2.8132, 0.6108, 0.1690, 0.0063]:
The NN model predicts y = 0 (model accuracy = 88.4%).
The XGBoost model predicts y = 0 (model accuracy = 96.5%).