<a href="https://colab.research.google.com/github/PatrickJahn/easv-ML-mini-project/blob/main/ML_Mini_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ML Mini Project


The mini project can be a classification or regression task, and it should
include

1.   Loading and preparation of data
2.   Selection, training and fine-tuning of a model
3.   Evaluation of the model

You can choose between the following model architectures:

1.   Multilayer Perceptron
2.   Convolutional Neural Network
3.   Random Forest
4.   Gradient Boosted Decision Trees (incl. Histogram-Based Gradient Boosting)

#Spam Email Classification - Random Forest

## Loading and preparation of data


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
column_names = [f'feature_{i}' for i in range(57)] + ['is_spam']
df = pd.read_csv(url, names=column_names)

# Preprocess the data
# Split the data into features and target
X = df.drop('is_spam', axis=1)
y = df['is_spam']

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


## Training and fine-tuning of a model

In [None]:
# Import the Random Forest Model and model selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Initialize the Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
random_forest.fit(X_train, y_train)


# Define a grid of hyperparameters to test
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt'],  # Number of features to consider at every split
    'max_depth': [10, 20, 30, None],  # Maximum number of levels in tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]  # Method of selecting samples for training each tree
}

# Initialize the grid search model
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1,
                           verbose=2)

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Use the best model for predictions
best_grid = grid_search.best_estimator_
predictions = best_grid.predict(X_test)


Fitting 3 folds for each of 432 candidates, totalling 1296 fits


KeyboardInterrupt: 

## Evaluation of the model

In [None]:
# Import necessary modules for evaluation
from sklearn.metrics import classification_report, accuracy_score

# Make predictions
predictions = random_forest.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

# Evaluate the best model with fine tuning
print("Accuracy after fine-tuning:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

Accuracy: 0.9565532223026793
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       804
           1       0.97      0.93      0.95       577

    accuracy                           0.96      1381
   macro avg       0.96      0.95      0.96      1381
weighted avg       0.96      0.96      0.96      1381

Accuracy after fine-tuning: 0.9565532223026793
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       804
           1       0.97      0.93      0.95       577

    accuracy                           0.96      1381
   macro avg       0.96      0.95      0.96      1381
weighted avg       0.96      0.96      0.96      1381



# Diabetes Progression Prediction - MLP


#Age Classification from images - CNN

##Loading and preparation of data

In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import glob
import os
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [11]:
# Read the csv file to check out the images and class
data = pd.read_csv("/content/faces_data/train.csv")
data.head()

Unnamed: 0,ID,Class
0,377.jpg,MIDDLE
1,17814.jpg,YOUNG
2,21283.jpg,MIDDLE
3,16496.jpg,YOUNG
4,4487.jpg,MIDDLE


In [16]:
# We check the different values for the Class
data['Class'].unique()

# Replace the Class values with a number
data['Class'].replace(['YOUNG','MIDDLE','OLD'],[0,1,2],inplace=True)
data.head()

Unnamed: 0,ID,Class
0,377.jpg,1
1,17814.jpg,0
2,21283.jpg,1
3,16496.jpg,0
4,4487.jpg,1


In [20]:
# Function to read the image an reformat it so all images are the same size and type
def readAndFormatImage(path):
      img = tf.io.read_file(path)
      img = tf.image.decode_jpeg(img, channels=3)
      img = tf.image.convert_image_dtype(img, dtype=tf.float32)
      img = tf.image.resize(img, (150, 150))
      return img

In [21]:
# function to load the data that includes images and respective labels
def load_data(image_path, label):
    img = readAndFormatImage(image_path)
    return (img, label)

In [22]:
# built the list of image paths and list of respective responses of the images
PATH = "/content/faces-age-detection-dataset/Train"
image_paths = []
for path in os.listdir(PATH):
    image_paths.append(PATH+"/"+path)
print(len(image_paths))

response_list = []

for i in image_paths:
    _,tail = os.path.split(i)
    response = data.loc[data['ID'] == tail]['Class'].values[0]
    response_list.append(response)
print(len(response_list))

FileNotFoundError: [Errno 2] No such file or directory: '/content/faces-age-detection-dataset/Train'

In [None]:
# split the dataset into train and test dataset
train_size = int(0.9*(len(image_paths)))
print(train_size)
test_size = int(0.1*(len(image_paths)))

train_set = tf.data.Dataset.from_tensor_slices((image_paths[:train_size], response_list[:train_size]))
test_set = tf.data.Dataset.from_tensor_slices((image_paths[test_size:], response_list[test_size:]))

In [None]:
train_set = (train_set
    .map(load_data, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

test_set = (test_set
    .map(load_data, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

In [17]:
# build the layers of CNN model
from tensorflow.keras import layers,models

cnn_model = models.Sequential([
    layers.Conv2D(filters=64, kernel_size=3, activation='relu', input_shape=(150, 150, 3), padding = 'same'),
    layers.MaxPooling2D(pool_size=2),


    layers.Conv2D(filters=128, kernel_size=3, activation='relu', padding = 'same'),
    layers.Conv2D(filters=128, kernel_size=3, activation='relu', padding = 'same'),
    layers.MaxPooling2D(pool_size=2),

    layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding = 'same'),
    layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding = 'same'),
    layers.MaxPooling2D(pool_size=2),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')
])

In [18]:
# view the summary of the cnn model
cnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 150, 150, 64)      1792      
                                                                 
 max_pooling2d (MaxPooling2  (None, 75, 75, 64)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 75, 75, 128)       73856     
                                                                 
 conv2d_2 (Conv2D)           (None, 75, 75, 128)       147584    
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 37, 37, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_3 (Conv2D)           (None, 37, 37, 256)       2

In [None]:
# compile the model
cnn_model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# train the model
cnn_model.fit(train_set, epochs=10, validation_data=test_set)

In [None]:
cnn_model.evaluate(train_set)

In [None]:
# test accuracy
cnn_model.evaluate(test_set)

In [None]:
test_pred = cnn_model.predict(test_set)

In [None]:
y_labels = [np.argmax(item) for item in test_pred]
print("Test Predictions response sample:",y_labels[:10])

test_response = response_list[test_size:]
print("Test True response sample:", test_response[:10])