<a href="https://colab.research.google.com/github/PatrickJahn/easv-ML-mini-project/blob/main/ML_Mini_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ML Mini Project


The mini project can be a classification or regression task, and it should
include

1.   Loading and preparation of data
2.   Selection, training and fine-tuning of a model
3.   Evaluation of the model

You can choose between the following model architectures:

1.   Multilayer Perceptron
2.   Convolutional Neural Network
3.   Random Forest
4.   Gradient Boosted Decision Trees (incl. Histogram-Based Gradient Boosting)

#Spam Email Classification - Random Forest

## Import

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

## Loading and preparation of data


In [22]:
# Load the data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
column_names = [f"feature_{i}" for i in range(57)] + ["is_spam"]
data = pd.read_csv(url, names=column_names)

# Prepare the data
X = data.drop("is_spam", axis=1)
y = data["is_spam"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



## Training and fine-tuning of a model

In [23]:
# Train the Random Forest model
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions and evaluate the model
predictions = rf_classifier.predict(X_test)
print(classification_report(y_test, predictions))

# Fine-tuning the model using GridSearchCV
param_grid = {
     'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

CV_rfc = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       804
           1       0.97      0.93      0.95       577

    accuracy                           0.96      1381
   macro avg       0.96      0.95      0.96      1381
weighted avg       0.96      0.96      0.96      1381



## Evaluation of the model

In [24]:
# Print the best parameters and the best score
print("Best Parameters:", CV_rfc.best_params_)
print("Best Score:", CV_rfc.best_score_)

# Evaluate the best model from grid search
best_model = CV_rfc.best_estimator_
best_predictions = best_model.predict(X_test)
print(classification_report(y_test, best_predictions))

Best Parameters: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 100}
Best Score: 0.9385093167701865
              precision    recall  f1-score   support

           0       0.93      0.98      0.95       804
           1       0.96      0.89      0.93       577

    accuracy                           0.94      1381
   macro avg       0.95      0.93      0.94      1381
weighted avg       0.94      0.94      0.94      1381



**bold text**# Income Level Prediction - MLP


## Import

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

## Loading and preparation of Data

> Indented block



In [None]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'income'
]

# Load the data
data = pd.read_csv(url, names=column_names, na_values=' ?', sep=',\s', engine='python')

# Split the data into features and target label
X = data.drop('income', axis=1)
y = data['income'].apply(lambda x: 1 if x == '>50K' else 0)  # Convert income to binary

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define categorical and numerical features for preprocessing
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Create preprocessors for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


[link text](https://)## Training and fine-tuning of a model

In [None]:
# Create an MLP model pipeline
mlp_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('mlpclassifier', MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=42))
])

X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, train_size=0.3, random_state=42, stratify=y_train)

param_grid = {
    'mlpclassifier__hidden_layer_sizes': [(50,), (100,)],
    'mlpclassifier__activation': ['relu'],
    'mlpclassifier__solver': ['adam'],
    'mlpclassifier__alpha': [0.0001, 0.001]
}

grid_search = GridSearchCV(mlp_pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
# Fine-tuning the MLP model
grid_search.fit(X_train_small, y_train_small)

# Print out the best parameters
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'mlpclassifier__activation': 'relu', 'mlpclassifier__alpha': 0.0001, 'mlpclassifier__hidden_layer_sizes': (100,), 'mlpclassifier__solver': 'adam'}




## Evaluation of the model

In [None]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8242399426758112
              precision    recall  f1-score   support

           0       0.88      0.89      0.88      7417
           1       0.64      0.62      0.63      2352

    accuracy                           0.82      9769
   macro avg       0.76      0.75      0.76      9769
weighted avg       0.82      0.82      0.82      9769



#Age Classification from images - CNN

##Loading and preparation of data

In [25]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import glob
import os
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.image as mpimg

from sklearn import metrics
from sklearn.metrics import confusion_matrix


In [26]:
# Setup the path to the images and the attrbutes
PATH_TO_ATTR = '/content/faces_data/train.csv'
PATH_TO_IMGS = '/content/faces_data/images'


# Then we check if there are the same amount of attributes and images
data = pd.read_csv(PATH_TO_ATTR)
images = os.listdir(PATH_TO_IMGS)

print(data.shape[0])
print(len(images))


FileNotFoundError: [Errno 2] No such file or directory: '/content/faces_data/train.csv'

In [None]:
w

In [None]:
# We can see that there are more attributes than images so we will remove the attributes we dont need
data.sort_values(by="ID")
data = data.head(len(images))

print(data.shape[0])
print(len(images))

In [None]:
# Now we will check how many images with each attriute there is
data['Class'].unique()

In [None]:
# We check the different values for the Class
print(data['Class'].unique())

# Replace the Class values with a number
data['Class'].replace(['YOUNG','MIDDLE','OLD'],[0,1,2],inplace=True)
data.sample(frac=1);

# We wawdnt to make sure that the order of image classes is random
data.head()

In [None]:
img = mpimg.imread('/content/faces_data/images/1.jpg')
imgplot = plt.imshow(img)
plt.show()

In [None]:
# Function to read the image an reformat it so all images are the same size and type
def readAndFormatImage(path):
      img = tf.io.read_file(path)
      img = tf.image.decode_jpeg(img, channels=3)
      img = tf.image.convert_image_dtype(img, dtype=tf.float32)
      img = tf.image.resize(img, (150, 150))
      return img

In [None]:
# function to load the data that includes images and respective labels
def load_data(image_path, label):
    img = readAndFormatImage(image_path)
    return (img, label)

In [None]:
# built the list of image paths and list of respective responses of the images
PATH = "/content/faces_data/images"
image_paths = []
for path in os.listdir(PATH):
    image_paths.append(PATH+"/"+path)
print(len(image_paths))

response_list = []

for i in image_paths:
    _,tail = os.path.split(i)
    data.loc
    response = data.loc[data['ID'] == tail]['Class'].values[0]
    response_list.append(response)
print(len(response_list))

In [None]:
# split the dataset into train and test dataset
train_size = int(0.9*(len(image_paths)))
print(train_size)
test_size = int(0.1*(len(image_paths)))

train_set = tf.data.Dataset.from_tensor_slices((image_paths[:train_size], response_list[:train_size]))
test_set = tf.data.Dataset.from_tensor_slices((image_paths[test_size:], response_list[test_size:]))

In [None]:
train_set = (train_set
    .map(load_data, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

test_set = (test_set
    .map(load_data, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

##Training and fine-tuning of a model

In [None]:
# build the layers of CNN model
from tensorflow.keras import layers,models

cnn_model = models.Sequential([
    layers.Conv2D(filters=64, kernel_size=3, activation='relu', input_shape=(150, 150, 3), padding = 'same'),
    layers.MaxPooling2D(pool_size=2),


    layers.Conv2D(filters=128, kernel_size=3, activation='relu', padding = 'same'),
    layers.Conv2D(filters=128, kernel_size=3, activation='relu', padding = 'same'),
    layers.MaxPooling2D(pool_size=2),

    layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding = 'same'),
    layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding = 'same'),
    layers.MaxPooling2D(pool_size=2),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')
])

In [None]:
# view the summary of the cnn model
cnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_5 (Conv2D)           (None, 150, 150, 64)      1792      
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 75, 75, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_6 (Conv2D)           (None, 75, 75, 128)       73856     
                                                                 
 conv2d_7 (Conv2D)           (None, 75, 75, 128)       147584    
                                                                 
 max_pooling2d_4 (MaxPoolin  (None, 37, 37, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_8 (Conv2D)           (None, 37, 37, 256)      

In [None]:
# compile the model
cnn_model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# train the model
cnn_model.fit(train_set, epochs=10, validation_data=test_set)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

##Evaluation of the model

In [None]:
cnn_model.evaluate(train_set)



[1.0613808631896973, 0.5069444179534912]

In [None]:
# test accuracy
cnn_model.evaluate(test_set)



[1.02229905128479, 0.5347222089767456]

In [None]:
test_pred = cnn_model.predict(test_set)



In [None]:
y_labels = [np.argmax(item) for item in test_pred]
print("Test Predictions response sample:",y_labels[:10])

test_response = response_list[test_size:]
print("Test True response sample:", test_response[:10])

Test Predictions response sample: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Test True response sample: [1, 0, 0, 0, 1, 0, 0, 1, 1, 0]
