In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential 
from keras.layers import Dense
from keras.layers import Dense, Dropout
from keras.metrics import R2Score, CosineSimilarity

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

# Visual separator printed at the end of the setup cell.
print('-' * 25)

# Fraction of rows held out by each train_test_split call further down.
TEST_SIZE = 0.2


In [None]:
# Load the dataset and split it into train, validation and test partitions.
df = pd.read_csv('datasets/BMIDataSet.csv')

# The 'NObeyesdad' column is set to Weight / Height**2 and then exposed as 'label'.
df['NObeyesdad'] = df['Weight'] / df['Height'] ** 2
df = df.rename(columns={'NObeyesdad': 'label'})

print(df.dtypes)

# First split: hold out TEST_SIZE of all rows as the test set.
train, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=42)
# Second split: hold out TEST_SIZE of the remaining rows as the validation set.
train_df, val_df = train_test_split(train, test_size=TEST_SIZE, random_state=42)

# Separate the features from the float64 target for every partition.
train_x, train_y = train_df.drop(columns=['label']), train_df['label'].astype('float64')
val_x, val_y = val_df.drop(columns=['label']), val_df['label'].astype('float64')
test_x, test_y = test_df.drop(columns=['label']), test_df['label'].astype('float64')

In [None]:
def encoding(dataset):
    """Fit and apply the numeric scaler and the categorical encoder.

    Columns of dtype ``object`` are treated as categorical and replaced by
    ``OrdinalEncoder`` codes; every other column is transformed with
    ``StandardScaler``. Both column groups are determined before any column
    is rewritten.

    Note: ``dataset`` is modified in place and is also the object returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature frame to fit on and transform.

    Returns
    -------
    tuple
        ``(dataset, scaler, encoder)``: the transformed frame together with
        the fitted ``StandardScaler`` and ``OrdinalEncoder``.
    """
    categorical = dataset.select_dtypes(include='object').columns
    numeric = dataset.select_dtypes(exclude='object').columns

    # Fit first, then transform, so the fitted objects can be handed back.
    scaler = StandardScaler().fit(dataset[numeric])
    dataset[numeric] = scaler.transform(dataset[numeric])

    encoder = OrdinalEncoder().fit(dataset[categorical])
    dataset[categorical] = encoder.transform(dataset[categorical])

    return dataset, scaler, encoder

def fs(dataset, dataset_y):
    """Keep the features whose correlation with the target is at least 0.1 in magnitude.

    The selected column names are cached in ``objects/features_BMI.npy``.
    When that file already exists the correlation step is skipped and the
    cached names are used as they are.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature frame; expected to be numeric, since ``DataFrame.corr`` is
        called on it.
    dataset_y : pandas.Series
        Target values, aligned with ``dataset`` by index.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` restricted to the selected feature columns.
    """
    str_path = 'objects/features_BMI.npy'

    if not os.path.exists(str_path):
        # Correlate on a temporary frame so the caller's data never gains a
        # 'label' column as a side effect.
        correlation = dataset.assign(label=dataset_y).corr()['label']
        features = correlation[correlation.abs() >= 0.1].index.drop('label')
        print(correlation.sort_values(ascending=False))
        print(features)
        # np.save does not create missing parent directories.
        os.makedirs(os.path.dirname(str_path), exist_ok=True)
        np.save(str_path, np.asarray(features, dtype=object))
    # The names are stored as an object array, hence allow_pickle=True.
    # Only load cache files produced by this notebook: unpickling untrusted
    # files can execute arbitrary code.
    features = np.load(str_path, allow_pickle=True)
    # Index the frame that was passed in, not the notebook-level train_x.
    return dataset[features]

def one_hot_encoding(dataset):
    """One-hot encode every column that holds at most two distinct values.

    Columns with more than two distinct values are left untouched. The
    indicator columns are appended after the remaining columns, and the whole
    result is cast to ``float64``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose low-cardinality columns should be expanded.

    Returns
    -------
    pandas.DataFrame
        New ``float64`` frame with the indicator columns in place of the
        encoded ones.
    """
    binary_columns = [
        name for name in dataset.columns
        if len(dataset[name].unique()) <= 2
    ]
    # Skip get_dummies entirely when nothing qualifies.
    if binary_columns:
        dataset = pd.get_dummies(dataset, columns=binary_columns)
    return dataset.astype('float64')

# Fit the scaler and encoder on the training features, then keep the selected
# columns and one-hot encode them.
train_x, scaler, encoder = encoding(train_x)
train_x = one_hot_encoding(fs(train_x, train_y))

In [5]:
def preproc_test_val(dataset, scaler, encoder):
    """Apply already-fitted preprocessing to a validation or test frame.

    Non-object columns go through ``scaler.transform`` and object columns
    through ``encoder.transform``. The frame is then reduced to the feature
    names stored in ``objects/features_BMI.npy``, cast to float and passed to
    ``one_hot_encoding``.

    Note: the scaling/encoding step writes into ``dataset`` in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw feature frame (without the target column).
    scaler : fitted transformer applied to the non-object columns.
    encoder : fitted transformer applied to the object columns.

    Returns
    -------
    pandas.DataFrame
        The selected features as returned by ``one_hot_encoding``.
    """
    # Both column groups are resolved before any column changes dtype.
    categorical = dataset.select_dtypes(include='object').columns
    numeric = dataset.select_dtypes(exclude='object').columns

    dataset[numeric] = scaler.transform(dataset[numeric])
    dataset[categorical] = encoder.transform(dataset[categorical])

    feature_names = np.load('objects/features_BMI.npy', allow_pickle=True)
    selected = dataset[feature_names].astype(float)
    return one_hot_encoding(selected)

# Push the validation and test features through the transformers that were
# fitted on the training data.
val_x = preproc_test_val(dataset=val_x, scaler=scaler, encoder=encoder)
test_x = preproc_test_val(dataset=test_x, scaler=scaler, encoder=encoder)

In [None]:
# Seed NumPy and TensorFlow.
np.random.seed(42)
tf.random.set_seed(42)

# Hyper-parameters read by model_fn() and by the model.fit() calls.
EPOCHS = 1000
BATCH_SIZE = 256
DROPOUT = 0.1
# Used both as the loss and as a tracked metric when the model is compiled.
loss_str = 'mean_squared_error'

def model_fn():
    """Build and compile the fully connected regression network.

    Reads the notebook-level ``train_x`` (input width and row count),
    ``BATCH_SIZE``, ``DROPOUT`` and ``loss_str``. The network is
    128 -> 256 -> Dropout -> 128 -> 64 -> 1 with ReLU hidden activations and a
    linear output. It is compiled with Adam on an exponentially decaying
    learning rate that goes from 0.01 to 0.0001 over 200 epochs, in one
    staircase step per epoch.

    Returns
    -------
    The compiled ``Sequential`` model. Its summary is printed as a side effect.
    """
    model = Sequential()

    model.add(Dense(128, kernel_initializer='normal', input_dim=train_x.shape[1], activation='relu'))
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(DROPOUT))
    model.add(Dense(128, kernel_initializer='normal', activation='relu'))
    model.add(Dense(64, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='linear'))

    model.summary()

    initial_learning_rate = 0.01
    final_learning_rate = 0.0001
    decay_epochs = 200
    # Per-epoch factor such that initial * factor**decay_epochs == final.
    learning_rate_decay_factor = (final_learning_rate / initial_learning_rate) ** (1 / decay_epochs)
    # fit() also runs the last, partial batch, so one epoch is
    # ceil(rows / BATCH_SIZE) optimizer steps. Flooring made the rate decay
    # faster than once per epoch and produced decay_steps == 0 whenever there
    # were fewer rows than one batch.
    steps_per_epoch = max(1, int(np.ceil(train_x.shape[0] / BATCH_SIZE)))

    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate,
        decay_steps=steps_per_epoch,
        decay_rate=learning_rate_decay_factor,
        staircase=True)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
        loss=loss_str,
        # loss_str is repeated as a metric so it shows up in the history
        # under its own name next to the other metrics.
        metrics=[loss_str, 'mae', CosineSimilarity(axis=1), R2Score()])
    return model

model = model_fn()

# No `monitor` is passed, so the callback watches its default quantity.
# Training stops after 20 epochs without an improvement of at least 0.0001,
# and the best weights are restored.
early_stopping_cb = keras.callbacks.EarlyStopping(
    min_delta=0.0001,
    patience=20,
    restore_best_weights=True,
)

history = model.fit(
    train_x,
    train_y,
    validation_data=(val_x, val_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping_cb],
)

# Overwrite EPOCHS with the number of epochs that actually ran; the later
# retraining cell reuses this value as its epoch limit.
EPOCHS = len(history.history['loss'])


In [None]:
# List the metric series recorded by the fit() call above.
print(history.history.keys())

In [None]:
print(history.history.keys())

# (history key, legend label) for each panel, in row-major order.
panels = [
    ('mean_squared_error', 'mse'),
    ('mae', 'mae'),
    ('cosine_similarity', 'cosine_similarity'),
    ('r2_score', 'r2_score'),
]

fig, axs = plt.subplots(2, 2)
for ax, (metric, label) in zip(axs.flat, panels):
    # Validation curve first, training curve second, matching the legend order.
    ax.plot(history.history[f'val_{metric}'])
    ax.plot(history.history[metric])
    ax.legend([f'val_{label}', label])
    ax.set(title=label, xlabel='epoch')
# Keep titles and tick labels of neighbouring panels from overlapping.
fig.tight_layout()
# Ending on show() keeps the Legend repr out of the cell output.
plt.show()

In [None]:
# Score the model trained on train_df against the held-out test partition.
score = model.evaluate(test_x, test_y)
# Print only the first returned value (by Keras convention the loss, here MSE).
print(score[0])

In [11]:
# Rebuild the training data from `train`, i.e. the rows of train_df and val_df
# together, and take the test features again from test_df.
train_x, train_y = train.drop(columns=['label']), train['label'].astype(float)
test_x, test_y = test_df.drop(columns=['label']), test_df['label'].astype(float)

# The scaler and encoder fitted earlier are reused; nothing is re-fitted here.
train_x = preproc_test_val(dataset=train_x, scaler=scaler, encoder=encoder)
test_x = preproc_test_val(dataset=test_x, scaler=scaler, encoder=encoder)

In [None]:
# Fresh model, retrained on the enlarged training set. EPOCHS now holds the
# epoch count reached by the first run.
model = model_fn()

early_stopping_cb = keras.callbacks.EarlyStopping(
    min_delta=0.0001,
    patience=20,
    restore_best_weights=True,
)

# NOTE(review): val_x / val_y come from val_df, whose rows are part of `train`
# and therefore of train_x here, so the validation curves and early stopping
# are measured on rows the model is trained on. Confirm this is intended.
history = model.fit(
    train_x,
    train_y,
    validation_data=(val_x, val_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping_cb],
)

In [None]:
print(history.history.keys())

# (history key, legend label) for each panel, in row-major order.
panels = [
    ('mean_squared_error', 'mse'),
    ('mae', 'mae'),
    ('cosine_similarity', 'cosine_similarity'),
    ('r2_score', 'r2_score'),
]

fig, axs = plt.subplots(2, 2)
for ax, (metric, label) in zip(axs.flat, panels):
    # Validation curve first, training curve second, matching the legend order.
    ax.plot(history.history[f'val_{metric}'])
    ax.plot(history.history[metric])
    ax.legend([f'val_{label}', label])
    ax.set(title=label, xlabel='epoch')
# Keep titles and tick labels of neighbouring panels from overlapping.
fig.tight_layout()
# Ending on show() keeps the Legend repr out of the cell output.
plt.show()

In [None]:
# Score the retrained model on the test partition; the result is stored in
# `score` but not printed here.
score = model.evaluate(test_x, test_y)

In [None]:
# Write the preprocessed features next to their targets, one CSV per partition.
# Note: this rebinds train_df / test_df to the processed frames, so earlier
# cells that read test_df see different data if they are re-run afterwards.
train_df = pd.concat([train_x, train_y], axis=1)
train_df.to_csv('datasets/train_BMI.csv', index=False)
test_df = pd.concat([test_x, test_y], axis=1)
test_df.to_csv('datasets/test_BMI.csv', index=False)

# NOTE(review): the paths say "titanic" although this notebook trains on the
# BMI data set; presumably a copy-paste leftover. Confirm nothing loads these
# paths before renaming.
# NOTE(review): newer Keras releases reportedly require explicit file
# extensions for save()/save_weights(). Verify against the installed version.
model.save('models/titanic_model')
model.save_weights('models/titanic_weights')