In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import numpy as np
import scipy

# Confusion matrix of best model on our validation data
# Given code that can be copy-pasted
def score_weight_class(bmi_pred, bmi_true, low, high):
    """Penalty for one BMI class: normalised RMSE times the share of missed samples.

    Parameters
    ----------
    bmi_pred, bmi_true : arrays of predicted / true BMI values, compared
        element-wise and indexed with a boolean mask.
    low, high : bounds of the BMI class; a true value belongs to the class
        when low <= value < high.

    Returns
    -------
    0 when no true value falls in the class, otherwise
    rmse / (high - low + tol) * (1 - accuracy), so 0.0 means no penalty.
    """
    tol = 1  # predictions get one extra BMI point of slack on each side of the class

    in_class = (bmi_true >= low) & (bmi_true < high)
    n_in_class = in_class.sum()
    if n_in_class == 0:
        # no true samples here
        return 0

    predicted_in_class = (bmi_pred >= low - tol) & (bmi_pred < high + tol)

    # RMSE over the samples that truly belong to the class, normalised by the interval width
    residuals = bmi_true[in_class] - bmi_pred[in_class]
    rmse = np.sqrt((residuals ** 2).mean())
    rmse = rmse / (high - low + tol)

    # share of the class members whose prediction lands in the tolerated interval
    acc = (predicted_in_class & in_class).sum() / n_in_class
    return rmse * (1 - acc)

# 0.0 = perfect (no error). The score does not depend only on the predicted BMI class:
# if you are wrong and predict 100kg, that is better than being wrong and predicting 150kg,
# but if you predict the right class, it does not matter where you land inside that BMI class's range!
def score_regression(ytrue, ypred, height):
    """Mean of the per-class penalties over the four BMI classes.

    True and predicted values are both divided by height squared, then
    score_weight_class is evaluated on the intervals [0, 18.5), [18.5, 25),
    [25, 30) and [30, 100); the four results are averaged.
    """
    height_squared = height * height
    bmi_pred = ypred / height_squared
    bmi_true = ytrue / height_squared

    lower_bounds = [0, 18.5, 25, 30]
    upper_bounds = [18.5, 25, 30, 100]
    scores = [
        score_weight_class(bmi_pred, bmi_true, low=lower, high=upper)
        for lower, upper in zip(lower_bounds, upper_bounds)
    ]
    return np.mean(scores)

# Check freq of 4 classes
# array containing indices of the samples belonging to the different classes
def oversampling(df):
    """Duplicate the rows of the rarer BMI classes so they approach the size of the normal class.

    Rows are assigned to a class from their "bmi" value: low [0, 18.5),
    normal [18.5, 25), high [25, 30) and very high [30, 100). Each of the
    low / high / very-high classes ends up present ``n_normal // n_class``
    times in total (the original rows count as the first occurrence), which
    is the largest whole multiple that does not exceed the normal class size.
    The normal class itself is never duplicated.

    Parameters
    ----------
    df : pandas.DataFrame with a "bmi" column.

    Returns
    -------
    numpy.ndarray holding the original rows followed by the duplicated low,
    high and very-high rows (in that order), without the "bmi" column.
    """
    bmi = df["bmi"]
    class_bounds = [(0, 18.5), (18.5, 25), (25, 30), (30, 100)]
    # Boolean masks select rows whatever the index labels are; rows whose
    # bmi is NaN or outside [0, 100) fall in no class and are never duplicated.
    low_rows, normal_rows, high_rows, very_high_rows = (
        df[(bmi >= lower) & (bmi < upper)] for lower, upper in class_bounds
    )

    target = len(normal_rows)  # the normal class is used as the reference size

    pieces = [df]
    for rare_rows in (low_rows, high_rows, very_high_rows):
        if len(rare_rows) == 0:
            # nothing to duplicate (and avoids dividing by zero)
            continue
        # "- 1" because the samples are already present once in df
        extra_copies = max(target // len(rare_rows) - 1, 0)
        pieces.extend([rare_rows] * extra_copies)

    # single concat instead of growing the frame inside the loops
    oversampled = pd.concat(pieces, ignore_index=True)
    return oversampled.drop(["bmi"], axis=1).to_numpy()

# See correlation plot between feature and target
def print_data(X, Y):
    """Scatter-plot every column of X against Y, three subplots per row, and show the figure."""
    plt.figure(figsize=(10, 10), dpi=90)
    feature_names = X.columns
    n_rows = len(feature_names) // 3 + 1
    position = 1  # subplot slots are numbered from 1
    for feat in feature_names:
        plt.subplot(n_rows, 3, position)
        plt.scatter(X[feat], Y, s=10)
        plt.title(feat)
        position += 1
    plt.tight_layout()
    plt.show()

# See heatmap of correlation
def heatmap(df):
    """Show an annotated heatmap of df.corr(), with the colour scale clipped to [-0.4, 0.4] and centred on 0."""
    plt.figure(figsize=(10, 10))
    sns.heatmap(
        df.corr(),
        annot=True,
        cmap='PiYG',
        center=0,
        vmin=-0.4,
        vmax=0.4,
        linewidths=.5,
    )
    plt.show()

def display_all_data(X):
    """Print X with pandas' row and column display limits lifted for the duration of the call."""
    no_limits = ('display.max_rows', None, 'display.max_columns', None)
    with pd.option_context(*no_limits):
        print(X)

def KF_validation(X, Y, model, K, mse=2, height_col=2):
    """Average a model's score over K shuffled folds (KFold with random_state=1).

    Parameters
    ----------
    X : pandas.DataFrame of features that still contains a "bmi" column;
        that column is dropped before fitting.
    Y : pandas.DataFrame with the target and a "bmi" column; once "bmi" is
        dropped it is expected to hold a single target column, which is
        flattened to 1-D.
    model : estimator exposing fit / predict / score; it is refitted in
        place on every fold.
    K : number of folds.
    mse : which score to accumulate per fold:
        0 -> mean_absolute_error, 1 -> model.score, anything else (default 2)
        -> score_regression.
    height_col : position of the height column in the feature matrix once
        "bmi" has been dropped (only used by score_regression). Defaults to 2,
        the position the original code hard-coded.

    Returns
    -------
    The sum of the per-fold scores divided by K.
    """
    KF = KFold(n_splits=K, random_state=1, shuffle=True)

    # Convert once and slice the arrays: KFold yields row POSITIONS, so they
    # must not be used as index labels (which only works on a 0..n-1 index).
    features = X.drop(["bmi"], axis=1).to_numpy()
    # 1-D targets: dividing an (n, 1) column by the (n,) height vector in
    # score_regression would broadcast to an (n, n) matrix.
    targets = np.ravel(Y.drop(["bmi"], axis=1).to_numpy())

    total = 0
    for train_index, test_index in KF.split(features):
        X_train, X_test = features[train_index], features[test_index]
        Y_train, Y_test = targets[train_index], targets[test_index]

        model.fit(X_train, Y_train)
        if mse == 0:
            total += mean_absolute_error(Y_test, model.predict(X_test))
        elif mse == 1:
            total += model.score(X_test, Y_test)
        else:  # mse == 2
            predictions = np.ravel(model.predict(X_test))
            total += score_regression(Y_test, predictions, X_test[:, height_col])
    return total / K

In [None]:
# Read the CSV files: X1 / Y1 are used for training, X2 is what gets predicted
X1 = pd.read_csv("X1.csv").drop("Unnamed: 0", axis=1)  # "Unnamed: 0": presumably a saved row index - TODO confirm
Y1 = pd.read_csv("Y1.csv", header=None, names=['weight'])
X2 = pd.read_csv("X2.csv").drop("Unnamed: 0", axis=1)

#   Data Engineering
# Step 1 : Re-encode : turn every categorical value into a number
# NOTE(review): LabelEncoder is believed to number classes in sorted order, so the order of
# the lists below would not decide the codes - confirm against the scikit-learn documentation.
le_gender = preprocessing.LabelEncoder().fit(["Male", "Female"])
le_yesno = preprocessing.LabelEncoder().fit(["no", "yes"])
le_freq = preprocessing.LabelEncoder().fit(["no", "Sometimes", "Frequently", "Always"])
le_transport = preprocessing.LabelEncoder().fit(["Bike", "Walking", "Public_Transportation", "Motorbike", "Automobile"])
for feature_name, encoder in {
    "Gender": le_gender,
    "family_history_with_overweight": le_yesno,
    "FAVC": le_yesno,
    "CAEC": le_freq,
    "SMOKE": le_yesno,
    "SCC": le_yesno,
    "CALC": le_freq,
    "MTRANS": le_transport,
}.items():
    # encode, then store the codes as floats like the other numeric features
    X1[feature_name] = encoder.transform(X1[feature_name]).astype("float")
    X2[feature_name] = encoder.transform(X2[feature_name]).astype("float")

# Drop the features judged not relevant
dropped_features = ["FAVC", "FCVC", "NCP", "CAEC", "SMOKE", "SCC", "FAF", "TUE", "CALC"]
X1 = X1.drop(dropped_features, axis=1)
X2 = X2.drop(dropped_features, axis=1)

# Per-sample weights meant to give more impact to rarely seen data when fitting (not used).
# Every sample starts at 1 and gains 1 for each of: Age outside [15, 37),
# Height outside (1.55, 1.85), MTRANS code equal to 1 or 2.
coef_weight = [
    1
    + int(not (15 <= age_value < 37))
    + int(not (1.55 < height_value < 1.85))
    + int(transport_code == 1 or transport_code == 2)
    for age_value, height_value, transport_code in zip(X1["Age"], X1["Height"], X1["MTRANS"])
]

# Data Augmenting
# One big dataframe with all the data (incl. weight)
df = pd.concat([X1, Y1], axis=1)

# Outlier removal: keep only the rows where every column has |z-score| < 3.
# The original author reports this removed 6 samples (26, 90, 91, 124, 214, 241),
# leaving 244 rows.
df = df[(np.abs(scipy.stats.zscore(df)) < 3).all(axis=1)]

# Class imbalance: compute the BMI (weight / height^2) of every remaining person.
# Computed column-wise on the surviving rows, so neither the number of rows nor
# the list of removed outliers has to be hard-coded.
df = df.assign(bmi=df["weight"] / (df["Height"] * df["Height"]))

# Drop incomplete rows and renumber the rows 0..n-1 so labels match positions.
df = df.dropna().reset_index(drop=True)

Y1_np = df[["weight", "bmi"]]
X1_np = df.drop(["weight"], axis=1)

# print(KF_validation(X1_np, Y1_np, LinearRegression(), 10))
# print(KF_validation(X1_np, Y1_np, KNeighborsRegressor(n_neighbors=2), 10))
# print(KF_validation(X1_np, Y1_np, MLPRegressor(max_iter=1000), 10))
# print(KF_validation(X1_np, Y1_np, RandomForestRegressor(max_depth=2, random_state=1), 10))

# print_data(df.drop("weight", axis=1), df["weight"])
# heatmap(df)

In [None]:
# Model
# K-Folding inputs: the target frame keeps "bmi" next to "weight",
# the feature frame is everything except "weight"
Y1_np = df[["weight", "bmi"]]
X1_np = df.drop(columns=["weight"])

#===============================================================================
##### Custom Parameters Search #####

### no grid search for linear model
### best score : 0.12866988784982425
# KF_LR = KF_validation(X1_np, Y1_np, LinearRegression(), 10)
# print("LinearRegressor without parameters search : "+ str(KF_LR))  # 0.12866988784982425

### MLP parameter optimization
### best score : 0.15898470190027653 with the best params : {'activation': 'identity', 'solver': 'adam', 'alpha': 1e-07, 'tol': 0.0001}
# best_score = 1000
# best_params = {}
# for activation in ["relu", "identity", "logistic", "tanh"]:
#     print(activation)
#     for solver in ["adam"]:  # "lbfgs" , "sgd", "adam"
#         for alpha in [ 0.00001, 0.000001, 0.0000001]:
#             for tol in [1e-3, 1e-4, 1e-5]:
#                 model = MLPRegressor(max_iter=800, activation=activation, solver=solver, alpha=alpha, tol=tol)
#                 score = KF_validation(X1_np, Y1_np, model, 10)
#                 if best_score > score:
#                     best_score = score
#                     best_params["activation"] = activation
#                     best_params["solver"] = solver
#                     best_params["alpha"] = alpha
#                     best_params["tol"] = tol
# print("best score : " + str(best_score) + " avec les best params : " + str(best_params))
# KF_MLP = KF_validation(X1_np, Y1_np, MLPRegressor(max_iter=1000), 10)
# print("MLP without parameters search : " + str(KF_MLP))  # 0.20188709269241106

### KNN parameter optimization: exhaustive search, lower score is better
### best score : 0.124841313131318 with the best params : {'weights': 'uniform', 'algorithm': 'kd_tree', 'p': 1, 'leaf_size': 10, 'n_neighbors': 15}
best_score = 1000
best_params = {}
neighbor_counts = list(range(1, 16)) + [20, 25]
for weights in ("uniform", "distance"):
    for algorithm in ("ball_tree", "kd_tree"):
        for p in (1, 2):
            for leaf_size in range(10, 51, 10):
                for n_neighbors in neighbor_counts:
                    model = KNeighborsRegressor(
                        weights=weights,
                        algorithm=algorithm,
                        p=p,
                        leaf_size=leaf_size,
                        n_neighbors=n_neighbors,
                    )
                    score = KF_validation(X1_np, Y1_np, model, 10)
                    if score < best_score:
                        # remember the combination that gave the lowest score so far
                        best_score = score
                        best_params.update(
                            weights=weights,
                            algorithm=algorithm,
                            p=p,
                            leaf_size=leaf_size,
                            n_neighbors=n_neighbors,
                        )
print("best score : " + str(best_score) + " avec les best params : " + str(best_params))

# Reference point: a KNN with n_neighbors=2 and no search
KF_KNN = KF_validation(X1_np, Y1_np, KNeighborsRegressor(n_neighbors=2), 10)
print("KNN without parameters search : "+ str(KF_KNN))  # 0.19132789047012183

### Random Forest parameter optimization
### best score : 0.1659561008369817 with the best params : {'n_estimators': 50, 'criterion': 'squared_error', 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 8}
### OLD best score : 0.030549093125539557 with the best params : {'n_estimators': 50, 'criterion': 'poisson', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1}
# best_score = 1000
# best_params = {}
# for n_estimators in [50, 100, 150]:
#     for criterion in ["squared_error" , "absolute_error", "poisson"]:
#         for max_depth in [None, 5, 10, 15, 20]:
#             for min_samples_split in [2, 4, 6, 8]:
#                 for min_samples_leaf in [2, 5, 8]:
#                     model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
#                     score = KF_validation(X1_np, Y1_np, model, 10)
#                     if best_score > score:
#                         best_score = score
#                         best_params["n_estimators"] = n_estimators
#                         best_params["criterion"] = criterion
#                         best_params["max_depth"] = max_depth
#                         best_params["min_samples_split"] = min_samples_split
#                         best_params["min_samples_leaf"] = min_samples_leaf
# print("best score : " + str(best_score) + " avec les best params : " + str(best_params))
# KF_RF = KF_validation(X1_np, Y1_np, RandomForestRegressor(max_depth=2, random_state=1), 10)
# print("RandomForest without parameters search : "+ str(KF_RF))  # 0.24233884509903833

best score : 0.124841313131318 avec les best params : {'weights': 'uniform', 'algorithm': 'kd_tree', 'p': 1, 'leaf_size': 10, 'n_neighbors': 15}
KNN without parameters search : 0.19132789047012183


In [None]:
# Confusion matrix
def conf_matrix_printer(bmi_pred, bmi_true, low, high):
    """Build the 2x2 "belongs to this BMI class" confusion matrix.

    Parameters
    ----------
    bmi_pred, bmi_true : arrays of predicted / true BMI values.
    low, high : bounds of the BMI class. A true value is in the class when
        low <= value < high; a prediction counts as in the class when
        low - 1 <= value < high + 1.

    Returns
    -------
    The matrix returned by confusion_matrix(true membership, predicted
    membership) with the label order fixed to [False, True]. Despite the
    function's name, nothing is printed.
    """
    tol = 1
    vpred = (bmi_pred >= low - tol) & (bmi_pred < high + tol)
    vtrue = (bmi_true >= low) & (bmi_true < high)

    # Fix the labels so the result is always 2x2: without them a fold where
    # only one of the two values occurs would yield a 1x1 matrix, which
    # would then be broadcast into every cell when the folds are summed.
    return confusion_matrix(vtrue, vpred, labels=[False, True])

def conf(ytrue, ypred, height):
    """Return one confusion matrix per BMI class, as a list of four.

    True and predicted values are divided by height squared, then
    conf_matrix_printer is called for the intervals [0, 18.5), [18.5, 25),
    [25, 30) and [30, 100), in that order.
    """
    height_squared = height * height
    bmi_pred = ypred / height_squared
    bmi_true = ytrue / height_squared

    class_bounds = zip([0, 18.5, 25, 30], [18.5, 25, 30, 100])
    return [
        conf_matrix_printer(bmi_pred, bmi_true, low=lower, high=upper)
        for lower, upper in class_bounds
    ]

def KF_confusion(X, Y, model, K, mse=2, height_col=2):
    """Plot, for each BMI class, the confusion matrix summed over K shuffled folds.

    Parameters
    ----------
    X : pandas.DataFrame of features that still contains a "bmi" column;
        that column is dropped before fitting.
    Y : pandas.DataFrame with the target and a "bmi" column; once "bmi" is
        dropped it is expected to hold a single target column, which is
        flattened to 1-D.
    model : estimator exposing fit / predict; refitted in place on every fold.
    K : number of folds (KFold with shuffle=True, random_state=1).
    mse : unused; kept so existing calls keep working.
    height_col : position of the height column in the feature matrix once
        "bmi" has been dropped. Defaults to 2, the position the original
        code hard-coded.

    Returns
    -------
    None. Four figures are shown through plot_conf_matrix, in the order
    low, normal, high, very high.
    """
    KF = KFold(n_splits=K, random_state=1, shuffle=True)

    # Convert once and slice the arrays: KFold yields row POSITIONS, so they
    # must not be used as index labels (which only works on a 0..n-1 index).
    features = X.drop(["bmi"], axis=1).to_numpy()
    targets = np.ravel(Y.drop(["bmi"], axis=1).to_numpy())

    # one running 2x2 total per BMI class
    class_totals = [np.zeros((2, 2), dtype=int) for _ in range(4)]
    for train_index, test_index in KF.split(features):
        X_train, X_test = features[train_index], features[test_index]
        Y_train, Y_test = targets[train_index], targets[test_index]

        model.fit(X_train, Y_train)
        # conf returns 4 matrices for this fold (1 per BMI class)
        fold_matrices = conf(Y_test, model.predict(X_test), X_test[:, height_col])

        # the per-fold matrices are summed (not averaged) into the totals
        for class_position, fold_matrix in enumerate(fold_matrices):
            class_totals[class_position] = np.add(class_totals[class_position], fold_matrix)

    for class_total in class_totals:
        plot_conf_matrix(class_total)

    return

def plot_conf_matrix(matrix):
    """Draw a 2x2 matrix as coloured cells with each value written at its centre, then show it."""
    fig, ax = plt.subplots()
    # see https://matplotlib.org/stable/tutorials/colors/colormaps.html for the available colormaps (an incredible find)
    ax.matshow(matrix, cmap='tab20b')
    for row in range(2):
        for col in range(2):
            # text coordinates are (x, y), i.e. (column, row)
            ax.text(col, row, matrix[row, col], ha="center", va="center")
    fig.tight_layout()
    plt.show()

# Show the four per-class confusion matrices of a linear regression over 10 folds
KF_confusion(X1_np, Y1_np, model=LinearRegression(), K=10)


In [None]:
def bar_plot(tab1, tab2, labels):
    """Show a grouped bar chart with two bars per label.

    tab1 is drawn as the 'Non-optimized' series and tab2 as the 'Optimized'
    series; labels provides the x tick labels and fixes the number of groups.
    """
    width = 0.35  # the width of the bars
    positions = np.arange(len(labels))

    fig, ax = plt.subplots()
    # each series is shifted half a bar width to one side of the tick
    series = [
        (-width / 2, tab1, 'Non-optimized', 'forestgreen'),
        (width / 2, tab2, 'Optimized', 'darkviolet'),
    ]
    for offset, values, legend_name, colour in series:
        ax.bar(positions + offset, values, width, label=legend_name, color=colour)

    ax.set_ylabel('Evaluation')
    ax.set_title('Evaluation by optimization and model')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.legend()
    fig.tight_layout()
    plt.show()

# Bar plot of the scores obtained once preprocessing is done, one entry per model
labels = ["LinearReg", "MLP", "KNN", "RandomForest"]  # names of the models
nopti = [  # values for the non-optimized models
    0.10152731927116092,
    0.169157591378042,
    0.19132789047012183,
    0.1711198064639973,
]
opti = [  # values for the optimized models
    0.10152731927116092,
    0.15898470190027653,
    0.124841313131318,
    0.1659561008369817,
]
bar_plot(tab1=nopti, tab2=opti, labels=labels)

In [None]:
# Prediction on Y2
# Expected file format: one line per prediction, no header, and no quotation marks around the numbers.
# The last line of the file is one additional number: the estimated performance of the model
# on the unseen data.

Y1 = df.loc[:]["weight"]
X1 = df.drop(["weight", "bmi"], axis=1)

estimate = 0.11
Y2_LR = LinearRegression().fit(X1, Y1).predict(X2)
# Round the predictions BEFORE appending the estimate: rounding afterwards
# also rounded the estimate itself to one decimal (0.11 was written as 0.1).
Y2_LR = np.around(Y2_LR, 1)
Y2_LR = np.append(Y2_LR, estimate)
# Keep the written frame in `prediction`; to_csv returns None when given a path.
prediction = pd.DataFrame(Y2_LR)
prediction.to_csv('Y2.csv', index=False, header=False)

# estimate = 0
# Y2_MLP = MLPRegressor(random_state=1, max_iter=1000).fit(X1, Y1).predict(X2)
# Y2_MLP = np.append(Y2_MLP, estimate)
# Y2_MLP = np.around(Y2_MLP, 1)
# prediction = pd.DataFrame(Y2_MLP).to_csv('Y2.csv', index=False, header=False)

# estimate = 0
# Y2_KNN = KNeighborsRegressor(n_neighbors=10).fit(X1, Y1).predict(X2)
# Y2_KNN = np.append(Y2_KNN, estimate)
# Y2_KNN = np.around(Y2_KNN, 1)
# prediction = pd.DataFrame(Y2_KNN).to_csv('Y2.csv', index=False, header=False)

# estimate = 0
# Y2_RF = RandomForestRegressor(max_depth=2, random_state=1).fit(X1, Y1).predict(X2)
# Y2_RF = np.append(Y2_RF, estimate)
# Y2_RF = np.around(Y2_RF, 1)
# prediction = pd.DataFrame(Y2_RF).to_csv('Y2.csv', index=False, header=False)