In [52]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import numpy as np
import scipy
X1 = pd.read_csv("X1.csv")
Y1 = pd.read_csv("Y1.csv", header=None, names=['weight'] )
X2 = pd.read_csv("X2.csv")

X1 = X1.drop("Unnamed: 0",axis=1)
X2 = X2.drop("Unnamed: 0",axis=1)

# Confusion matrix of best model on our validation data
# Given code that can be copy-pasted
def score_weight_class(bmi_pred, bmi_true, low, high):
    tol=1
    vpred = (bmi_pred >= low-tol) & (bmi_pred < high+tol)
    vtrue = (bmi_true >= low) & (bmi_true < high)
    if vtrue.sum() == 0:
        print("no true samples here")
        return 0
    rmse = np.sqrt(((bmi_true[vtrue] - bmi_pred[vtrue]) ** 2).mean())
    rmse = rmse/(high-low+tol)   # normalize rmse in interval
    acc = (vpred & vtrue).sum()/vtrue.sum()
    return rmse*(1-acc)

def score_regression(ytrue, ypred, height):
    bmi_pred = ypred/(height*height)
    bmi_true = ytrue/(height*height)
    
    scores = []
    for bmi_low, bmi_high in zip([0, 18.5, 25, 30], [18.5, 25, 30, 100]):
        scores.append(score_weight_class(bmi_pred, bmi_true, low=bmi_low, high=bmi_high))
    return np.mean(scores)

# # Display all data
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # print all data
#     print(X2)

# I. Faire un ptit truc pour transformer le Y de poids vers les classes de poids
# II. Faire qu'on prédise les classes et pas le poids.
# III. Tenter d'augmenter le dataset avec class imbalance (en copiant les samples des classes sous-rpz ? / autre)
# IV. Grid Search

In [53]:
#   Data Engineering
# Step 1 : Re-encode : Make all values in integer
# Gender : {0: Male, 1: Female}
# family_history_with_overweight : {0 : no, 1 : yes}
# FAVC : {0 : no, 1 : yes}
# CAEC : {0 : no, 1 : Sometimes, 2 : Frequently, 3 : Always}
# SMOKE : {0 : no, 1 : yes}
# SCC : {0 : no, 1 : yes}
# CALC : {0 : no, 1 : Sometimes, 2 : Frequently, 3 : Always}
# MTRANS {0 : "Bike", 1 : "Walking", 2 : "Public_Transportation", 3 : "Motorbike", 4 : "Automobile"}

le_gender = preprocessing.LabelEncoder().fit(["Male", "Female"])
le_yesno = preprocessing.LabelEncoder().fit(["no", "yes"])
le_freq = preprocessing.LabelEncoder().fit(["no", "Sometimes", "Frequently", "Always"]) 
le_transport = preprocessing.LabelEncoder().fit(["Bike", "Walking", "Public_Transportation", "Motorbike", "Automobile"])
for categorical_feature, label_encoder in [("Gender", le_gender), ("family_history_with_overweight", le_yesno), ("FAVC", le_yesno), ("CAEC", le_freq), ("SMOKE", le_yesno), ("SCC", le_yesno), ("CALC", le_freq), ("MTRANS", le_transport)]:
    X1[categorical_feature] = label_encoder.transform(X1[categorical_feature])
    X2[categorical_feature] = label_encoder.transform(X2[categorical_feature])
    
    X1[categorical_feature] = X1[categorical_feature].astype("float")
    X2[categorical_feature] = X2[categorical_feature].astype("float")

for i in ["FAVC", "FCVC", "NCP", "CAEC", "SMOKE", "SCC", "FAF", "TUE", "CALC"]:
    X1 = X1.drop(i,axis=1)
    
coef_weight = []
for i in range(len(X1["Age"])):
    weight_i = 1
    if (15 <= X1["Age"][i] < 37):
        weight_i += 1
    if (1.55 < X1["Height"][i] < 1.85):
        weight_i += 1
    if (X1["MTRANS"][i] == 1 or X1["MTRANS"][i] == 2):
        weight_i += 1
    coef_weight.append(weight_i)

# Data Augmenting laissé pour le moment.
# data_augmenting = []
# i = 0
# temp = pd.dataframe()
# while i < range(len(X1["Age"])):
#     for feat in ["Gender", "Age", "Height", "family_history_with_overweight", "CH2O", "MTRANS", "weight"]:
        
df = pd.concat([X1, Y1], axis=1)
# df = df[(np.abs(scipy.stats.zscore(df)) < 3).all(axis=1)]

# See correlation plot between feature and target
def print_data(X, Y):    
    fig=plt.figure(figsize=(10, 10), dpi=90)
    n_feats = len(X.columns)
    for i, feat in enumerate(X.columns):
        plt.subplot(n_feats//3+1,3,i+1)
        plt.scatter(X[feat],Y , s=10)
        plt.title(feat)
    plt.tight_layout()
    plt.show()
# print_data(df.drop("weight", axis=1), df["weight"])
    
# See heatmap of correlation
def heatmap(df):
    plt.figure(figsize=(20,20))
    cor = df.corr()
    sns.heatmap(cor, annot=True, cmap='PiYG', center=0, vmin = -0.4, vmax = 0.4) #cmap=plt.cm.Reds
    plt.show()
# heatmap(df)

In [54]:
#    Model  
# Feature selection and model selection (statistical tests seen during other courses allowed too).
# Explore the metaparameters space according to the time available.

# K-Folding
X1_np = df.drop("weight", axis=1).to_numpy()
Y1_np = df["weight"].to_numpy()

def KF_validation(X, Y, model, K, mse=2):    
    KF = KFold(n_splits=K, random_state=1, shuffle=True)
    SUM = 0
    ite = 0
    for train_index, test_index in KF.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        #weight_ite = coef_weight[0:ite*25].append(coef_weight[(ite+1)*25:249])
        model.fit(X_train, Y_train) #,sample_weight=weight_ite)
        if mse == 0:
            SUM += mean_absolute_error(Y_test, model.predict(X_test))
        elif mse == 1:
            SUM += model.score(X_test, Y_test)
        else: # mse==2
            SUM += score_regression(Y_test, model.predict(X_test), X_test[:,2])
        ite+=1
    return SUM/K

KF_LR = KF_validation(X1_np, Y1_np, LinearRegression(), 10)

KF_MLP = KF_validation(X1_np, Y1_np, MLPRegressor(max_iter=1000), 10)

KF_KNN = KF_validation(X1_np, Y1_np, KNeighborsRegressor(n_neighbors=1), 10)  # Remis à 1 car c'est KNN 1 qu'il faut faire ?

KF_RF = KF_validation(X1_np, Y1_np, RandomForestRegressor(max_depth=2, random_state=1), 10)

print(KF_LR, KF_MLP, KF_KNN, KF_RF)
#0.34565997921242675 0.2388756036283703 0.1577997944915723 0.27733036766295954

# Model fitting
# reg = LinearRegression().fit(X1, Y1)
# Y2_LR = reg.predict(X2)

# regr = MLPRegressor(random_state=1, max_iter=1000).fit(X1, Y1)
# Y2_MLP = regr.predict(X2)

# neigh = KNeighborsRegressor(n_neighbors=10).fit(X1, Y1)
# Y2_KNN = neigh.predict(X2)

# rand_forest = RandomForestRegressor(max_depth=2, random_state=1).fit(X1, Y1)
# Y2_RF = rand_forest.predict(X2)

no true samples here
no true samples here
no true samples here
no true samples here
no true samples here
no true samples here
no true samples here
no true samples here
no true samples here
no true samples here
no true samples here
no true samples here
0.13666589917486957 0.18616839715348907 0.20056301766097517 0.16360822275926076


In [39]:
#    Prediction
# one line per prediction and no header, no quotation marks around your numbers either. 
# At the tail end of this file you will add an additional number which is the estimated performance of your model 
# on the unseen data.

estimate = 0
Y2_LR = np.append(Y2_LR, estimate)
Y2_LR = np.around(Y2_LR, 1)
prediction = pd.DataFrame(Y2_LR).to_csv('Y2.csv', index=False, header=False)

estimate = 0
Y2_MLP = np.append(Y2_MLP, estimate)
Y2_MLP = np.around(Y2_MLP, 1)
prediction = pd.DataFrame(Y2_MLP).to_csv('Y2.csv', index=False, header=False)

estimate = 0
Y2_KNN = np.append(Y2_KNN, estimate)
Y2_KNN = np.around(Y2_KNN, 1)
prediction = pd.DataFrame(Y2_KNN).to_csv('Y2.csv', index=False, header=False)

estimate = 0
Y2_RF = np.append(Y2_RF, estimate)
Y2_RF = np.around(Y2_RF, 1)
prediction = pd.DataFrame(Y2_RF).to_csv('Y2.csv', index=False, header=False)

[62.8 58.3 50.7 70.9 63.3 76.6 77.9 75.5 46.  70. ]
[62.2 59.  49.8 70.6 67.8 84.5 72.9 73.1 52.1 71.5]
[68.6 69.4 62.9 77.8 61.7 65.1 77.4 65.3 56.  64.6]
