In [62]:
#import stuff
import numpy as np
import sklearn as sk
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [63]:
#import cleaned up datasets
protein_df = pd.read_csv("data/data_clean/protein_data_clean.csv")
kg_df = pd.read_csv("data/data_clean/kg_data_clean.csv")
kcal_df = pd.read_csv("data/data_clean/kcal_data_clean.csv")
fat_df = pd.read_csv("data/data_clean/fat_data_clean.csv")

In [64]:
#To aid regression, drop features that probably highly correlate with other features
#We have absolutely no need for obesity and undernourishment
protein_df = protein_df.drop(['Obesity','Undernourished'],axis=1)
kg_df = kg_df.drop(['Obesity','Undernourished'],axis=1)
kcal_df = kcal_df.drop(['Obesity','Undernourished'],axis=1)
fat_df = fat_df.drop(['Obesity','Undernourished'],axis=1)
#protein_df = protein_df.drop(['Animal fats_p','Aquatic Products, Other_p', 'Sugar Crops_p', 'Vegetal Products_p', 'Vegetable Oils_p', 'Miscellaneous_p'],axis=1)
#kg_df = kg_df.drop(['Obesity','Animal fats_kg','Aquatic Products, Other_kg', 'Sugar Crops_kg', 'Vegetal Products_kg', 'Vegetable Oils_kg', 'Miscellaneous_kg'],axis=1)
#kcal_df = kcal_df.drop(['Obesity','Animal fats_kcal','Aquatic Products, Other_kcal', 'Sugar Crops_kcal', 'Vegetal Products_kcal', 'Vegetable Oils_kcal', 'Miscellaneous_kcal'],axis=1)
#fat_df = fat_df.drop(['Obesity','Animal fats_f','Aquatic Products, Other_f', 'Sugar Crops_f', 'Vegetal Products_f', 'Vegetable Oils_f', 'Miscellaneous_f'],axis=1)

<h3>
I want to figure out what the lambda best value to penalize our model is for LASSO, for each dataset set.
I will also print out the weight vector, so we can determine what features LASSO determines are actually important.
</h3>

In [65]:
#gonna try different scoring metrics
def loop_lasso_r2(X_train, y_train, x):
    best_a = 0.001
    best_score = -np.inf
    test_a_values = np.arange(0.001,0.1,0.001)
    for a in range(0,len(test_a_values)):
        Lreg = linear_model.Lasso(alpha = test_a_values[a], tol=0.1)
        Lreg.fit(X_train, y_train)
        score = cross_val_score(Lreg, X_train, y_train, cv=5, scoring='r2')
        if(np.average(score)>best_score):
            best_score=np.average(score)
            best_a = test_a_values[a]
    print("Best score :",best_score)
    print("Best Alpha :",best_a)
    #Get the best model's coefs
    Lreg = linear_model.Lasso(alpha = best_a)
    Lreg.fit(X_train, y_train)
    print("Weight coefficients:")
    for c in range(1,len(Lreg.coef_)):
        if(Lreg.coef_[c]>0.0):
            print(x.columns[c],":",Lreg.coef_[c])
    print('\n')

In [66]:
#gonna try different scoring metrics
def loop_lasso_mse(X_train, y_train, x):
    best_a = 0.001
    best_score = np.inf
    test_a_values = np.arange(0.001,0.1,0.001)
    for a in range(0,len(test_a_values)):
        Lreg = linear_model.Lasso(alpha = test_a_values[a], tol=0.1)
        Lreg.fit(X_train, y_train)
        score = cross_val_score(Lreg, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        if(np.average(score)*-1<best_score):
            best_score=np.average(score)*-1
            best_a = test_a_values[a]
    print("Best score :",best_score)
    print("Best Alpha :",best_a)
    #Get the best model's coefs
    Lreg = linear_model.Lasso(alpha = best_a)
    Lreg.fit(X_train, y_train)
    print("Weight coefficients:")
    for c in range(1,len(Lreg.coef_)):
        if(Lreg.coef_[c]>0.0):
            print(x.columns[c],":",Lreg.coef_[c])
    print('\n')

In [74]:
def loop_lasso_rmse(X_train, y_train, x):
    best_a = 0.001
    best_score = np.inf
    best_std = -1
    
    test_a_values = np.arange(0.001,0.1,0.001)
    for a in range(0,len(test_a_values)):
        Lreg = linear_model.Lasso(alpha = test_a_values[a], tol=0.1)
        Lreg.fit(X_train, y_train)
        score = cross_val_score(Lreg, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
        print(score)
        if(np.average(score)*-1<best_score):
            best_score=np.average(score)*-1
            best_a = test_a_values[a]
    print("Best score :",best_score)
    print("Best Alpha :",best_a)
    #Get the best model's coefs
    Lreg = linear_model.Lasso(alpha = best_a)
    Lreg.fit(X_train, y_train)
    print("Weight coefficients:")
    for c in range(1,len(Lreg.coef_)):
        if(Lreg.coef_[c]>0.0):
            print(x.columns[c],":",Lreg.coef_[c])
    print('\n')

In [68]:
def LASSO(df):
    df_x = df.drop(['Country', 'Population','Active','Recovered','Deaths','Confirmed'],axis=1)
    #to help regression, drop some features that are probably highly correlated
    #scale the data
    x = MinMaxScaler().fit_transform(df_x)

    #confirmed
    print("Confirmed:")
    confirmed_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == "Confirmed"):
            confirmed_index = c
    y = df.iloc[:,confirmed_index].values
    print("R2")
    loop_lasso_r2(x,y, df_x)
    print("RMSE")
    loop_lasso_rmse(x,y, df_x)
    print("MSE")
    loop_lasso_mse(x,y, df_x)
    
    #deaths
    print("Deaths:")
    deaths_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == "Deaths"):
            deaths_index = c
    y = df.iloc[:,deaths_index].values*100
    print("R2")
    loop_lasso_r2(x,y, df_x)
    print("RMSE")
    loop_lasso_rmse(x,y, df_x)
    print("MSE")
    loop_lasso_mse(x,y, df_x)
    
    #recovered
    print("Recovered:")
    recovered_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == "Recovered"):
            recovered_index = c
    y = df.iloc[:,recovered_index].values
    print("R2")
    loop_lasso_r2(x,y, df_x)
    print("RMSE")
    loop_lasso_rmse(x,y, df_x)
    print("MSE")
    loop_lasso_mse(x,y, df_x)

In [75]:
LASSO(protein_df)

Confirmed:
R2
Best score : 0.45792768427629194
Best Alpha : 0.010000000000000002
Weight coefficients:
Eggs_p : 1.4027528262178401
Meat_p : 1.1027670417476345
Milk - Excluding Butter_p : 3.6578275219856207
Stimulants_p : 1.4511019306623654
Sugar & Sweeteners_p : 0.13655962623242618
Treenuts_p : 1.3961361162737818


RMSE
[-1.39689614 -1.8414276  -1.69819103 -2.10091668 -1.60715468]
[-1.33955439 -1.83004181 -1.70385222 -2.09945125 -1.60169654]
[-1.35189516 -1.82158074 -1.71029585 -2.0986131  -1.5988952 ]
[-1.34770396 -1.81418393 -1.71630062 -2.0971309  -1.59594784]
[-1.34442882 -1.80769191 -1.72369374 -2.0955267  -1.59458745]
[-1.34133932 -1.80133033 -1.7301343  -2.09461203 -1.5976669 ]
[-1.33784288 -1.79476666 -1.73700674 -2.09390081 -1.60330711]
[-1.33496788 -1.78999695 -1.73472472 -2.09372952 -1.61115283]
[-1.32450467 -1.7859093  -1.74297479 -2.09292809 -1.61763178]
[-1.32129235 -1.78076709 -1.75490636 -2.08628567 -1.61605303]
[-1.31834104 -1.77679679 -1.76699104 -2.08503566 -1.6210472

[-4.1662918  -3.24710404 -3.58433056 -4.08395057 -3.86977379]
[-4.16724202 -3.24690747 -3.58611655 -4.08290895 -3.87417504]
[-4.17809154 -3.24677407 -3.58794081 -4.08192694 -3.87864086]
[-4.17886656 -3.24670891 -3.58997477 -4.08075901 -3.88319428]
[-4.1705748  -3.24671983 -3.5922388  -4.0789111  -3.88797553]
[-4.17176907 -3.24681003 -3.5945183  -4.07691769 -3.89152564]
[-4.17294113 -3.24731965 -3.59671158 -4.07494849 -3.89474779]
[-4.17415136 -3.24755704 -3.59890338 -4.07300354 -3.89775406]
[-4.17539971 -3.24787385 -3.60114394 -4.07108287 -3.90023621]
[-4.17675491 -3.2482697  -3.603442   -4.06918652 -3.9024975 ]
[-4.17815372 -3.24874455 -3.60581229 -4.06731452 -3.90479622]
[-4.17953963 -3.24929949 -3.6082542  -4.0654669  -3.9071226 ]
[-4.18081328 -3.24993364 -3.61075462 -4.06364371 -3.90946748]
[-4.1819725  -3.25064658 -3.61332354 -4.06184496 -3.91184933]
[-4.18316852 -3.25143823 -3.6159614  -4.0601778  -3.9142681 ]
[-4.18440717 -3.25230507 -3.61866556 -4.05861587 -3.91672371]
[-4.1856

[-1.43521136 -1.78402569 -1.73365888 -2.03847689 -1.54895187]
[-1.43604313 -1.7856226  -1.7357537  -2.0397166  -1.54984578]
[-1.43688514 -1.78723664 -1.73786351 -2.04096664 -1.55075764]
[-1.43773735 -1.78886773 -1.73998823 -2.04222701 -1.55168743]
[-1.43859977 -1.79051585 -1.74212782 -2.04349768 -1.55263511]
Best score : 1.6449533958794695
Best Alpha : 0.033
Weight coefficients:
Eggs_p : 1.0210853807802827
Meat_p : 0.1393665563655803
Milk - Excluding Butter_p : 2.4896623602022894
Stimulants_p : 0.547455402028672


MSE
Best score : 2.7466184050343356
Best Alpha : 0.033
Weight coefficients:
Eggs_p : 1.0210853807802827
Meat_p : 0.1393665563655803
Milk - Excluding Butter_p : 2.4896623602022894
Stimulants_p : 0.547455402028672




In [70]:
LASSO(fat_df)

Confirmed:
R2
Best score : 0.4173231794778037
Best Alpha : 0.016
Weight coefficients:
Animal fats_f : 2.5590395042992435
Eggs_f : 0.7905416580102429
Milk - Excluding Butter_f : 1.0804630644256479
Stimulants_f : 3.270496211931997
Treenuts_f : 0.6317506377054136


RMSE
Best score : 1.7827436864828112
Best Alpha : 0.007
Weight coefficients:
Animal fats_f : 2.3848576769578305
Eggs_f : 0.931518225938917
Milk - Excluding Butter_f : 1.0106988290638603
Stimulants_f : 3.5000778951038205
Treenuts_f : 1.0318898677971675


MSE
Best score : 3.2373745184803995
Best Alpha : 0.007
Weight coefficients:
Animal fats_f : 2.3848576769578305
Eggs_f : 0.931518225938917
Milk - Excluding Butter_f : 1.0106988290638603
Stimulants_f : 3.5000778951038205
Treenuts_f : 1.0318898677971675


Deaths:
R2
Best score : 0.29509604165160563
Best Alpha : 0.046
Weight coefficients:
Animal fats_f : 6.859203023221479
Eggs_f : 2.149432824666097
Milk - Excluding Butter_f : 1.1328317199087565
Stimulants_f : 2.556563507174045


RMS

In [71]:
LASSO(kg_df)

Confirmed:
R2
Best score : 0.36981437686985263
Best Alpha : 0.011
Weight coefficients:
Animal fats_kg : 0.695364202089345
Eggs_kg : 0.8391490044209047
Milk - Excluding Butter_kg : 3.159571986608007
Stimulants_kg : 0.513783307482931
Sugar & Sweeteners_kg : 1.626177167163774
Treenuts_kg : 1.2822109259941117
Vegetables_kg : 0.2455663322248593


RMSE
Best score : 1.8370941105948926
Best Alpha : 0.010000000000000002
Weight coefficients:
Animal fats_kg : 0.6827690731905616
Eggs_kg : 0.8437262233037577
Milk - Excluding Butter_kg : 3.0892996848242293
Stimulants_kg : 0.5369614480020173
Sugar & Sweeteners_kg : 1.6559106609622452
Treenuts_kg : 1.3099565419546302
Vegetables_kg : 0.244838562514475


MSE
Best score : 3.410092224817679
Best Alpha : 0.010000000000000002
Weight coefficients:
Animal fats_kg : 0.6827690731905616
Eggs_kg : 0.8437262233037577
Milk - Excluding Butter_kg : 3.0892996848242293
Stimulants_kg : 0.5369614480020173
Sugar & Sweeteners_kg : 1.6559106609622452
Treenuts_kg : 1.3099565

In [72]:
LASSO(kcal_df)

Confirmed:
R2
Best score : 0.4513689874669574
Best Alpha : 0.014000000000000002
Weight coefficients:
Animal fats_kcal : 1.3421664298467444
Eggs_kcal : 1.20935697846046
Fruits - Excluding Wine_kcal : 0.2551125138060378
Milk - Excluding Butter_kcal : 1.9912362724150643
Stimulants_kcal : 3.7453082760314316
Sugar & Sweeteners_kcal : 0.7074807460042866
Treenuts_kcal : 1.4100752747986813
Vegetable Oils_kcal : 0.5809534203000907


RMSE
Best score : 1.7136492037415976
Best Alpha : 0.014000000000000002
Weight coefficients:
Animal fats_kcal : 1.3421664298467444
Eggs_kcal : 1.20935697846046
Fruits - Excluding Wine_kcal : 0.2551125138060378
Milk - Excluding Butter_kcal : 1.9912362724150643
Stimulants_kcal : 3.7453082760314316
Sugar & Sweeteners_kcal : 0.7074807460042866
Treenuts_kcal : 1.4100752747986813
Vegetable Oils_kcal : 0.5809534203000907


MSE
Best score : 2.985037469369159
Best Alpha : 0.014000000000000002
Weight coefficients:
Animal fats_kcal : 1.3421664298467444
Eggs_kcal : 1.20935697846

In [73]:
#stronger coefficients = more of an influence on the line of best fit