In [5]:
#import stuff
import numpy as np
import sklearn as sk
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [6]:
#import cleaned up datasets
protein_df = pd.read_csv("data/data_clean/protein_data_clean.csv")
kg_df = pd.read_csv("data/data_clean/kg_data_clean.csv")
kcal_df = pd.read_csv("data/data_clean/kcal_data_clean.csv")
fat_df = pd.read_csv("data/data_clean/fat_data_clean.csv")

In [7]:
#To aid regression, drop features that probably highly correlate with other features
#We have absolutely no need for obesity and undernourishment
protein_df = protein_df.drop(['Obesity','Undernourished'],axis=1)
kg_df = kg_df.drop(['Obesity','Undernourished'],axis=1)
kcal_df = kcal_df.drop(['Obesity','Undernourished'],axis=1)
fat_df = fat_df.drop(['Obesity','Undernourished'],axis=1)
#protein_df = protein_df.drop(['Animal fats_p','Aquatic Products, Other_p', 'Sugar Crops_p', 'Vegetal Products_p', 'Vegetable Oils_p', 'Miscellaneous_p'],axis=1)
#kg_df = kg_df.drop(['Obesity','Animal fats_kg','Aquatic Products, Other_kg', 'Sugar Crops_kg', 'Vegetal Products_kg', 'Vegetable Oils_kg', 'Miscellaneous_kg'],axis=1)
#kcal_df = kcal_df.drop(['Obesity','Animal fats_kcal','Aquatic Products, Other_kcal', 'Sugar Crops_kcal', 'Vegetal Products_kcal', 'Vegetable Oils_kcal', 'Miscellaneous_kcal'],axis=1)
#fat_df = fat_df.drop(['Obesity','Animal fats_f','Aquatic Products, Other_f', 'Sugar Crops_f', 'Vegetal Products_f', 'Vegetable Oils_f', 'Miscellaneous_f'],axis=1)

<h3>
I want to figure out what the lambda best value to penalize our model is for LASSO, for each dataset set, for each scoring metric.
R2 will tell me if the model fits to the training data well, MSE and RMSE will tell me if it's overfitting. Also important to get the variance in the cross val scores of RMSE and MAE for that reason.
I will also print out the weight vector, so we can determine what features LASSO determines are actually important.
</h3>

In [18]:
#gonna try different scoring metrics
def loop_lasso_r2(X_train, y_train, x):
    best_a = 0.001
    best_score = -np.inf
    test_a_values = np.arange(0.001,0.1,0.001)
    for a in range(0,len(test_a_values)):
        Lreg = linear_model.Lasso(alpha = test_a_values[a], tol=0.1)
        Lreg.fit(X_train, y_train)
        score = cross_val_score(Lreg, X_train, y_train, cv=5, scoring='r2')
        if(np.average(score)>best_score):
            best_score=np.average(score)
            best_a = test_a_values[a]
    print("Best score :",best_score)
    print("Best Alpha :",best_a)
    #Get the best model's coefs
    Lreg = linear_model.Lasso(alpha = best_a)
    Lreg.fit(X_train, y_train)
    print("Weight coefficients:")
    for c in range(1,len(Lreg.coef_)):
        if(Lreg.coef_[c]>0.0):
            print(x.columns[c],":",Lreg.coef_[c])
    print('\n')

In [19]:
def loop_lasso_mse(X_train, y_train, x):
    best_a = 0.001
    best_score = np.inf
    best_var = -1
    test_a_values = np.arange(0.001,0.1,0.001)
    for a in range(0,len(test_a_values)):
        Lreg = linear_model.Lasso(alpha = test_a_values[a], tol=0.1)
        Lreg.fit(X_train, y_train)
        score = cross_val_score(Lreg, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        if(np.average(score)*-1<best_score):
            best_score=np.average(score)*-1
            best_a = test_a_values[a]
            best_var = np.var(score)
    print("Best score :",best_score)
    print("Best Alpha :",best_a)
    print("Best Variance :",best_var)
    #Get the best model's coefs
    Lreg = linear_model.Lasso(alpha = best_a)
    Lreg.fit(X_train, y_train)
    print("Weight coefficients:")
    for c in range(1,len(Lreg.coef_)):
        if(Lreg.coef_[c]>0.0):
            print(x.columns[c],":",Lreg.coef_[c])
    print('\n')

In [20]:
def loop_lasso_rmse(X_train, y_train, x):
    best_a = 0.001
    best_score = np.inf
    best_var = -1
    test_a_values = np.arange(0.001,0.1,0.001)
    for a in range(0,len(test_a_values)):
        Lreg = linear_model.Lasso(alpha = test_a_values[a], tol=0.1)
        Lreg.fit(X_train, y_train)
        score = cross_val_score(Lreg, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
        if(np.average(score)*-1<best_score):
            best_score=np.average(score)*-1
            best_a = test_a_values[a]
            best_var = np.var(score)
    print("Best score :",best_score)
    print("Best Alpha :",best_a)
    print("Best Variance :",best_var)
    #Get the best model's coefs
    Lreg = linear_model.Lasso(alpha = best_a)
    Lreg.fit(X_train, y_train)
    print("Weight coefficients:")
    for c in range(1,len(Lreg.coef_)):
        if(Lreg.coef_[c]>0.0):
            print(x.columns[c],":",Lreg.coef_[c])
    print('\n')

In [21]:
def LASSO(df):
    df_x = df.drop(['Country', 'Population','Active','Recovered','Deaths','Confirmed'],axis=1)
    #to help regression, drop some features that are probably highly correlated
    #scale the data
    x = MinMaxScaler().fit_transform(df_x)

    #confirmed
    print("Confirmed:")
    confirmed_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == "Confirmed"):
            confirmed_index = c
    y = df.iloc[:,confirmed_index].values
    print("R2")
    loop_lasso_r2(x,y, df_x)
    print("RMSE")
    loop_lasso_rmse(x,y, df_x)
    print("MSE")
    loop_lasso_mse(x,y, df_x)
    
    #deaths
    print("Deaths:")
    deaths_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == "Deaths"):
            deaths_index = c
    y = df.iloc[:,deaths_index].values*100
    print("R2")
    loop_lasso_r2(x,y, df_x)
    print("RMSE")
    loop_lasso_rmse(x,y, df_x)
    print("MSE")
    loop_lasso_mse(x,y, df_x)
    
    #recovered
    print("Recovered:")
    recovered_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == "Recovered"):
            recovered_index = c
    y = df.iloc[:,recovered_index].values
    print("R2")
    loop_lasso_r2(x,y, df_x)
    print("RMSE")
    loop_lasso_rmse(x,y, df_x)
    print("MSE")
    loop_lasso_mse(x,y, df_x)

In [22]:
LASSO(protein_df)

Confirmed:
R2
Best score : 0.4518501035816101
Best Alpha : 0.010000000000000002
Weight coefficients:
Animal fats_p : 0.18946673932943206
Eggs_p : 1.3425266696369968
Meat_p : 1.3166610363559794
Milk - Excluding Butter_p : 3.602900093708701
Stimulants_p : 1.4372746722711465
Sugar & Sweeteners_p : 0.11049615850011689
Treenuts_p : 1.3714115177145116


RMSE
Best score : 1.7176617133076484
Best Alpha : 0.010000000000000002
Best Variance : 0.06315369052132178
Weight coefficients:
Animal fats_p : 0.18946673932943206
Eggs_p : 1.3425266696369968
Meat_p : 1.3166610363559794
Milk - Excluding Butter_p : 3.602900093708701
Stimulants_p : 1.4372746722711465
Sugar & Sweeteners_p : 0.11049615850011689
Treenuts_p : 1.3714115177145116


MSE
Best score : 3.013515451884288
Best Alpha : 0.010000000000000002
Best Variance : 0.7378984111263226
Weight coefficients:
Animal fats_p : 0.18946673932943206
Eggs_p : 1.3425266696369968
Meat_p : 1.3166610363559794
Milk - Excluding Butter_p : 3.602900093708701
Stimulants

In [23]:
LASSO(fat_df)

Confirmed:
R2
Best score : 0.4173231794778037
Best Alpha : 0.016
Weight coefficients:
Animal fats_f : 2.5590395042992435
Eggs_f : 0.7905416580102429
Milk - Excluding Butter_f : 1.0804630644256479
Stimulants_f : 3.270496211931997
Treenuts_f : 0.6317506377054136


RMSE
Best score : 1.7827436864828112
Best Alpha : 0.007
Best Variance : 0.05919946678607628
Weight coefficients:
Animal fats_f : 2.3848576769578305
Eggs_f : 0.931518225938917
Milk - Excluding Butter_f : 1.0106988290638603
Stimulants_f : 3.5000778951038205
Treenuts_f : 1.0318898677971675


MSE
Best score : 3.2373745184803995
Best Alpha : 0.007
Best Variance : 0.6809364562524485
Weight coefficients:
Animal fats_f : 2.3848576769578305
Eggs_f : 0.931518225938917
Milk - Excluding Butter_f : 1.0106988290638603
Stimulants_f : 3.5000778951038205
Treenuts_f : 1.0318898677971675


Deaths:
R2
Best score : 0.29509604165160563
Best Alpha : 0.046
Weight coefficients:
Animal fats_f : 6.859203023221479
Eggs_f : 2.149432824666097
Milk - Excludi

In [24]:
LASSO(kg_df)

Confirmed:
R2
Best score : 0.36981437686985263
Best Alpha : 0.011
Weight coefficients:
Animal fats_kg : 0.695364202089345
Eggs_kg : 0.8391490044209047
Milk - Excluding Butter_kg : 3.159571986608007
Stimulants_kg : 0.513783307482931
Sugar & Sweeteners_kg : 1.626177167163774
Treenuts_kg : 1.2822109259941117
Vegetables_kg : 0.2455663322248593


RMSE
Best score : 1.8370941105948926
Best Alpha : 0.010000000000000002
Best Variance : 0.035177453635238415
Weight coefficients:
Animal fats_kg : 0.6827690731905616
Eggs_kg : 0.8437262233037577
Milk - Excluding Butter_kg : 3.0892996848242293
Stimulants_kg : 0.5369614480020173
Sugar & Sweeteners_kg : 1.6559106609622452
Treenuts_kg : 1.3099565419546302
Vegetables_kg : 0.244838562514475


MSE
Best score : 3.410092224817679
Best Alpha : 0.010000000000000002
Best Variance : 0.4928164496440279
Weight coefficients:
Animal fats_kg : 0.6827690731905616
Eggs_kg : 0.8437262233037577
Milk - Excluding Butter_kg : 3.0892996848242293
Stimulants_kg : 0.53696144800

In [25]:
LASSO(kcal_df)

Confirmed:
R2
Best score : 0.4513689874669574
Best Alpha : 0.014000000000000002
Weight coefficients:
Animal fats_kcal : 1.3421664298467444
Eggs_kcal : 1.20935697846046
Fruits - Excluding Wine_kcal : 0.2551125138060378
Milk - Excluding Butter_kcal : 1.9912362724150643
Stimulants_kcal : 3.7453082760314316
Sugar & Sweeteners_kcal : 0.7074807460042866
Treenuts_kcal : 1.4100752747986813
Vegetable Oils_kcal : 0.5809534203000907


RMSE
Best score : 1.7136492037415976
Best Alpha : 0.014000000000000002
Best Variance : 0.04844387588494766
Weight coefficients:
Animal fats_kcal : 1.3421664298467444
Eggs_kcal : 1.20935697846046
Fruits - Excluding Wine_kcal : 0.2551125138060378
Milk - Excluding Butter_kcal : 1.9912362724150643
Stimulants_kcal : 3.7453082760314316
Sugar & Sweeteners_kcal : 0.7074807460042866
Treenuts_kcal : 1.4100752747986813
Vegetable Oils_kcal : 0.5809534203000907


MSE
Best score : 2.985037469369159
Best Alpha : 0.014000000000000002
Best Variance : 0.5749199906164735
Weight coeffi

In [26]:
#stronger coefficients = more of an influence on the line of best fit