In [20]:
#import stuff
import numpy as np
import sklearn as sk
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [21]:
#import cleaned up datasets
protein_df = pd.read_csv("data/data_clean/protein_data_clean.csv")
kg_df = pd.read_csv("data/data_clean/kg_data_clean.csv")
kcal_df = pd.read_csv("data/data_clean/kcal_data_clean.csv")
fat_df = pd.read_csv("data/data_clean/fat_data_clean.csv")

In [22]:
#To aid regression, drop features that probably highly correlate with other features
#We have absolutely no need for obesity and undernourishment
protein_df = protein_df.drop(['Obesity','Undernourished'],axis=1)
kg_df = kg_df.drop(['Obesity','Undernourished'],axis=1)
kcal_df = kcal_df.drop(['Obesity','Undernourished'],axis=1)
fat_df = fat_df.drop(['Obesity','Undernourished'],axis=1)
#protein_df = protein_df.drop(['Animal fats_p','Aquatic Products, Other_p', 'Sugar Crops_p', 'Vegetal Products_p', 'Vegetable Oils_p', 'Miscellaneous_p'],axis=1)
#kg_df = kg_df.drop(['Obesity','Animal fats_kg','Aquatic Products, Other_kg', 'Sugar Crops_kg', 'Vegetal Products_kg', 'Vegetable Oils_kg', 'Miscellaneous_kg'],axis=1)
#kcal_df = kcal_df.drop(['Obesity','Animal fats_kcal','Aquatic Products, Other_kcal', 'Sugar Crops_kcal', 'Vegetal Products_kcal', 'Vegetable Oils_kcal', 'Miscellaneous_kcal'],axis=1)
#fat_df = fat_df.drop(['Obesity','Animal fats_f','Aquatic Products, Other_f', 'Sugar Crops_f', 'Vegetal Products_f', 'Vegetable Oils_f', 'Miscellaneous_f'],axis=1)

<h3>
I want to figure out what the lambda best value to penalize our model is for LASSO, for each dataset set, for each scoring metric.
R2 will tell me if the model fits to the training data well, MSE and RMSE will tell me if it's overfitting. Also important to get the variance in the cross val scores of RMSE and MAE for that reason.
I will also print out the weight vector, so we can determine what features LASSO determines are actually important.
</h3>

In [23]:
#gonna try different scoring metrics
def loop_lasso_r2(X_train, y_train, x):
    best_a = 0.001
    best_score = -np.inf
    test_a_values = np.arange(0.001,0.1,0.001)
    for a in range(0,len(test_a_values)):
        Lreg = linear_model.Lasso(alpha = test_a_values[a], tol=0.1)
        Lreg.fit(X_train, y_train)
        score = cross_val_score(Lreg, X_train, y_train, cv=5, scoring='r2')
        if(np.average(score)>best_score):
            best_score=np.average(score)
            best_a = test_a_values[a]
    print("Best score :",best_score)
    print("Best Alpha :",best_a)
    #Get the best model's coefs
    Lreg = linear_model.Lasso(alpha = best_a)
    Lreg.fit(X_train, y_train)
    print("Weight coefficients:")
    for c in range(1,len(Lreg.coef_)):
        if(Lreg.coef_[c]>0.0):
            print(x.columns[c],":",Lreg.coef_[c])
    print('\n')

In [24]:
def loop_lasso_mse(X_train, y_train, x):
    best_a = 0.001
    best_score = np.inf
    best_var = -1
    test_a_values = np.arange(0.001,0.1,0.001)
    for a in range(0,len(test_a_values)):
        Lreg = linear_model.Lasso(alpha = test_a_values[a], tol=0.1)
        Lreg.fit(X_train, y_train)
        score = cross_val_score(Lreg, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        if(np.average(score)*-1<best_score):
            best_score=np.average(score)*-1
            best_a = test_a_values[a]
            best_var = np.var(score)
    print("Best score :",best_score)
    print("Best Alpha :",best_a)
    print("Best Variance :",best_var)
    #Get the best model's coefs
    Lreg = linear_model.Lasso(alpha = best_a)
    Lreg.fit(X_train, y_train)
    print("Weight coefficients:")
    for c in range(1,len(Lreg.coef_)):
        if(Lreg.coef_[c]>0.0):
            print(x.columns[c],":",Lreg.coef_[c])
    print('\n')

In [25]:
def loop_lasso_rmse(X_train, y_train, x):
    best_a = 0.001
    best_score = np.inf
    best_var = -1
    test_a_values = np.arange(0.001,0.1,0.001)
    for a in range(0,len(test_a_values)):
        Lreg = linear_model.Lasso(alpha = test_a_values[a], tol=0.1)
        Lreg.fit(X_train, y_train)
        score = cross_val_score(Lreg, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
        if(np.average(score)*-1<best_score):
            best_score=np.average(score)*-1
            best_a = test_a_values[a]
            best_var = np.var(score)
    print("Best score :",best_score)
    print("Best Alpha :",best_a)
    print("Best Variance :",best_var)
    #Get the best model's coefs
    Lreg = linear_model.Lasso(alpha = best_a)
    Lreg.fit(X_train, y_train)
    print("Weight coefficients:")
    for c in range(1,len(Lreg.coef_)):
        if(Lreg.coef_[c]>0.0):
            print(x.columns[c],":",Lreg.coef_[c])
    print('\n')

In [30]:
def LASSO(df):
    df_x = df.drop(['Country', 'Population','Active','Recovered','Deaths','Confirmed'],axis=1)
    #to help regression, drop some features that are probably highly correlated
    #scale the data
    x = MinMaxScaler().fit_transform(df_x)

    #confirmed
    print("Confirmed:")
    confirmed_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == "Confirmed"):
            confirmed_index = c
    y = df.iloc[:,confirmed_index].values
    print("R2")
    loop_lasso_r2(x,y, df_x)
    print("RMSE")
    loop_lasso_rmse(x,y, df_x)
    print("MSE")
    loop_lasso_mse(x,y, df_x)
    
    #deaths
    print("Deaths:")
    deaths_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == "Deaths"):
            deaths_index = c
    y = df.iloc[:,deaths_index].values*100
    print("R2")
    loop_lasso_r2(x,y, df_x)
    print("RMSE")
    loop_lasso_rmse(x,y, df_x)
    print("MSE")
    loop_lasso_mse(x,y, df_x)
    
    #recovered
    print("Recovered:")
    recovered_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == "Recovered"):
            recovered_index = c
    y = df.iloc[:,recovered_index].values
    print("R2")
    loop_lasso_r2(x,y, df_x)
    print("RMSE")
    loop_lasso_rmse(x,y, df_x)
    print("MSE")
    loop_lasso_mse(x,y, df_x)

In [31]:
LASSO(protein_df)

0


KeyError: "['Population' 'Active' 'Recovered' 'Deaths' 'Confirmed'] not found in axis"

In [None]:
LASSO(fat_df)

In [None]:
LASSO(kg_df)

In [None]:
LASSO(kcal_df)

In [None]:
#stronger coefficients = more of an influence on the line of best fit