In [None]:
#import stuff
import warnings

import numpy as np
import sklearn as sk
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [None]:
# Load the four cleaned datasets (one CSV per nutrition measure), in the
# order: protein, kg, kcal, fat.
protein_df, kg_df, kcal_df, fat_df = map(
    pd.read_csv,
    (
        "data/data_clean/protein_data_clean.csv",
        "data/data_clean/kg_data_clean.csv",
        "data/data_clean/kcal_data_clean.csv",
        "data/data_clean/fat_data_clean.csv",
    ),
)

In [None]:
# 'Obesity' and 'Undernourished' are not used anywhere in this notebook, so
# remove them from every dataset before building features.
# errors='ignore' keeps this cell idempotent: it reassigns the same names it
# reads, so without it a second run of the cell would raise KeyError because
# the columns are already gone.
UNUSED_COLUMNS = ['Obesity','Undernourished']
protein_df = protein_df.drop(columns=UNUSED_COLUMNS, errors='ignore')
kg_df = kg_df.drop(columns=UNUSED_COLUMNS, errors='ignore')
kcal_df = kcal_df.drop(columns=UNUSED_COLUMNS, errors='ignore')
fat_df = fat_df.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [None]:
def produce_x_y(df,y_var):
    """Split a dataset into a standardized feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extract x and y from. It must contain the columns 'Country',
        'Population', 'Active', 'Recovered', 'Deaths' and 'Confirmed'; these
        are removed from the features. Every remaining column is used as a
        feature.
    y_var : str
        Name of the column in ``df`` to use as the target.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the StandardScaler-transformed array of the
        remaining columns and ``y`` is the array of values of ``y_var``.

    Raises
    ------
    KeyError
        If ``y_var`` is not a column of ``df``.

    Notes
    -----
    If ``y_var`` is not one of the dropped columns listed above, it also stays
    in ``x`` (the target would leak into the features).
    """
    # Resolve the target first. Previously a missing y_var left the index at
    # -1, which silently selected the LAST column of df as the target.
    matches = [i for i, name in enumerate(df.columns) if name == y_var]
    if not matches:
        raise KeyError(f"y_var {y_var!r} is not a column of df")
    # Keep the original "last matching column wins" behavior.
    y = df.iloc[:,matches[-1]].values

    df_x = df.drop(['Country', 'Population','Active','Recovered','Deaths','Confirmed'],axis=1)
    #scale the data
    x = StandardScaler().fit_transform(df_x)

    return (x,y)

In [147]:
def svr_function(df,y_var,param_grid=None):
    """Grid-search an SVR for one dataset/target and print cross-validated metrics.

    The function prints the best hyper-parameters found by a GridSearchCV fit
    on all the data, then the mean (and standard deviation) of R2, MSE, RMSE
    and MAE over a 5-fold outer cross-validation of the grid search itself
    (nested CV).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset passed on to ``produce_x_y``.
    y_var : str
        Name of the target column passed on to ``produce_x_y``.
    param_grid : dict or list of dict, optional
        Grid handed to GridSearchCV. Defaults to the grid this notebook has
        always used: kernel in ('linear', 'rbf', 'poly'), C in range(1,10,2),
        epsilon in range(1,10,1).

    Returns
    -------
    None
        Results are printed, not returned.
    """
    X, y = produce_x_y(df,y_var)

    if param_grid is None:
        param_grid = {'kernel':('linear', 'rbf', 'poly'), 'C': range(1,10,2), 'epsilon' : range(1,10,1)}

    # If even the smallest epsilon is at least half the spread of y, a constant
    # prediction sits inside the epsilon-tube for every sample, so the SVR has
    # zero loss without using any feature and the scores say nothing about X.
    grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)
    # A grid without 'epsilon' falls back to SVR's documented default of 0.1.
    eps_values = [eps for grid in grids for eps in grid.get('epsilon', (0.1,))]
    if eps_values and min(eps_values) >= (y.max() - y.min()) / 2:
        warnings.warn(
            f"Every epsilon in the grid (min {min(eps_values)}) covers the full "
            f"spread of '{y_var}' (min {y.min()}, max {y.max()}); the SVR will "
            "degenerate to a constant prediction. Pass a param_grid with "
            "smaller epsilon values.",
            stacklevel=2,
        )

    clf = GridSearchCV(SVR(), param_grid)
    clf.fit(X,y)
    print(clf.best_params_)

    # One nested-CV pass that evaluates all four metrics on the same fitted
    # models. Calling cross_val_score once per metric repeated the entire grid
    # search four times to obtain the same numbers.
    scores = cross_validate(
        clf, X, y, cv=5,
        scoring={
            'r2': 'r2',
            'mse': 'neg_mean_squared_error',
            'rmse': 'neg_root_mean_squared_error',
            'mae': 'neg_mean_absolute_error',
        },
    )
    r2 = scores['test_r2']
    mse = scores['test_mse']
    rmse = scores['test_rmse']
    mae = scores['test_mae']

    # The error scorers are negated by scikit-learn, hence the sign flip on the
    # means; the standard deviation is unaffected by the sign.
    print(f"R2: {r2.mean()} ({r2.std()})")
    print(f"MSE: {-mse.mean()} ({mse.std()})")
    print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
    print(f"MAE: {-mae.mean()} ({mae.std()})")

In [150]:
# Protein dataset: evaluate the SVR once per outcome column.
for target in ("Confirmed", "Deaths", "Recovered"):
    svr_function(protein_df,target)

{'C': 1, 'epsilon': 1, 'kernel': 'linear'}
R2: 0.377791972689192 (0.05087127462451293)
MSE: 3.395345465435601 (0.6584440880437257)
RMSE: 1.8340559951984496 (0.1777190814522348)
MAE: 1.3721300606154276 (0.11477317140869739)
{'C': 1, 'epsilon': 3, 'kernel': 'linear'}
R2: -1.2885597775869848 (0.5810482959146844)
MSE: 0.0050919736293602 (0.0005037779428054839)
RMSE: 0.07126949870464691 (0.003554178350692612)
MAE: 0.06547535881224328 (0.004033095727608879)
{'C': 1, 'epsilon': 1, 'kernel': 'linear'}
R2: 0.18015316325366945 (0.02956506485213783)
MSE: 2.9870044024016567 (0.7428428787806893)
RMSE: 1.7148335240838886 (0.2152914003848918)
MAE: 1.2899007568024703 (0.08194930821114942)


In [152]:
# Kcal dataset: evaluate the SVR once per outcome column.
for target in ("Confirmed", "Deaths", "Recovered"):
    svr_function(kcal_df,target)

{'C': 11, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.4132218776986208 (0.08348351369017303)
MSE: 3.2010885560744406 (0.7080505526134732)
RMSE: 1.7779758677124782 (0.19972573671537577)
MAE: 1.3623639072240232 (0.08196637585829922)
{'C': 1, 'epsilon': 3, 'kernel': 'linear'}
R2: -1.2885597775869848 (0.5810482959146844)
MSE: 0.0050919736293602 (0.0005037779428054839)
RMSE: 0.07126949870464691 (0.003554178350692612)
MAE: 0.06547535881224328 (0.004033095727608879)
{'C': 1, 'epsilon': 1, 'kernel': 'poly'}
R2: 0.19463943830350722 (0.09058651902561238)
MSE: 2.9060863066437044 (0.7244582762544153)
RMSE: 1.692534568349706 (0.20350194491694512)
MAE: 1.2973900630350133 (0.09029820939587778)


In [153]:
# Kg dataset: evaluate the SVR once per outcome column.
for target in ("Confirmed", "Deaths", "Recovered"):
    svr_function(kg_df,target)

{'C': 1, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.4087925522410317 (0.07417539105337152)
MSE: 3.2435653694955717 (0.7495614413894323)
RMSE: 1.7884686919560235 (0.21200261174967114)
MAE: 1.3586294458457047 (0.10112960359676307)
{'C': 1, 'epsilon': 3, 'kernel': 'linear'}
R2: -1.2885597775869848 (0.5810482959146844)
MSE: 0.0050919736293602 (0.0005037779428054839)
RMSE: 0.07126949870464691 (0.003554178350692612)
MAE: 0.06547535881224328 (0.004033095727608879)
{'C': 6, 'epsilon': 1, 'kernel': 'poly'}
R2: 0.11328391687658353 (0.06282163238716541)
MSE: 3.2007288374641627 (0.7107380077967589)
RMSE: 1.778321351202257 (0.1957089914192532)
MAE: 1.3699305868010163 (0.07225745315805703)


In [154]:
# Fat dataset: evaluate the SVR once per outcome column.
for target in ("Confirmed", "Deaths", "Recovered"):
    svr_function(fat_df,target)

{'C': 6, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.45025116194882636 (0.11446594392794239)
MSE: 2.9243828836557126 (0.49397293244486423)
RMSE: 1.7042929629583043 (0.14060007136028443)
MAE: 1.3027202229242278 (0.06471439268961382)
{'C': 1, 'epsilon': 3, 'kernel': 'linear'}
R2: -1.2885597775869848 (0.5810482959146844)
MSE: 0.0050919736293602 (0.0005037779428054839)
RMSE: 0.07126949870464691 (0.003554178350692612)
MAE: 0.06547535881224328 (0.004033095727608879)
{'C': 1, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.17973623511851072 (0.044262620583771145)
MSE: 3.000587506513812 (0.8309280943632213)
RMSE: 1.7164092385676741 (0.23350981194234355)
MAE: 1.3039768908094502 (0.1285025701339657)
