In [2]:
#import stuff
import numpy as np
import sklearn as sk
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [3]:
#import cleaned up datasets
protein_df = pd.read_csv("data/data_clean/protein_data_clean.csv")
kg_df = pd.read_csv("data/data_clean/kg_data_clean.csv")
kcal_df = pd.read_csv("data/data_clean/kcal_data_clean.csv")
fat_df = pd.read_csv("data/data_clean/fat_data_clean.csv")

In [4]:
#We have absolutely no need for obesity and undernourishment because they have nothing to do with diet
protein_df = protein_df.drop(['Obesity','Undernourished'],axis=1)
kg_df = kg_df.drop(['Obesity','Undernourished'],axis=1)
kcal_df = kcal_df.drop(['Obesity','Undernourished'],axis=1)
fat_df = fat_df.drop(['Obesity','Undernourished'],axis=1)

In [5]:
def produce_x_y(df,y_var):
    df_x = df.drop(['Country', 'Population','Active','Recovered','Deaths','Confirmed'],axis=1)
    #df is the df that we are extracting x and y from
    #y_var is the name of the y variable we're testing
    #scale the data
    x = StandardScaler().fit_transform(df_x)
    
    y_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == y_var):
            y_index = c
    y = df.iloc[:,y_index].values
    if(y_var == "Deaths"):
        y = df.iloc[:,y_index].values*100
    
    return (x,y)

In [6]:
def svr_gridsearch(df,y_var):
    X_y = produce_x_y(df,y_var)
    X = X_y[0]
    y = X_y[1]
    
    c = np.arange(1,10,1)
    ep = np.arange(0.1,1,0.1)
    parameters = {'C': c, 'epsilon' : ep}
    svr = SVR(kernel='linear')
    clf = GridSearchCV(svr, parameters)
    clf.fit(X,y)
    
    #how do i get the coefficients from svr using grid search??
    #print(clf.coef_)
    
    mse = cross_val_score(clf, X, y, scoring='neg_mean_squared_error', cv=5)
    rmse = cross_val_score(clf, X, y, scoring='neg_root_mean_squared_error', cv=5)
    mae = cross_val_score(clf, X, y, scoring='neg_mean_absolute_error', cv=5)
    r2 = cross_val_score(clf, X, y, scoring='r2', cv=5)

    print(f"R2: {r2.mean()} ({r2.std()})")
    print(f"MSE: {-mse.mean()} ({mse.std()})")
    print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
    print(f"MAE: {-mae.mean()} ({mae.std()})")
    print("\n")

In [7]:
def svr_baseline(df,y_var):
    X_y = produce_x_y(df,y_var)
    X = X_y[0]
    y = X_y[1]
    svr = SVR(kernel='linear')
    svr.fit(X,y)
    print(svr.coef_)
    mse = cross_val_score(svr, X, y, scoring='neg_mean_squared_error', cv=5)
    rmse = cross_val_score(svr, X, y, scoring='neg_root_mean_squared_error', cv=5)
    mae = cross_val_score(svr, X, y, scoring='neg_mean_absolute_error', cv=5)
    r2 = cross_val_score(svr, X, y, scoring='r2', cv=5)

    print(f"R2: {r2.mean()} ({r2.std()})")
    print(f"MSE: {-mse.mean()} ({mse.std()})")
    print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
    print(f"MAE: {-mae.mean()} ({mae.std()})")
    print("\n")

<h1>Protein Dataframe</h1>

In [9]:
svr_baseline(protein_df,"Confirmed")
svr_baseline(protein_df,"Deaths")
svr_baseline(protein_df,"Recovered")

[[ 0.46867222  0.18880858 -0.00915033 -0.25910642 -0.08607218  0.25372624
  -0.20873955  0.06370103  0.0820223   0.43254227 -0.48475917 -0.15020546
  -0.37398893 -0.06545794  0.05940312  0.13199915  0.02726027  0.12550918
   0.2679319  -0.17241836 -0.17826261 -0.00113729 -0.30191396]]
R2: 0.34856350869780395 (0.09608435543879974)
MSE: 3.54299638830388 (0.794373697476727)
RMSE: 1.8706971126969454 (0.20853944675094963)
MAE: 1.3708320620190122 (0.09538274879225696)


[[ 0.72519735  0.2480765   0.21510156 -0.47671109 -0.07304125  0.69312476
  -0.294961    0.08566335  0.26332027  0.3376224  -0.76750435 -0.25912194
  -0.37608632 -0.25929769 -0.06569995  0.18321528  0.0200589   0.47618317
   0.04641758 -0.23667281  0.3748038   0.01611176 -0.79870474]]
R2: 0.3466419479203734 (0.15113681969332698)
MSE: 15.376982872901905 (5.211512527059505)
RMSE: 3.865572087026687 (0.6590412072868057)
MAE: 2.7808090912455214 (0.4633742913260278)


[[ 0.25772902  0.0386829   0.2728917  -0.17346291  0.0172703   0

In [8]:
svr_gridsearch(protein_df,"Confirmed")
svr_gridsearch(protein_df,"Deaths")
svr_gridsearch(protein_df,"Recovered")

R2: 0.4126865206194221 (0.12103026315514252)
MSE: 3.1700166488846238 (0.7815390759788677)
RMSE: 1.7671842471915056 (0.21697116251431778)
MAE: 1.296751395249779 (0.1201945642469059)


R2: 0.36167786586503564 (0.136721882501284)
MSE: 14.952174673644075 (4.650857117373263)
RMSE: 3.8197474397593916 (0.6014184650438051)
MAE: 2.7872125253023987 (0.37572738459872773)


R2: 0.015076676243607867 (0.27911699780656335)
MSE: 3.4529964949742054 (0.7906866341333355)
RMSE: 1.8448952170806312 (0.2221669034019593)
MAE: 1.2644858779372725 (0.13222021108510035)




<h1>Kcal Dataframe</h1>

In [10]:
svr_baseline(kcal_df,"Confirmed")
svr_baseline(kcal_df,"Deaths")
svr_baseline(kcal_df,"Recovered")

[[ 0.44273527  0.0726963   0.17217829 -0.13780645 -0.15428298  0.25412909
  -0.09198685  0.11066151 -0.18342475  0.27193183 -0.43126837 -0.31612661
  -0.09086371 -0.19048246 -0.05701045 -0.02683941  0.58974456  0.02906533
   0.1479107   0.23140193 -0.07007021  0.01314394 -0.05549206]]
R2: 0.37619142022499863 (0.0747162189926754)
MSE: 3.399947358748368 (0.7289407427648311)
RMSE: 1.8336153208341917 (0.19442791453516345)
MAE: 1.3590090482672685 (0.07010937348153759)


[[ 0.66380034  0.10723881  0.64473363 -0.29380878 -0.24806414  0.48364933
  -0.48400015  0.14900724 -0.17847172  0.12576536 -0.88717196 -0.66572891
  -0.11539186 -0.3774606  -0.44408095 -0.01291756  0.48165407  0.07712843
   0.50063039  0.6834391  -0.12203682 -0.10398193  0.02691959]]
R2: 0.3066329052886584 (0.14812972208171662)
MSE: 15.98117492532682 (3.9586035133193733)
RMSE: 3.9656810648414678 (0.5045281134730379)
MAE: 2.8786277789585677 (0.3273979072269834)


[[ 0.14497929 -0.09458003  0.51135455 -0.07648147  0.01026267 

In [32]:
svr_gridsearch(kcal_df,"Confirmed")
svr_gridsearch(kcal_df,"Deaths")
svr_gridsearch(kcal_df,"Recovered")

{'C': 9, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.42690351683691113 (0.10373212849160368)
MSE: 3.109570857475673 (0.7359387257771738)
RMSE: 1.7511922359700178 (0.2071149684450696)
MAE: 1.3232428068683055 (0.06072751022596069)
{'C': 3, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.3781865907515559 (0.11209931375795147)
MSE: 14.613671386067733 (4.353817087256721)
RMSE: 3.780264233380503 (0.5685716418287585)
MAE: 2.912755168808105 (0.3596560498242433)
{'C': 1, 'epsilon': 1, 'kernel': 'poly'}
R2: 0.20207180063242833 (0.08359590025433526)
MSE: 2.885548003739352 (0.737223001098452)
RMSE: 1.6859622002897496 (0.20755592724251148)
MAE: 1.2819096444405875 (0.08690999543862395)


<h1>KG Dataframe</h1>

In [11]:
svr_baseline(kg_df,"Confirmed")
svr_baseline(kg_df,"Deaths")
svr_baseline(kg_df,"Recovered")

[[ 0.26485396  0.11781665  0.17636491 -0.12420927 -0.26773743  0.24631653
  -0.20236988  0.05259303  0.00054344  0.24771131 -0.23200654 -0.41001778
  -0.10810771 -0.27855623 -0.11029791 -0.02376906  0.13248838  0.21983288
  -0.00364365  0.10185338 -0.05444453  0.0155192  -0.1840306 ]]
R2: 0.20861112973571688 (0.28496549505119817)
MSE: 4.255949815087734 (1.4420446157112699)
RMSE: 2.035275310235033 (0.3370522609854813)
MAE: 1.4847302533298685 (0.16162040480977466)


[[ 0.52550044  0.53392182  0.27508367 -0.21293563 -0.26968567  0.62527981
  -0.54543866 -0.00658204 -0.04730948  0.42464443 -0.47884186 -0.68224436
  -0.09898605 -0.33338515 -0.3223396  -0.11433495  0.09720335  0.24436383
   0.04984709  0.31235533 -0.17740023 -0.06289937 -0.29106478]]
R2: 0.29478400687631046 (0.16933624648751247)
MSE: 16.06476210022587 (3.622053580290898)
RMSE: 3.981847617738849 (0.45787733000588193)
MAE: 2.920205002832495 (0.3825852931389343)


[[ 0.10100426  0.36325229 -0.02370797 -0.07074858 -0.13012671  0

In [34]:
svr_gridsearch(kg_df,"Confirmed")
svr_gridsearch(kg_df,"Deaths")
svr_gridsearch(kg_df,"Recovered")

{'C': 1, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.4087925522410317 (0.07417539105337152)
MSE: 3.2435653694955717 (0.7495614413894323)
RMSE: 1.7884686919560235 (0.21200261174967114)
MAE: 1.3586294458457047 (0.10112960359676307)
{'C': 3, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.45606642624296956 (0.0788680225240705)
MSE: 12.830644187795054 (3.615949324520849)
RMSE: 3.5448166473984566 (0.5147029474578734)
MAE: 2.602818692423951 (0.3619364528973304)
{'C': 3, 'epsilon': 1, 'kernel': 'poly'}
R2: 0.11328391687658353 (0.06282163238716541)
MSE: 3.2007288374641627 (0.7107380077967589)
RMSE: 1.778321351202257 (0.1957089914192532)
MAE: 1.3699305868010163 (0.07225745315805703)


<h1>Fat Dataframe</h1>

In [12]:
svr_baseline(fat_df,"Confirmed")
svr_baseline(fat_df,"Deaths")
svr_baseline(fat_df,"Recovered")

[[ 0.0860775   0.10142173  0.62571224 -0.195469   -0.18607403  0.22148279
  -0.09270779  0.00177663 -0.29379553 -0.26693094  0.01134573 -0.05931249
  -0.08341358  0.00443157 -0.03832982 -0.00362509  0.83429832 -0.01401349
  -0.04680461  0.13609142 -0.11438332 -0.06033708  0.01450896]]
R2: 0.303728397242183 (0.11580834774617685)
MSE: 3.8926853391448533 (1.1773556499241244)
RMSE: 1.9478990191478043 (0.31364749376947676)
MAE: 1.398699871532965 (0.1707595419295335)


[[ 0.06745848  0.36227768  1.31692653 -0.32463769 -0.22033012  0.23423815
  -0.28860624  0.18186557 -0.32286589 -0.77307433  0.06563897 -0.68707017
  -0.25415719 -0.30329661  0.00291041 -0.08800079  0.22900573 -0.16709075
   0.03701174  0.10038886 -0.37212151 -0.18203062  0.07303668]]
R2: 0.259030916340108 (0.09110793786287129)
MSE: 17.45028994953394 (4.597858763179024)
RMSE: 4.13805956901467 (0.5716230863252951)
MAE: 2.9776777480790906 (0.3499194946643819)


[[ 0.03353311  0.03633154  0.52690252 -0.12772238 -0.07378122  0.213

In [36]:
svr_gridsearch(fat_df,"Confirmed")
svr_gridsearch(fat_df,"Deaths")
svr_gridsearch(fat_df,"Recovered")

{'C': 3, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.4329666835352546 (0.10300610421084935)
MSE: 3.036016111686531 (0.5214851498623707)
RMSE: 1.7364233972532002 (0.14439493466941672)
MAE: 1.3345244255371784 (0.048970678667243)
{'C': 5, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.41539882610409434 (0.12979753191790513)
MSE: 13.939717692451051 (4.891217771008849)
RMSE: 3.6723946761752932 (0.6732273277656008)
MAE: 2.743911165552009 (0.4623424286428571)
{'C': 1, 'epsilon': 1, 'kernel': 'rbf'}
R2: 0.18922795603664697 (0.043045102394941345)
MSE: 2.974358451649023 (0.8533047000270964)
RMSE: 1.7077020787789858 (0.2410644349192832)
MAE: 1.3051848984228027 (0.12800114122235284)
