In [2]:
#import stuff
import numpy as np
import sklearn as sk
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [3]:
#import cleaned up datasets
protein_df = pd.read_csv("data/data_clean/protein_data_clean.csv")
kg_df = pd.read_csv("data/data_clean/kg_data_clean.csv")
kcal_df = pd.read_csv("data/data_clean/kcal_data_clean.csv")
fat_df = pd.read_csv("data/data_clean/fat_data_clean.csv")

In [4]:
#We have absolutely no need for obesity and undernourishment because they have nothing to do with diet
protein_df = protein_df.drop(['Obesity','Undernourished'],axis=1)
kg_df = kg_df.drop(['Obesity','Undernourished'],axis=1)
kcal_df = kcal_df.drop(['Obesity','Undernourished'],axis=1)
fat_df = fat_df.drop(['Obesity','Undernourished'],axis=1)

In [5]:
def produce_x_y(df,y_var):
    df_x = df.drop(['Country', 'Population','Active','Recovered','Deaths','Confirmed'],axis=1)
    #df is the df that we are extracting x and y from
    #y_var is the name of the y variable we're testing
    #scale the data
    x = StandardScaler().fit_transform(df_x)
    
    y_index = -1
    for c in range(0,len(df.columns)):
        if(df.columns[c] == y_var):
            y_index = c
    y = df.iloc[:,y_index].values
    return (x,y)

In [None]:
def rf_gridsearch(df,y_var):
    X_y = produce_x_y(df,y_var)
    X = X_y[0]
    y = X_y[1]
    
    reg = RandomForestRegressor()
    parameters = {'n_estimators': range(100,200,100)}
    grid = GridSearchCV(reg, parameters)
    grid.fit(X,y)
    
    #for c in range(0,len(grid.feature_importances_)):
    #   print(df.columns[c],":",grid.feature_importances_[c])

    mse = cross_val_score(grid, X, y, scoring='neg_mean_squared_error', cv=5)
    rmse = cross_val_score(grid, X, y, scoring='neg_root_mean_squared_error', cv=5)
    mae = cross_val_score(grid, X, y, scoring='neg_mean_absolute_error', cv=5)
    r2 = cross_val_score(grid, X, y, scoring='r2', cv=5)
    
    print("\nScores:")
    print(f"R2: {r2.mean()} ({r2.std()})")
    print(f"MSE: {-mse.mean()} ({mse.std()})")
    print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
    print(f"MAE: {-mae.mean()} ({mae.std()})")

In [17]:
def rf_baseline(df,y_var):
    X_y = produce_x_y(df,y_var)
    X = X_y[0]
    y = X_y[1]
    
    reg = RandomForestRegressor()
    reg.fit(X,y)
    
    #for c in range(0,len(reg.feature_importances_)):
    #   print(df.columns[c],":",reg.feature_importances_[c])

    mse = cross_val_score(reg, X, y, scoring='neg_mean_squared_error', cv=5)
    rmse = cross_val_score(reg, X, y, scoring='neg_root_mean_squared_error', cv=5)
    mae = cross_val_score(reg, X, y, scoring='neg_mean_absolute_error', cv=5)
    r2 = cross_val_score(reg, X, y, scoring='r2', cv=5)
    
    print("\nScores:")
    print(f"R2: {r2.mean()} ({r2.std()})")
    print(f"MSE: {-mse.mean()} ({mse.std()})")
    print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
    print(f"MAE: {-mae.mean()} ({mae.std()})")

<h1>Protein Dataset</h1>

In [18]:
print("Confirmed:")
rf_baseline(protein_df,"Confirmed")
print("\n Deaths:")
rf_baseline(protein_df,"Deaths")
print("\n Recovered:")
rf_baseline(protein_df,"Recovered")

Confirmed:

Scores:
R2: 0.45453984205279435 (0.1439042155531377)
MSE: 2.9474076501897684 (0.9128862170770653)
RMSE: 1.6639069723848638 (0.23955171425985117)
MAE: 1.2079160364284436 (0.1679695360932834)

 Deaths:

Scores:
R2: 0.39769114161409125 (0.10216009396096991)
MSE: 0.0013393660668754865 (0.000260649960626544)
RMSE: 0.03717004042491217 (0.0030811193668085213)
MAE: 0.02772584492962482 (0.00207578634370773)

 Recovered:

Scores:
R2: 0.15919616524865718 (0.06570008040338653)
MSE: 2.9127214224503217 (0.616645269545371)
RMSE: 1.6820636035019891 (0.21105038830568532)
MAE: 1.2050803717233853 (0.10731046382898307)


In [None]:
print("Confirmed:")
rf_gridsearch(protein_df,"Confirmed")
print("\n Deaths:")
rf_gridsearch(protein_df,"Deaths")
print("\n Recovered:")
rf_gridsearch(protein_df,"Recovered")

<h1>Kcal Dataset</h1>

In [20]:
print("Confirmed:")
rf_baseline(kcal_df,"Confirmed")
print("\n Deaths:")
rf_baseline(kcal_df,"Deaths")
print("\n Recovered:")
rf_baseline(kcal_df,"Recovered")

Confirmed:

Scores:
R2: 0.4166177990855391 (0.12614431528579106)
MSE: 3.2267644658825354 (0.7167041283410959)
RMSE: 1.7843156550836212 (0.19540124855581556)
MAE: 1.2745317857080927 (0.0919509523345363)

 Deaths:

Scores:
R2: 0.40726516872125484 (0.10885445057142162)
MSE: 0.0013583328462212743 (0.00036510358731662245)
RMSE: 0.03637530238301206 (0.004256859187321885)
MAE: 0.02657806769650379 (0.0029664195810204493)

 Recovered:

Scores:
R2: 0.2878654110418403 (0.0801848173308378)
MSE: 2.5409718792682066 (0.7678941739372293)
RMSE: 1.6144972783353257 (0.2224398774552968)
MAE: 1.1180972349230947 (0.12527845544853486)


In [21]:
print("Confirmed:")
rf_gridsearch(kcal_df,"Confirmed")
print("\n Deaths:")
rf_gridsearch(kcal_df,"Deaths")
print("\n Recovered:")
rf_gridsearch(kcal_df,"Recovered")

Confirmed:

Scores:
R2: 0.42115732920628357 (0.11350975804035017)
MSE: 3.1901832383352895 (0.6962752965864575)
RMSE: 1.7583980720110177 (0.20449800275400684)
MAE: 1.2514716886006962 (0.0897964411353843)

 Deaths:

Scores:
R2: 0.4162101240272831 (0.1011542631736401)
MSE: 0.0013527668331245095 (0.0003395518215476218)
RMSE: 0.036390480072471906 (0.004280547054622913)
MAE: 0.027010751088056024 (0.0024810003295706357)

 Recovered:


KeyboardInterrupt: 

<h1>KG Dataset</h1>

In [22]:
print("Confirmed:")
rf_baseline(kg_df,"Confirmed")
print("\n Deaths:")
rf_baseline(kg_df,"Deaths")
print("\n Recovered:")
rf_baseline(kg_df,"Recovered")

Confirmed:

Scores:
R2: 0.3881557353766394 (0.15914726203253693)
MSE: 3.209126900350978 (0.7534457744138326)
RMSE: 1.7850912009753501 (0.242380308873397)
MAE: 1.2843541583327538 (0.13097617098907666)

 Deaths:

Scores:
R2: 0.38435409249943564 (0.15369551612981125)
MSE: 0.0014202083655007066 (0.00043365215680313284)
RMSE: 0.03736520319738209 (0.005408644802927715)
MAE: 0.027790254610211017 (0.0028006462938735105)

 Recovered:

Scores:
R2: 0.12434312469320026 (0.07866779909579472)
MSE: 3.0241766232822336 (0.7798181559387295)
RMSE: 1.7396902634489284 (0.2132038029148947)
MAE: 1.245435785793948 (0.11167598326258)


<h1>Fat Dataset</h1>

In [23]:
print("Confirmed:")
rf_baseline(fat_df,"Confirmed")
print("\n Deaths:")
rf_baseline(fat_df,"Deaths")
print("\n Recovered:")
rf_baseline(fat_df,"Recovered")

Confirmed:

Scores:
R2: 0.4555088006506683 (0.11940210874865563)
MSE: 3.0478400237614274 (0.9017574908712642)
RMSE: 1.73904856113144 (0.26671338642954046)
MAE: 1.2428385424284554 (0.18152583559703037)

 Deaths:

Scores:
R2: 0.3984958661477857 (0.12071927099373632)
MSE: 0.0014054244200613231 (0.0002675136605945745)
RMSE: 0.03647617581323812 (0.003483836844595583)
MAE: 0.027742358341821323 (0.0017378914348911761)

 Recovered:

Scores:
R2: 0.29637992525274925 (0.08255822342518035)
MSE: 2.6239456281256803 (0.6850889593095645)
RMSE: 1.5934356671513126 (0.21817963284693417)
MAE: 1.1731166015032877 (0.11556269326007898)
