In [17]:
#import stuff
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [18]:
# Read the four cleaned datasets (protein, kg, kcal, fat) into one DataFrame each.
protein_df, kg_df, kcal_df, fat_df = map(
    pd.read_csv,
    (
        "data/data_clean/protein_data_clean.csv",
        "data/data_clean/kg_data_clean.csv",
        "data/data_clean/kcal_data_clean.csv",
        "data/data_clean/fat_data_clean.csv",
    ),
)

In [19]:
# Obesity and Undernourished are not dietary-intake columns, so they are removed
# from every dataset before modelling.
# errors='ignore' keeps this cell idempotent: each frame is overwritten with its
# own result, so a second run of the cell would otherwise raise KeyError because
# the columns are already gone.
protein_df = protein_df.drop(columns=['Obesity','Undernourished'], errors='ignore')
kg_df = kg_df.drop(columns=['Obesity','Undernourished'], errors='ignore')
kcal_df = kcal_df.drop(columns=['Obesity','Undernourished'], errors='ignore')
fat_df = fat_df.drop(columns=['Obesity','Undernourished'], errors='ignore')

In [34]:
def produce_x_y(df,y_var):
    """Split a dataset into a standardized feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extract x and y from. It must contain the columns 'Country',
        'Population', 'Active', 'Recovered', 'Deaths' and 'Confirmed'; these
        are removed and every remaining column is used as a feature.
    y_var : str
        Name of the column of ``df`` to use as the target.

    Returns
    -------
    tuple
        ``(x, y)``: ``x`` is the output of ``StandardScaler().fit_transform``
        on the feature columns, ``y`` is the values of ``df[y_var]``.

    Raises
    ------
    KeyError
        If ``y_var`` is not a column of ``df``, or if any of the columns to
        drop is missing (raised by ``DataFrame.drop``).
    """
    # Fail loudly on an unknown target. The previous index search left the
    # index at -1 when nothing matched, which silently returned the LAST
    # column of df as the target.
    if y_var not in df.columns:
        raise KeyError(f"y_var {y_var!r} is not a column of df")

    df_x = df.drop(columns=['Country', 'Population','Active','Recovered','Deaths','Confirmed'])

    # scale the data
    # NOTE(review): the scaler is fit on all rows, i.e. before any
    # cross-validation split done by callers - confirm this is acceptable
    # if x is ever fed to a scale-sensitive model.
    x = StandardScaler().fit_transform(df_x)

    y = df[y_var].values
    return (x,y)

In [37]:
def rf_function(df,y_var):
    """Fit a shallow random forest on ``df`` and print importances and CV scores.

    A ``RandomForestRegressor(max_depth=2, random_state=0)`` is fit on the
    features/target returned by ``produce_x_y(df, y_var)``. The function then
    prints one ``name : importance`` line per feature, followed by the mean
    (and standard deviation) of R2, MSE, RMSE and MAE over 5-fold
    cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset passed through to ``produce_x_y``.
    y_var : str
        Name of the target column.

    Returns
    -------
    None
        Results are printed, nothing is returned.

    Raises
    ------
    ValueError
        If the number of derived feature names does not match the number of
        columns in the feature matrix.
    """
    X, y = produce_x_y(df,y_var)

    # The model is trained on df minus these columns, so importances must be
    # labelled with that same subset. Indexing df.columns directly only lines
    # up when every dropped column happens to sit after all feature columns.
    # Keep this list identical to the one used in produce_x_y.
    non_feature_cols = ['Country', 'Population','Active','Recovered','Deaths','Confirmed']
    feature_names = df.columns.drop(non_feature_cols)
    if len(feature_names) != X.shape[1]:
        raise ValueError(
            f"expected {X.shape[1]} feature names, derived {len(feature_names)}"
        )

    reg = RandomForestRegressor(max_depth=2, random_state=0)
    reg.fit(X,y)

    for name, importance in zip(feature_names, reg.feature_importances_):
        print(name,":",importance)

    # One multi-metric run fits each fold once instead of once per metric.
    # The splits and the seeded estimator are the same for every metric, so
    # the scores match four separate cross_val_score calls.
    cv_results = cross_validate(
        reg,
        X,
        y,
        scoring=(
            'r2',
            'neg_mean_squared_error',
            'neg_root_mean_squared_error',
            'neg_mean_absolute_error',
        ),
        cv=5,
    )
    r2 = cv_results['test_r2']
    mse = cv_results['test_neg_mean_squared_error']
    rmse = cv_results['test_neg_root_mean_squared_error']
    mae = cv_results['test_neg_mean_absolute_error']

    # The neg_* scorers report negated errors; flip the sign of the means so
    # the printed errors are positive (the standard deviation is unaffected).
    print("\nScores:")
    print(f"R2: {r2.mean()} ({r2.std()})")
    print(f"MSE: {-mse.mean()} ({mse.std()})")
    print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
    print(f"MAE: {-mae.mean()} ({mae.std()})")

In [42]:
# Evaluate the protein dataset against each outcome column, printing a
# header line before each report so the outputs can be told apart.
for target in ("Confirmed", "Deaths", "Recovered"):
    print(f"\n {target}:")
    rf_function(protein_df, target)


 Confirmed:
Alcoholic Beverages_p : 0.005535938596018728
Animal Products_p : 0.020417315023131915
Animal fats_p : 0.005799467073251401
Aquatic Products, Other_p : 0.0
Cereals - Excluding Beer_p : 0.005790519837338758
Eggs_p : 0.24979070750652357
Fish, Seafood_p : 0.005713471068298709
Fruits - Excluding Wine_p : 0.0
Meat_p : 0.005189508027077446
Milk - Excluding Butter_p : 0.5032166942087829
Offals_p : 0.0005661850951410406
Oilcrops_p : 0.0016319945513370185
Pulses_p : 0.00031370660826118537
Spices_p : 0.0001892183046283787
Starchy Roots_p : 0.004870129384356967
Stimulants_p : 0.010541634720376059
Sugar Crops_p : 0.0
Sugar & Sweeteners_p : 0.13596157291945163
Treenuts_p : 0.02526429750446003
Vegetal Products_p : 0.008528546719160498
Vegetable Oils_p : 0.0051377657353007845
Vegetables_p : 0.0034406235700587874
Miscellaneous_p : 0.0021007035470441237

Scores:
R2: 0.4605105792660689 (0.0981448118078642)
MSE: 2.9276709762168 (0.7411194962698011)
RMSE: 1.6979889070343603 (0.2109612471641620

In [39]:
# Run the random-forest report on the protein dataset once per outcome column.
for outcome_col in ["Confirmed", "Deaths", "Recovered"]:
    rf_function(protein_df, outcome_col)

Alcoholic Beverages_p : 0.005535938596018728
Animal Products_p : 0.020417315023131915
Animal fats_p : 0.005799467073251401
Aquatic Products, Other_p : 0.0
Cereals - Excluding Beer_p : 0.005790519837338758
Eggs_p : 0.24979070750652357
Fish, Seafood_p : 0.005713471068298709
Fruits - Excluding Wine_p : 0.0
Meat_p : 0.005189508027077446
Milk - Excluding Butter_p : 0.5032166942087829
Offals_p : 0.0005661850951410406
Oilcrops_p : 0.0016319945513370185
Pulses_p : 0.00031370660826118537
Spices_p : 0.0001892183046283787
Starchy Roots_p : 0.004870129384356967
Stimulants_p : 0.010541634720376059
Sugar Crops_p : 0.0
Sugar & Sweeteners_p : 0.13596157291945163
Treenuts_p : 0.02526429750446003
Vegetal Products_p : 0.008528546719160498
Vegetable Oils_p : 0.0051377657353007845
Vegetables_p : 0.0034406235700587874
Miscellaneous_p : 0.0021007035470441237

Scores:
R2: 0.4605105792660689 (0.0981448118078642)
MSE: 2.9276709762168 (0.7411194962698011)
RMSE: 1.6979889070343603 (0.21096124716416206)
MAE: 1.252