In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import pi
import csv
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR
import copy

In [11]:
# Load the aligned feature table: the first CSV row becomes `header`, every
# following row (a list of strings) is collected in `features`.
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly on every platform.
with open('../feature_process/aligned_features.csv', 'r', newline='') as f:
    header, *features = csv.reader(f)
    

In [12]:
def split_train_test(lower,upper,data): # fips code upper and lower bound for a state. ex. wisconsin 55000,56000
    """Split rows into a held-out test set and a training set by FIPS code.

    The first entry of every row is read as a FIPS code (anything ``int()``
    accepts); the remaining entries are converted with ``float()``. Rows with
    ``lower <= fips < upper`` go to the test set, all other rows to the
    training set.

    Parameters
    ----------
    lower, upper : int
        Half-open FIPS interval ``[lower, upper)`` selecting the test rows.
    data : sequence of sequences
        Rows such as those produced by ``csv.reader``. The input is not
        modified.

    Returns
    -------
    training, testing : list of list of float
        The rows without their FIPS entry, every value converted to float.
    fips_train, fips_test : list
        The (unconverted) FIPS entries, in the same order as the rows.

    Raises
    ------
    ValueError
        If a FIPS entry is not an integer literal or a value is not numeric.
    IndexError
        If a row is empty.
    """
    training = []
    testing = []
    fips_train = []
    fips_test = []
    for row in data:
        fips = row[0]
        is_test = lower <= int(fips) < upper
        # Convert each row over its own length: this builds a fresh list (so
        # the caller's data stays untouched without a deep copy) and no longer
        # leaves trailing strings behind or fails when rows differ in length.
        values = [float(value) for value in row[1:]]
        if is_test:
            testing.append(values)
            fips_test.append(fips)
        else:
            training.append(values)
            fips_train.append(fips)
    return training, testing, fips_train, fips_test

In [13]:
# Hold out the rows whose FIPS code lies in [55000, 56000) as the test set;
# all remaining rows of `features` become the training set.
train_x,test_x,fips_train,fips_test = split_train_test(55000,56000,features)

# Continuous Problem (increase rate)

## 5-Fold Cross-Validation Error (k = 5)

In [91]:
# k-fold cross-validation on the training rows.
# For each label file (deaths/cases x N = 1, 5, 10) and each fold, one
# RandomForestRegressor and one SVR are fitted per label column. The mean
# absolute error over all validation predictions of a fold is collected, and
# the average over the k folds is printed for both models.
labels = None
k = 5
random_seed = 0  # fixed so the random-forest scores are reproducible between runs
file_name = '../label_process/labels2/{}_increase_{}'
data_type = ['deaths','cases']
interval_num = ['1','5','10']
for d_type in data_type:
    for i_num in interval_num:
        # newline='' is the csv module's recommended way to open CSV files.
        with open(file_name.format(d_type,i_num),'r',newline='') as f:
            labels = list(csv.reader(f))

        # NOTE(review): label rows are paired with the feature rows in train_x
        # purely by position -- this assumes both files list the counties in
        # the same order; TODO confirm.
        train_y,test_y,fips_train,fips_test = split_train_test(55000,56000,labels)
        #  cross validation split train test k = 5
        interval_length = int(len(train_x)/k)
        total_mse = []  # per-fold mean absolute error, random forest
        svm_mse = []    # per-fold mean absolute error, SVR
        # The fold slices below build new lists and nothing mutates the rows,
        # so the data can be used directly without a deep copy.
        training_set = train_x
        training_labels = train_y
        for kv_idx in range(k):
            va_start = interval_length * kv_idx
            # The last fold runs to the end of the data, so the
            # len(train_x) % k leftover rows are validated as well instead of
            # only ever being trained on.
            va_stop = len(training_set) if kv_idx == k - 1 else va_start + interval_length
            va_set = training_set[va_start:va_stop]
            va_labels = training_labels[va_start:va_stop]
            tr_set = training_set[:va_start] + training_set[va_stop:]
            tr_labels = training_labels[:va_start] + training_labels[va_stop:]

            va_label_df = pd.DataFrame(va_labels,dtype = float)
            tr_label_df = pd.DataFrame(tr_labels,dtype = float)

            va_prediction = []
            svr_va_prediction = []
            va_answer = []
            # One pair of models per label column; columns 0 .. N-1 are used.
            for time_zone in range(int(i_num)):
                tr_y = list(tr_label_df.loc[:,time_zone])
                va_y = list(va_label_df.loc[:,time_zone])
                regr = RandomForestRegressor(random_state=random_seed)
                svr_regr = SVR()
                regr.fit(tr_set, tr_y)
                svr_regr.fit(tr_set, tr_y)
                va_prediction.extend(regr.predict(va_set))
                svr_va_prediction.extend(svr_regr.predict(va_set))
                va_answer.extend(va_y)

            total_mse.append(mean_absolute_error(va_answer,va_prediction))
            svm_mse.append(mean_absolute_error(va_answer,svr_va_prediction))
        # The metric is mean_absolute_error, i.e. MAE (an absolute error, not
        # a percentage error), so it is labelled as such.
        print('MAE for N = {}, {} increase rates, random forest'.format(i_num,d_type))
        print(sum(total_mse)/len(total_mse))
        print('MAE for N = {}, {} increase rates, SVR'.format(i_num,d_type))
        print(sum(svm_mse)/len(svm_mse))


MAPE for N = 1, deaths increase rates, random forest
0.00039258050013288624
MAPE for N = 1, deaths increase rates, SVR
0.002806304550705679
MAPE for N = 5, deaths increase rates, random forest
0.00012156162799185823
MAPE for N = 5, deaths increase rates, SVR
0.0012325553659813322
MAPE for N = 10, deaths increase rates, random forest
7.115228389125938e-05
MAPE for N = 10, deaths increase rates, SVR
0.000728304645987609
MAPE for N = 1, cases increase rates, random forest
0.012664492404519685
MAPE for N = 1, cases increase rates, SVR
0.06209782592115588
MAPE for N = 5, cases increase rates, random forest
0.0036503566204986945
MAPE for N = 5, cases increase rates, SVR
0.0454246680474594
MAPE for N = 10, cases increase rates, random forest
0.0020570714046789225
MAPE for N = 10, cases increase rates, SVR
0.029007918521587205


## Testing Error

In [18]:
# Held-out evaluation.
# For each label file (deaths/cases x N = 1, 5, 10), one RandomForestRegressor
# and one SVR per label column are fitted on all training rows (train_x) and
# scored on the test rows (test_x) with the mean absolute error.
labels = None
random_seed = 0  # fixed so the random-forest scores are reproducible between runs
file_name = '../label_process/labels2/{}_increase_{}'
data_type = ['deaths','cases']
interval_num = ['1','5','10']
for d_type in data_type:
    for i_num in interval_num:
        # newline='' is the csv module's recommended way to open CSV files.
        with open(file_name.format(d_type,i_num),'r',newline='') as f:
            labels = list(csv.reader(f))

        # NOTE(review): label rows are paired with the feature rows in
        # train_x / test_x purely by position -- this assumes both files list
        # the counties in the same order; TODO confirm.
        train_y,test_y,fips_train,fips_test = split_train_test(55000,56000,labels)

        tr_label_df = pd.DataFrame(train_y)
        va_label_df = pd.DataFrame(test_y)

        va_prediction = []
        svr_va_prediction = []
        va_answer = []
        # One pair of models per label column; columns 0 .. N-1 are used.
        for time_zone in range(int(i_num)):
            tr_y = list(tr_label_df.loc[:,time_zone])
            va_y = list(va_label_df.loc[:,time_zone])
            regr = RandomForestRegressor(random_state=random_seed)
            svr_regr = SVR()
            regr.fit(train_x, tr_y)
            svr_regr.fit(train_x, tr_y)
            va_prediction.extend(regr.predict(test_x))
            svr_va_prediction.extend(svr_regr.predict(test_x))
            va_answer.extend(va_y)

        # A single score per model (there are no folds here), stored in
        # one-element lists.
        total_mse = [mean_absolute_error(va_answer,va_prediction)]
        svm_mse = [mean_absolute_error(va_answer,svr_va_prediction)]
        # The metric is mean_absolute_error, i.e. MAE (an absolute error, not
        # a percentage error), so it is labelled as such.
        print('MAE for N = {}, {} increase rates, random forest'.format(i_num,d_type))
        print(total_mse[0])
        print('MAE for N = {}, {} increase rates, SVR'.format(i_num,d_type))
        print(svm_mse[0])


MAPE for N = 1, deaths increase rates, random forest
0.0002782352006245808
MAPE for N = 1, deaths increase rates, SVR
0.0030582337692961637
MAPE for N = 5, deaths increase rates, random forest
8.310205512667283e-05
MAPE for N = 5, deaths increase rates, SVR
0.0012672066744095131
MAPE for N = 10, deaths increase rates, random forest
4.6659028876153586e-05
MAPE for N = 10, deaths increase rates, SVR
0.0007445946470889617
MAPE for N = 1, cases increase rates, random forest
0.014728564849270542
MAPE for N = 1, cases increase rates, SVR
0.04480892217406902
MAPE for N = 5, cases increase rates, random forest
0.0038593331759570575
MAPE for N = 5, cases increase rates, SVR
0.04501633457449545
MAPE for N = 10, cases increase rates, random forest
0.0021357080800325145
MAPE for N = 10, cases increase rates, SVR
0.029492560625535333


# Classification Problem