## Import Libraries

In [1]:
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline
import math
from sklearn.model_selection import train_test_split,GridSearchCV, LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor as RFR
import pickle
import time
# %precision 3

## Mobility&beta データ読み込み

In [2]:
# Load the environment covariates and the beta series from CSV.
env = pd.read_csv('env_data.csv')
beta = pd.read_csv('beta_fm_exIandR.csv')

# Parse the date columns so the two tables can be compared by calendar day.
env['date'] = pd.to_datetime(env['date'])
beta['date'] = pd.to_datetime(beta['date'])

# Whole-day offsets between the first rows and between the last rows of the
# two tables (positive: env starts / ends later than beta).
diff_startday = int((env.iloc[0]['date'] - beta.iloc[0]['date']) / pd.Timedelta(days = 1))
diff_endday = int((env.iloc[-1]['date'] - beta.iloc[-1]['date']) / pd.Timedelta(days = 1))

# Trim whichever table sticks out at either end so both cover the same period.
if diff_startday > 0:
    beta = beta.drop(beta.index[0:diff_startday])
if diff_startday < 0:
    env = env.drop(env.index[0:-diff_startday])
if diff_endday > 0:
    env = env.drop(env.index[-diff_endday:])
if diff_endday < 0:
    beta = beta.drop(beta.index[diff_endday:])

# The trimming above removes a *row count* derived from a *day count*, which
# only lines the tables up when each holds exactly one row per consecutive day.
# Everything downstream pairs env and beta rows by position, so verify the
# dates really match instead of silently training on misaligned days.
if not np.array_equal(env['date'].to_numpy(), beta['date'].to_numpy()):
    raise ValueError(
        'env and beta do not cover identical dates after trimming '
        '(len(env)=%d, len(beta)=%d); check both CSV files for missing '
        'or duplicated days' % (len(env), len(beta))
    )

# Length of the aligned data, in days.
number_of_days = len(beta)
# Initial training period: number of days from 2020/2/21 through 2020/10/15.
number_of_trainingdays = 238


In [3]:
# Display the env table as it stands after the trimming done in the previous cell.
env

Unnamed: 0,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,Temp,Humidity,pcr_test
0,2020-02-21,-1.571429,1.285714,-1.857143,-2.857143,0.285714,0.714286,9.528571,64.714286,74.142857
1,2020-02-22,-2.142857,0.857143,-4.285714,-3.428571,-0.428571,1.000000,9.542857,66.285714,75.571429
2,2020-02-23,-0.714286,2.142857,3.000000,-1.714286,-0.142857,0.714286,9.700000,58.285714,71.857143
3,2020-02-24,2.142857,2.571429,11.428571,-5.428571,-8.000000,2.571429,9.628571,53.142857,70.571429
4,2020-02-25,2.000000,3.285714,11.285714,-5.428571,-7.857143,2.571429,10.042857,55.428571,75.571429
...,...,...,...,...,...,...,...,...,...,...
637,2021-11-19,-4.285714,6.000000,6.571429,-13.000000,-8.142857,3.571429,13.457143,63.571429,45551.142860
638,2021-11-20,-4.000000,6.285714,7.142857,-12.428571,-7.571429,3.428571,13.471429,65.571429,45093.428570
639,2021-11-21,-3.857143,6.142857,6.714286,-12.000000,-7.571429,3.428571,13.257143,68.571429,44544.285710
640,2021-11-22,-3.571429,5.285714,2.571429,-12.285714,-8.428571,3.714286,13.142857,74.428571,43823.714290


In [4]:
# Display the beta table as it stands after the trimming done in the loading cell.
beta

Unnamed: 0,date,beta
30,2020-02-21,0.431753
31,2020-02-22,0.357252
32,2020-02-23,0.283115
33,2020-02-24,0.258568
34,2020-02-25,0.282388
...,...,...
667,2021-11-19,0.049694
668,2021-11-20,0.052520
669,2021-11-21,0.053231
670,2021-11-22,0.049301


## 関数

In [5]:
def RF_calc (x,y):
    """Grid-search a random-forest regressor on (x, y) and return the fitted search.

    Parameters
    ----------
    x : 2-D table of features exposing ``shape`` (e.g. a DataFrame); its
        column count bounds the ``max_features`` values that are tried.
    y : target values, one per row of ``x``.

    Returns
    -------
    GridSearchCV
        The fitted search object. Callers read ``best_estimator_`` /
        ``best_score_`` from it or call ``predict`` on it directly.
    """
    search_params = {
        'n_estimators'      : [5, 10, 20, 30, 50, 100, 300],
        # 3 .. (number of columns - 1); the full column count itself is not tried.
        'max_features'      : list(range(3, x.shape[1])),
        'random_state'      : [1],
        # Each forest is built single-threaded; parallelism happens at the
        # search level via GridSearchCV's own n_jobs below.
        'n_jobs'            : [1],
        'min_samples_split' : [5],
        'max_depth'         : [5, 10, 15, 20]
    }

    gsr = GridSearchCV(
        RFR(),
        search_params,
        cv = 10,
        # joblib convention: values below -1 mean (n_cpus + 1 + n_jobs)
        # workers, i.e. -7 leaves six CPUs free.
        n_jobs = -7,
        verbose=True
    )

    gsr.fit(x,y)
    return gsr


In [6]:
# Number of rows by which the beta target is shifted relative to the env features.
day_delay = 6

# Every env column except the first labels one entry of the importance table.
clm_env = env.columns.values[1:]
feature_importance = pd.DataFrame(columns = clm_env, index = ['feature_importance'])

# Pair env row i with beta row i + day_delay. Both slices stop one row before
# the end of the data, so x and y have the same length.
x = env.iloc[: number_of_days - day_delay - 1, 1:]
y = beta['beta'].iloc[day_delay : number_of_days - 1]

# Splitting with a negligible test fraction and gluing the parts back together
# keeps every sample; the net effect is a row reordering that is reproducible
# because random_state is fixed.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size = 0.0000001,
    random_state = 100,
)
X_new = pd.concat([X_train, X_test])
Y_new = pd.concat([Y_train, Y_test])

# Hyper-parameter search over the reordered samples.
gsr = RF_calc(X_new, Y_new)

# Write the winning forest's importances into the single-row table and export it.
best_forest = gsr.best_estimator_
fi_series = pd.Series(best_forest.feature_importances_, index = feature_importance.columns)
feature_importance.iloc[0] = fi_series
feature_importance.to_csv('feature-importance.csv')

Fitting 10 folds for each of 168 candidates, totalling 1680 fits


In [8]:
# Daily index for the predictions: it starts day_delay days after the first
# beta date and ends day_delay days after the last one (np.arange excludes the
# stop value, hence the extra day on the upper bound).
pred_first_day = beta.iloc[0]['date'] + np.timedelta64(day_delay, 'D')
pred_stop_day = beta.iloc[-1]['date'] + np.timedelta64(1 + day_delay, 'D')
ind_beta = np.arange(
    pred_first_day,
    pred_stop_day,
    np.timedelta64(1, 'D'),
    dtype='datetime64',
)
clm_beta = ['beta']

Pred_Beta = pd.DataFrame(index = ind_beta, columns = clm_beta)

# Predict from every env row (all columns but the first) and store the values
# positionally against the shifted dates.
x_t = env.iloc[:, 1:]
y_t = gsr.predict(x_t)
Pred_Beta[clm_beta[0]] = y_t

Pred_Beta.to_csv('beta_pred.csv')