In [1]:
import numpy as np
import pandas as pd
import os
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [2]:
# read from level file
level = pd.read_csv('../../dataset/data_xyz/provinceData/level.csv')
gdp = pd.read_csv('../../dataset/gdp_2021.csv')
population_density = pd.read_csv('../../dataset/population_density.csv')
population7th = pd.read_csv('../../dataset/population7th.csv')
aging_rate_2021 = pd.read_csv('../../dataset/aging_rate_2021.csv')
vaccinate = pd.read_csv('../../dataset/vaccinate/vaccinataion_china.csv')

In [3]:
def findAllFile(base):
    for root, ds, fs in os.walk(base):
        for f in fs:
            if f.endswith('.json'):
#                 yield f
                fullname = os.path.join(root, f)
                yield fullname,f

In [4]:
# load data from every province
province_days = []
province_names = []
base = '../../dataset/data_xyz/provinceData'
total_data = pd.DataFrame()
for i,f in findAllFile(base):
    # transform date
    data = pd.read_json(i)
    data = pd.json_normalize(data['data'])
    data['dateId'] = pd.to_datetime(data['dateId'], format='%Y%m%d')
    # transform end
    data['Province'] = f[:-5]
    province_days.append(data.shape[0])
    province_names.append(f[:-5])
    total_data = pd.concat([total_data,data],axis=0)

In [5]:
total_data.isnull().any()

confirmedCount           False
confirmedIncr            False
curedCount               False
curedIncr                False
currentConfirmedCount    False
currentConfirmedIncr     False
dateId                   False
deadCount                False
deadIncr                 False
highDangerCount          False
midDangerCount           False
suspectedCount           False
suspectedCountIncr       False
Province                 False
dtype: bool

In [6]:
total_data = pd.merge(total_data, level, on='Province', how='left')

In [7]:
total_data = pd.merge(total_data, gdp, on='Province', how='left')

In [8]:
total_data = pd.merge(total_data, population_density, on='Province', how='left')

In [9]:
total_data = pd.merge(total_data, population7th, on='Province', how='left')

In [10]:
total_data = pd.merge(total_data, aging_rate_2021, on='Province', how='left')

In [11]:
vaccinate = vaccinate.iloc[:,1:]

In [12]:
vaccinate['time'] = pd.to_datetime(vaccinate['time'], format='%Y-%m-%d')

In [13]:
vaccinate

Unnamed: 0,time,vaccination number (w)
0,2021-03-23,8284.60
1,2021-03-24,8585.97
2,2021-03-25,9134.60
3,2021-03-26,9747.00
4,2021-03-27,10241.70
...,...,...
237,2022-05-11,335712.00
238,2022-05-12,335857.60
239,2022-05-13,336005.00
240,2022-05-14,336133.30


In [14]:
total_data

Unnamed: 0,confirmedCount,confirmedIncr,curedCount,curedIncr,currentConfirmedCount,currentConfirmedIncr,dateId,deadCount,deadIncr,highDangerCount,midDangerCount,suspectedCount,suspectedCountIncr,Province,level,GDP,population_km,population,rate
0,9,9,0,0,9,9,2020-01-23,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0
1,21,12,0,0,21,12,2020-01-24,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0
2,39,18,0,0,39,18,2020-01-25,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0
3,63,24,0,0,63,24,2020-01-26,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0
4,87,24,0,0,87,24,2020-01-27,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29003,1383,0,1354,6,28,-6,2022-05-11,1,0,0,0,0,0,江西,4,29619.7,247.0,45188635,12.3
29004,1383,0,1361,7,21,-7,2022-05-12,1,0,0,0,0,0,江西,4,29619.7,247.0,45188635,12.3
29005,1383,0,1364,3,18,-3,2022-05-13,1,0,0,0,0,0,江西,4,29619.7,247.0,45188635,12.3
29006,1383,0,1369,5,13,-5,2022-05-14,1,0,0,0,0,0,江西,4,29619.7,247.0,45188635,12.3


In [15]:
total_data = pd.merge(total_data, vaccinate, left_on='dateId',right_on='time', how='left').drop(columns=['time']).fillna(method='bfill', limit=10).fillna(0)

In [16]:
total_data

Unnamed: 0,confirmedCount,confirmedIncr,curedCount,curedIncr,currentConfirmedCount,currentConfirmedIncr,dateId,deadCount,deadIncr,highDangerCount,midDangerCount,suspectedCount,suspectedCountIncr,Province,level,GDP,population_km,population,rate,vaccination number (w)
0,9,9,0,0,9,9,2020-01-23,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0,0.0
1,21,12,0,0,21,12,2020-01-24,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0,0.0
2,39,18,0,0,39,18,2020-01-25,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0,0.0
3,63,24,0,0,63,24,2020-01-26,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0,0.0
4,87,24,0,0,87,24,2020-01-27,0,0,0,0,0,0,山东,4,83095.9,579.0,101527453,16.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29038,1383,0,1354,6,28,-6,2022-05-11,1,0,0,0,0,0,江西,4,29619.7,247.0,45188635,12.3,335712.0
29039,1383,0,1361,7,21,-7,2022-05-12,1,0,0,0,0,0,江西,4,29619.7,247.0,45188635,12.3,335857.6
29040,1383,0,1364,3,18,-3,2022-05-13,1,0,0,0,0,0,江西,4,29619.7,247.0,45188635,12.3,336005.0
29041,1383,0,1369,5,13,-5,2022-05-14,1,0,0,0,0,0,江西,4,29619.7,247.0,45188635,12.3,336133.3


In [17]:
# Data cleaning now end, start to train module

In [18]:
province_days

[828,
 831,
 829,
 829,
 828,
 828,
 829,
 829,
 829,
 829,
 831,
 830,
 828,
 829,
 831,
 829,
 826,
 830,
 828,
 828,
 828,
 827,
 832,
 829,
 829,
 828,
 822,
 829,
 828,
 830,
 829,
 828,
 830,
 830]

In [19]:
# province_days diff in province may means that some days missing!!! In there, just ignore...

In [20]:
groups = total_data.groupby(total_data.Province)
province_data = []
for i in province_names:
    province_data.append(groups.get_group(i))
# province data get end

In [21]:
step1_data = []
for df in province_data:
    df_1 = df.shift(1)
    df_1.rename(columns=lambda x:str(x)+"1", inplace=True)
    step1_data.append(pd.concat([df, df_1], axis=1)[1:])

In [22]:
data_x_y = pd.concat(step1_data, axis=0)

In [23]:
data_x_y

Unnamed: 0,confirmedCount,confirmedIncr,curedCount,curedIncr,currentConfirmedCount,currentConfirmedIncr,dateId,deadCount,deadIncr,highDangerCount,...,midDangerCount1,suspectedCount1,suspectedCountIncr1,Province1,level1,GDP1,population_km1,population1,rate1,vaccination number (w)1
1,21,12,0,0,21,12,2020-01-24,0,0,0,...,0.0,0.0,0.0,山东,4.0,83095.9,579.0,101527453.0,16.0,0.0
2,39,18,0,0,39,18,2020-01-25,0,0,0,...,0.0,0.0,0.0,山东,4.0,83095.9,579.0,101527453.0,16.0,0.0
3,63,24,0,0,63,24,2020-01-26,0,0,0,...,0.0,0.0,0.0,山东,4.0,83095.9,579.0,101527453.0,16.0,0.0
4,87,24,0,0,87,24,2020-01-27,0,0,0,...,0.0,0.0,0.0,山东,4.0,83095.9,579.0,101527453.0,16.0,0.0
5,121,34,0,0,121,34,2020-01-28,0,0,0,...,0.0,0.0,0.0,山东,4.0,83095.9,579.0,101527453.0,16.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29038,1383,0,1354,6,28,-6,2022-05-11,1,0,0,...,0.0,0.0,0.0,江西,4.0,29619.7,247.0,45188635.0,12.3,335553.5
29039,1383,0,1361,7,21,-7,2022-05-12,1,0,0,...,0.0,0.0,0.0,江西,4.0,29619.7,247.0,45188635.0,12.3,335712.0
29040,1383,0,1364,3,18,-3,2022-05-13,1,0,0,...,0.0,0.0,0.0,江西,4.0,29619.7,247.0,45188635.0,12.3,335857.6
29041,1383,0,1369,5,13,-5,2022-05-14,1,0,0,...,0.0,0.0,0.0,江西,4.0,29619.7,247.0,45188635.0,12.3,336005.0


In [24]:
Y = data_x_y['currentConfirmedCount']
X = data_x_y[['confirmedCount1','confirmedIncr1','curedCount1','curedIncr1','currentConfirmedCount1','currentConfirmedIncr1','deadCount1','deadIncr1','highDangerCount1','midDangerCount1','suspectedCount1','suspectedCountIncr1','level1','GDP1','population_km1','population1','rate1']]

In [25]:
X

Unnamed: 0,confirmedCount1,confirmedIncr1,curedCount1,curedIncr1,currentConfirmedCount1,currentConfirmedIncr1,deadCount1,deadIncr1,highDangerCount1,midDangerCount1,suspectedCount1,suspectedCountIncr1,level1,GDP1,population_km1,population1,rate1
1,9.0,9.0,0.0,0.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,83095.9,579.0,101527453.0,16.0
2,21.0,12.0,0.0,0.0,21.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,83095.9,579.0,101527453.0,16.0
3,39.0,18.0,0.0,0.0,39.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,83095.9,579.0,101527453.0,16.0
4,63.0,24.0,0.0,0.0,63.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,83095.9,579.0,101527453.0,16.0
5,87.0,24.0,0.0,0.0,87.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,83095.9,579.0,101527453.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29038,1383.0,0.0,1348.0,16.0,34.0,-16.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,29619.7,247.0,45188635.0,12.3
29039,1383.0,0.0,1354.0,6.0,28.0,-6.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,29619.7,247.0,45188635.0,12.3
29040,1383.0,0.0,1361.0,7.0,21.0,-7.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,29619.7,247.0,45188635.0,12.3
29041,1383.0,0.0,1364.0,3.0,18.0,-3.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,29619.7,247.0,45188635.0,12.3


In [26]:
train_X,test_X,train_y,test_y = train_test_split(X,Y,test_size=0.3, random_state=4)

In [27]:
le = LabelEncoder()
train_y = le.fit_transform(train_y)
test_y = le.fit_transform(test_y)

In [28]:
# model start
clf = XGBClassifier(
    n_estimators=220,
    verbosity=1,
    gpu_id=0
#     eval_metric=mean_squared_error
)

In [29]:
clf.fit(train_X,
        train_y,
        eval_set=[(test_X,test_y)],
        verbose=True
       )

[0]	validation_0-mlogloss:4.90722
[1]	validation_0-mlogloss:4.90014
[2]	validation_0-mlogloss:10.46619
[3]	validation_0-mlogloss:12.69274
[4]	validation_0-mlogloss:17.99202
[5]	validation_0-mlogloss:16.63756
[6]	validation_0-mlogloss:17.05880
[7]	validation_0-mlogloss:16.34307
[8]	validation_0-mlogloss:16.14214
[9]	validation_0-mlogloss:15.11872
[10]	validation_0-mlogloss:14.63095
[11]	validation_0-mlogloss:14.01207
[12]	validation_0-mlogloss:13.64046
[13]	validation_0-mlogloss:13.52030
[14]	validation_0-mlogloss:13.26355
[15]	validation_0-mlogloss:13.26758
[16]	validation_0-mlogloss:13.22190
[17]	validation_0-mlogloss:13.10877
[18]	validation_0-mlogloss:13.03524
[19]	validation_0-mlogloss:13.02507
[20]	validation_0-mlogloss:12.88773
[21]	validation_0-mlogloss:12.79523
[22]	validation_0-mlogloss:12.70342
[23]	validation_0-mlogloss:12.44880
[24]	validation_0-mlogloss:12.36599
[25]	validation_0-mlogloss:12.28164
[26]	validation_0-mlogloss:12.20370
[27]	validation_0-mlogloss:12.13657
[28]

KeyboardInterrupt: 

In [None]:
train_X