In [9]:
import pandas as pd
df = pd.read_excel('../../data/interim/iva_kumulativ.xlsx')
df = df[df['Region'] != 'Hela riket']
befolkning = pd.read_excel('../../data/raw/befolkning.xlsx', skiprows=9)
befolkning = befolkning[['Hela riket', 10327589, 41.313715]]
befolkning.dropna(inplace=True)
befolkning.columns = ['Region', 'Befolkning', 'Medelålder']
befolkning = befolkning[befolkning['Region'] != 'Gotlands län']
befolkning.reset_index(inplace=True)
befolkning.drop('index', axis=1, inplace=True)

### Change from län to region

In [10]:
lan_to_region = {'Stockholms län': 'Region Stockholm', 'Södermanlands län': 'Region Sörmland', 
                 'Östergötlands län': 'Region Östergötland', 'Jönköpings län': 'Region Jönköpings län',
                 'Kronobergs län': 'Region Kronoberg', 'Kalmar län': 'Region Kalmar län',
                 'Blekinge län': 'Region Blekinge', 'Skåne län': 'Region Skåne',
                 'Hallands län': 'Region Halland', 'Västra Götalands län': 'Västra Götalandsregionen',
                 'Värmlands län': 'Region Värmland', 'Örebro län': 'Region Örebro län', 
                 'Västmanlands län': 'Region Västmanland', 'Dalarnas län': 'Region Dalarna',
                 'Gävleborgs län': 'Region Gävleborg', 'Västernorrlands län': 'Region Västernorrland',
                 'Jämtlands län': 'Region Jämtland Härjedalen', 'Västerbottens län': 'Region Västerbotten',
                 'Norrbottens län': 'Region Norrbotten', 'Uppsala län': 'Region Uppsala'
                }

befolkning['Region'] = [lan_to_region[val] for val in befolkning['Region']]


In [11]:
df = df.merge(befolkning, on='Region', how='inner')

In [64]:
# Skapa data per region
def create_dataframe_per_region(df, region):
    tmp_df = df[df['Region'] == region]
    #tmp_df.drop(['2020-04-01', '2020-03-31', '2020-03-30', '2020-03-29'], axis=1, inplace=True)
    dates = tmp_df.drop(['Region', 'Befolkning', 'Medelålder'], axis=1).columns
    values = tmp_df.drop(['Region', 'Befolkning', 'Medelålder'], axis=1).iloc[0].values
    befolkning = tmp_df['Befolkning'].values[0]
    mean_age = tmp_df['Medelålder'].values[0]
    tmp_df = pd.DataFrame({'date': dates, 'iva': values})
    tmp_df['befolkning'] = befolkning 
    tmp_df['mean_age'] =  mean_age
    tmp_df['Region'] = region
    tmp_df = add_change_coming_x_days(tmp_df, 3)
    tmp_df = add_change_since_x_days_ago(tmp_df, 1)
    tmp_df = add_change_since_x_days_ago(tmp_df, 3)
    tmp_df = add_change_since_x_days_ago(tmp_df, 7)
    tmp_df = add_change_in_nbrs_since_x_days_ago(tmp_df, 1)
    tmp_df = add_change_in_nbrs_since_x_days_ago(tmp_df, 3)
    tmp_df = add_change_in_nbrs_since_x_days_ago(tmp_df, 7)
    return tmp_df

In [63]:
def add_change_since_x_days_ago(df, x):
    change = []
    for i in range(len(df)):
        if i < x:
            change.append(None)
        else:
            change.append(df['iva'].iloc[i]/df['iva'].iloc[i-x])
    df['change_since_'+str(x)+'_days'] = change
    return df


def add_change_coming_x_days(df, x):
    change = []
    for i in range(len(df)):
        if len(df) - i - 1 < x:
            change.append(None)
        else:
            change.append(df['iva'].iloc[i+x]/df['iva'].iloc[i])
    df['change_coming_'+str(x)+'_days'] = change
    return df


def add_change_in_nbrs_since_x_days_ago(df, x):
    change = []
    for i in range(len(df)):
        if i < x:
            change.append(None)
        else:
            change.append(df['iva'].iloc[i] - df['iva'].iloc[i-x])
    df['change_since_in_nbrs_'+str(x)+'_days'] = change
    return df


In [65]:
# Create dataframe with all regions

regions = list(df['Region'].unique())

for i, region in enumerate(regions):
    if i == 0:
        data = create_dataframe_per_region(df, region)
        
    else:
        tmp_data = create_dataframe_per_region(df, region)
        data = pd.concat([data, tmp_data])
        
data['iva_per_1000'] = data['iva']/data['befolkning']*1000
data
    

  import sys
  import sys


Unnamed: 0,date,iva,befolkning,mean_age,Region,change_coming_3_days,change_since_1_days,change_since_3_days,change_since_7_days,change_since_in_nbrs_1_days,change_since_in_nbrs_3_days,change_since_in_nbrs_7_days,iva_per_1000
0,2020-03-06,,159606.0,43.358574,Region Blekinge,,,,,,,,
1,2020-03-07,,159606.0,43.358574,Region Blekinge,,,,,,,,
2,2020-03-08,0.0,159606.0,43.358574,Region Blekinge,,,,,,,,0.000000
3,2020-03-09,0.0,159606.0,43.358574,Region Blekinge,,,,,0.0,,,0.000000
4,2020-03-10,0.0,159606.0,43.358574,Region Blekinge,,,,,0.0,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,2020-03-28,49.0,1725881.0,41.079673,Västra Götalandsregionen,1.428571,1.113636,1.484848,2.722222,5.0,16.0,31.0,0.028391
23,2020-03-29,60.0,1725881.0,41.079673,Västra Götalandsregionen,1.166667,1.224490,1.500000,2.857143,11.0,20.0,39.0,0.034765
24,2020-03-30,65.0,1725881.0,41.079673,Västra Götalandsregionen,,1.083333,1.477273,2.500000,5.0,21.0,39.0,0.037662
25,2020-03-31,70.0,1725881.0,41.079673,Västra Götalandsregionen,,1.076923,1.428571,2.333333,5.0,21.0,40.0,0.040559


In [66]:
from sklearn.linear_model import LinearRegression
import numpy as np
from xgboost import XGBClassifier

data = data.replace(np.inf, np.nan)
data = data.dropna()
train = data[data['date'] < '2020-03-24']
test = data[data['date'] >= '2020-03-24']
X = train.drop(['date', 'change_coming_3_days', 'Region'], axis=1)
y = train['change_coming_3_days']

model = XGBClassifier()
model.fit(X, y)
X_test = test.drop(['date', 'change_coming_3_days', 'Region'], axis=1)
y_test = test['change_coming_3_days']
pred = model.predict(X_test)


In [67]:
test['predicted_change'] = pred
test['predicted_nbr_in_3_days'] = test['iva']*test['predicted_change']
test['iva_in_3_days'] = test['iva']*test['change_coming_3_days']
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,date,iva,befolkning,mean_age,Region,change_coming_3_days,change_since_1_days,change_since_3_days,change_since_7_days,change_since_in_nbrs_1_days,change_since_in_nbrs_3_days,change_since_in_nbrs_7_days,iva_per_1000,predicted_change,predicted_nbr_in_3_days,iva_in_3_days
19,2020-03-25,8.0,287966.0,43.698243,Region Dalarna,1.250000,1.600000,4.000000,8.000000,3.0,6.0,7.0,0.027781,1.000000,8.000000,10.0
20,2020-03-26,9.0,287966.0,43.698243,Region Dalarna,1.111111,1.125000,3.000000,9.000000,1.0,6.0,8.0,0.031254,1.500000,13.500000,10.0
21,2020-03-27,9.0,287966.0,43.698243,Region Dalarna,1.222222,1.000000,1.800000,9.000000,0.0,4.0,8.0,0.031254,1.222222,11.000000,11.0
22,2020-03-28,10.0,287966.0,43.698243,Region Dalarna,1.100000,1.111111,1.250000,5.000000,1.0,2.0,8.0,0.034726,1.500000,15.000000,11.0
23,2020-03-29,10.0,287966.0,43.698243,Region Dalarna,1.100000,1.000000,1.111111,5.000000,0.0,1.0,8.0,0.034726,1.500000,15.000000,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19,2020-03-25,33.0,1725881.0,41.079673,Västra Götalandsregionen,1.484848,1.100000,1.571429,5.500000,3.0,12.0,27.0,0.019121,2.021277,66.702128,49.0
20,2020-03-26,40.0,1725881.0,41.079673,Västra Götalandsregionen,1.500000,1.212121,1.538462,3.636364,7.0,14.0,29.0,0.023177,1.612500,64.500000,60.0
21,2020-03-27,44.0,1725881.0,41.079673,Västra Götalandsregionen,1.477273,1.100000,1.466667,3.142857,4.0,14.0,30.0,0.025494,2.021277,88.936170,65.0
22,2020-03-28,49.0,1725881.0,41.079673,Västra Götalandsregionen,1.428571,1.113636,1.484848,2.722222,5.0,16.0,31.0,0.028391,2.021277,99.042553,70.0


In [68]:
test['error'] = abs(test['predicted_change']-test['change_coming_3_days'])/test['change_coming_3_days']
test.sort_values('error', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,date,iva,befolkning,mean_age,Region,change_coming_3_days,change_since_1_days,change_since_3_days,change_since_7_days,change_since_in_nbrs_1_days,change_since_in_nbrs_3_days,change_since_in_nbrs_7_days,iva_per_1000,predicted_change,predicted_nbr_in_3_days,iva_in_3_days,error
22,2020-03-28,4.0,282414.0,43.681182,Region Värmland,1.000000,1.000,1.000000,2.000000,0.0,0.0,2.0,0.014164,2.000000,8.0,4.0,1.000000
21,2020-03-27,4.0,282414.0,43.681182,Region Värmland,1.000000,1.000,1.000000,2.000000,0.0,0.0,2.0,0.014164,2.000000,8.0,4.0,1.000000
20,2020-03-26,4.0,282414.0,43.681182,Region Värmland,1.000000,1.000,1.000000,2.000000,0.0,0.0,2.0,0.014164,2.000000,8.0,4.0,1.000000
23,2020-03-29,4.0,282414.0,43.681182,Region Värmland,1.000000,1.000,1.000000,1.333333,0.0,0.0,1.0,0.014164,2.000000,8.0,4.0,1.000000
23,2020-03-29,28.0,465495.0,41.404762,Region Östergötland,1.035714,1.120,1.333333,3.500000,3.0,7.0,20.0,0.060151,2.017857,56.5,29.0,0.948276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,2020-03-29,11.0,304805.0,41.671205,Region Örebro län,1.090909,1.375,1.375000,3.666667,3.0,3.0,8.0,0.036089,1.090909,12.0,12.0,0.000000
22,2020-03-28,9.0,363599.0,41.432354,Region Jönköpings län,1.222222,1.000,1.500000,9.000000,0.0,3.0,8.0,0.024753,1.222222,11.0,11.0,0.000000
21,2020-03-27,9.0,363599.0,41.432354,Region Jönköpings län,1.222222,1.125,1.800000,9.000000,1.0,4.0,8.0,0.024753,1.222222,11.0,11.0,0.000000
21,2020-03-27,5.0,271736.0,41.752542,Region Västerbotten,1.000000,1.250,1.666667,5.000000,1.0,2.0,4.0,0.018400,1.000000,5.0,5.0,0.000000


In [83]:
test[test['date'] == '2020-03-27']['error'].median()

0.20463320463320453

In [72]:
test[test['Region'] == 'Region Östergötland']

Unnamed: 0,date,iva,befolkning,mean_age,Region,change_coming_3_days,change_since_1_days,change_since_3_days,change_since_7_days,change_since_in_nbrs_1_days,change_since_in_nbrs_3_days,change_since_in_nbrs_7_days,iva_per_1000,predicted_change,predicted_nbr_in_3_days,iva_in_3_days,error
18,2020-03-24,14.0,465495.0,41.404762,Region Östergötland,1.642857,1.272727,2.8,14.0,3.0,9.0,13.0,0.030076,1.909091,26.727273,23.0,0.162055
19,2020-03-25,19.0,465495.0,41.404762,Region Östergötland,1.315789,1.357143,2.375,9.5,5.0,11.0,17.0,0.040817,1.860465,35.348837,25.0,0.413953
20,2020-03-26,21.0,465495.0,41.404762,Region Östergötland,1.333333,1.105263,1.909091,7.0,2.0,10.0,18.0,0.045113,1.75,36.75,28.0,0.3125
21,2020-03-27,23.0,465495.0,41.404762,Region Östergötland,1.217391,1.095238,1.642857,5.75,2.0,9.0,19.0,0.04941,1.75,40.25,28.0,0.4375
22,2020-03-28,25.0,465495.0,41.404762,Region Östergötland,1.16,1.086957,1.315789,5.0,2.0,6.0,20.0,0.053706,1.714286,42.857143,29.0,0.477833
23,2020-03-29,28.0,465495.0,41.404762,Region Östergötland,1.035714,1.12,1.333333,3.5,3.0,7.0,20.0,0.060151,2.017857,56.5,29.0,0.948276
