In [1]:
import pandas as pd
import numpy as np

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots
init_notebook_mode(connected=True)

# отключим предупреждения Anaconda
import warnings
warnings.simplefilter('ignore')

df_rec=pd.read_csv('https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_recovered_global.csv&filename=time_series_covid19_recovered_global.csv')
df_deaths=pd.read_csv('https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_deaths_global.csv&filename=time_series_covid19_deaths_global.csv')
df=pd.read_csv('https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv')

In [2]:
from sklearn.linear_model import LinearRegression #импорт из библиотеки функции с линейной регрессией
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

from tqdm import tqdm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
from scipy.optimize import minimize

## Функция формирует датасет из топ стран

In [43]:
def top_countries(df,start,top):
    df_countries=df.sort_values(by=df.columns[-1],ascending=False).reset_index(drop=True)
    start_cases=start
    top_cases=top
    df_countries_top=df_countries[:top_cases].T
    df_countries_top.columns=df_countries[:top_cases]['Country/Region'].values
    df_countries_top=df_countries_top[4:][2:].reset_index(drop=True)
    for column in range(len(df_countries_top.columns)):
        n=0
        for i in range(len(df_countries_top)):
            if df_countries_top.iloc[i][column]<start_cases: n+=1
        df_countries_top[df_countries_top.columns[column]]=df_countries_top[df_countries_top.columns[column]].shift(-n)
    df_countries_top=df_countries_top.dropna()
    return df_countries_top

 |              aliceblue, antiquewhite, aqua, aquamarine, azure,
 |              beige, bisque, black, blanchedalmond, blue,
 |              blueviolet, brown, burlywood, cadetblue,
 |              chartreuse, chocolate, coral, cornflowerblue,
 |              cornsilk, crimson, cyan, darkblue, darkcyan,
 |              darkgoldenrod, darkgray, darkgrey, darkgreen,
 |              darkkhaki, darkmagenta, darkolivegreen, darkorange,
 |              darkorchid, darkred, darksalmon, darkseagreen,
 |              darkslateblue, darkslategray, darkslategrey,
 |              darkturquoise, darkviolet, deeppink, deepskyblue,
 |              dimgray, dimgrey, dodgerblue, firebrick,
 |              floralwhite, forestgreen, fuchsia, gainsboro,
 |              ghostwhite, gold, goldenrod, gray, grey, green,
 |              greenyellow, honeydew, hotpink, indianred, indigo,
 |              ivory, khaki, lavender, lavenderblush, lawngreen,
 |              lemonchiffon, lightblue, lightcoral, lightcyan,
 |              lightgoldenrodyellow, lightgray, lightgrey,
 |              lightgreen, lightpink, lightsalmon, lightseagreen,
 |              lightskyblue, lightslategray, lightslategrey,
 |              lightsteelblue, lightyellow, lime, limegreen,
 |              linen, magenta, maroon, mediumaquamarine,
 |              mediumblue, mediumorchid, mediumpurple,
 |              mediumseagreen, mediumslateblue, mediumspringgreen,
 |              mediumturquoise, mediumvioletred, midnightblue,
 |              mintcream, mistyrose, moccasin, navajowhite, navy,
 |              oldlace, olive, olivedrab, orange, orangered,
 |              orchid, palegoldenrod, palegreen, paleturquoise,
 |              palevioletred, papayawhip, peachpuff, peru, pink,
 |              plum, powderblue, purple, red, rosybrown,
 |              royalblue, rebeccapurple, saddlebrown, salmon,
 |              sandybrown, seagreen, seashell, sienna, silver,
 |              skyblue, slateblue, slategray, slategrey, snow,
 |              springgreen, steelblue, tan, teal, thistle, tomato,
 |              turquoise, violet, wheat, white, whitesmoke,
 |              yellow, yellowgreen

## Функция рисует график по датасету

In [3]:
def plotly_df(df, title = '',selected=0):
    data = []
    for column in df.columns:
        trace = go.Scatter(
            x = df.index,
            y = df[column],
            mode = 'lines',
            name = column,
            line_width=1
            #line_color='LightGray' if column!=selected else 'darkblue'
        )
        data.append(trace)  
    fig = go.Figure(data=data)
    fig.update_layout(
        plot_bgcolor='rgb(255,250,245)',
        title=title,
        showlegend = True,
    
    )
    iplot(fig, show_link=False)
    plotly.offline.plot(fig, filename=str(title), show_link=True)

## Функция готовит датасеты для обучения

In [4]:
 def prepareData(data, test_size,lag_start, lag_end):
 
    data = pd.DataFrame(data.copy())
    data.columns = ["y"]

    # считаем индекс в датафрейме, после которого начинается тестовыый отрезок
    test_index = int(len(data)*(1-test_size))

    # добавляем лаги исходного ряда в качестве признаков
    for i in range(lag_start, lag_end):
        data["lag_{}".format(i)] = data.y.shift(i)

    data = data.dropna()
    data = data.reset_index(drop=True)

    # разбиваем весь датасет на тренировочную и тестовую выборку
    X_train = data.loc[:test_index].drop(["y"], axis=1)
    y_train = data.loc[:test_index]["y"]
    X_test = data.loc[test_index:].drop(["y"], axis=1)
    y_test = data.loc[test_index:]["y"]

    return X_train, X_test, y_train, y_test


## Функция предсказывает будущие значения на основе датасета

In [5]:
def prediction_data(dataset,history_lag, future_days,prediction_f):
    data=pd.DataFrame(dataset.copy())
    data.columns = ["y"]
    k=0
    for i in range(len(new_tr)-1):
        if data['y'][i]==data['y'][i+1]:
            k+=1
    dataset_predict=data[k-history_lag:].reset_index(drop=True)
    l=len(data)
    for i in range(1, future_days+1):
        X=dataset_predict[-history_lag+1:][::-1].T
        y=prediction_f(X)
        dataset_predict.loc[l+i]=y[0]
    return dataset_predict

## Подготовка исходных данных

In [6]:
df=df.replace(np.nan,'')
df_rec=df_rec.replace(np.nan,'')
df_deaths=df_deaths.replace(np.nan,'')

In [7]:
place_df=df.iloc[:,:4].replace(np.nan,'') #местоположение
days_df=df.iloc[:,4:].replace(np.nan,'') #абсолютные заболевших значения по дням
delta_df=days_df-days_df.shift(periods=1, axis=1) #изменение количество заболевших по дням

In [8]:
increase_df=pd.concat((place_df,delta_df),axis=1)
total_df=pd.concat((place_df,days_df),axis=1)

# Модель для выбранной страны

## Формируем датасет по выбранной стране

In [9]:
Country='Russia' #задаём страну
Province='' #задаём провинцию / либо пусто
if Country=='World':
    total_Country=pd.DataFrame(df.sum(numeric_only=True))
    total_Country_deaths=pd.DataFrame(df_deaths.sum(numeric_only=True))
    total_Country_recovery=pd.DataFrame(df_rec.sum(numeric_only=True))
    increase_Country=pd.DataFrame(increase_df.sum(numeric_only=True))
    
    new_tr=total_Country[2:]
    new_ir=increase_Country[2:]
    new_dr=total_Country_deaths[2:]
    new_rr=total_Country_recovery[2:]
else:
    total_Country=total_df[total_df['Country/Region']+total_df['Province/State']==(Country+Province)]
    total_Country_deaths=df_deaths[df_deaths['Country/Region']+df_deaths['Province/State']==(Country+Province)]
    total_Country_recovery=df_rec[df_rec['Country/Region']+df_rec['Province/State']==(Country+Province)]
    increase_Country=increase_df[increase_df['Country/Region']+increase_df['Province/State']==(Country+Province)]
    
    new_tr=total_Country.iloc[:,4:].T
    new_ir=increase_Country.iloc[:,4:].T
    new_dr=total_Country_deaths.iloc[:,4:].T
    new_rr=total_Country_recovery.iloc[:,4:].T

new_tr.columns=['Total']
new_ir.columns=['Increase']
new_dr.columns=['Deaths']
new_rr.columns=['Recovery']

In [10]:
df_Country=pd.concat((new_tr,new_ir,new_dr,new_rr),axis=1)

## Рисуем графики для выбранной страны

In [53]:
plotly_df(df_Country,title = 'Coronavirus statistics in '+Country+'.'+Province,selected='Deaths')

## Предсказание по стране

In [12]:
history_lag=15 #временной лаг в прошлое
future_days=7 #задаём глубину прогноза
#линейная регрессия
lr = LinearRegression()

In [13]:
k=0
for i in range(len(new_tr)-1):
    if new_tr.Total[i]==new_tr.Total[i+1]:
        k+=1
dataset_total=new_tr[k-history_lag:]

X_train, X_test, y_train, y_test = prepareData(dataset_total, test_size=0.20, lag_start=1, lag_end=history_lag)
lr.fit(X_train, y_train)
prediction_lr_t = pd.DataFrame(lr.predict(X_test))
print('линейная регрессия: ', mean_absolute_error(prediction_lr_t,y_test))

predict_total=prediction_data(new_tr,history_lag, future_days,lr.predict) # датасет с прогнозными значениями Total

линейная регрессия:  919.1065621602611


In [14]:
k=0
for i in range(len(new_ir)-1):
    if new_ir.Increase[i]==new_ir.Increase[i+1]:
        k+=1
dataset_Increase=new_ir[k-history_lag:]

X_train, X_test, y_train, y_test = prepareData(dataset_Increase, test_size=0.20, lag_start=1, lag_end=history_lag)

lr.fit(X_train, y_train)
prediction_lr_i = pd.DataFrame(lr.predict(X_test))

print('линейная регрессия: ', mean_absolute_error(prediction_lr_i,y_test))
predict_Increase=prediction_data(new_ir,history_lag, future_days,lr.predict) # датасет с прогнозными значениями Increase

линейная регрессия:  960.8125402965911


In [15]:
dataset_Death=new_dr[history_lag:]

X_train, X_test, y_train, y_test = prepareData(dataset_Death, test_size=0.25, lag_start=1, lag_end=history_lag)

lr.fit(X_train, y_train)
prediction_lr_d = pd.DataFrame(lr.predict(X_test))

print('линейная регрессия: ', mean_absolute_error(prediction_lr_d,y_test))

predict_Death=prediction_data(new_dr,history_lag, future_days,lr.predict) # датасет с прогнозными значениями Death

линейная регрессия:  61.77877966776312


In [16]:
dataset_Recovery=new_rr[history_lag:]

X_train, X_test, y_train, y_test = prepareData(dataset_Recovery, test_size=0.2, lag_start=1, lag_end=history_lag)

lr.fit(X_train, y_train)
prediction_lr_r = pd.DataFrame(lr.predict(X_test))

print('линейная регрессия: ', mean_absolute_error(prediction_lr_r,y_test))

predict_Recovery=prediction_data(new_rr,history_lag, future_days,lr.predict) # датасет с прогнозными значениями Recovery

линейная регрессия:  934.2846475974271


## Рисуем несколько графиков на одном листе

In [35]:
fig_dash = make_subplots(
    rows=2, cols=2,
    specs=[[{}, {"rowspan": 2}],
           [{}, None]],
    subplot_titles=('Actual: Total vs Increase in '+Country+'. '+ Province,
                    'Prediction in '+Country+'. '+ Province+str(future_days)+' days',
                    'Actual: Deaths vs Recovery in '+Country+ '. '+ Province
                    ),
    
    column_widths=[0.5,0.5],
    horizontal_spacing=0.1,
    vertical_spacing=0.15,
    print_grid=False)


fig_dash.add_trace(go.Scatter(x=df_Country.index,
                              y=df_Country.Total,
                              name='Total'),
                   row=1, col=1)
fig_dash.add_trace(go.Scatter(x=df_Country.index,
                              y=df_Country.Increase,
                              name='Increase'),
                   row=1, col=1)
fig_dash.add_trace(go.Scatter(x=df_Country.index,
                              y=df_Country.Deaths,
                              name='Deaths'),
                   row=2, col=1)
fig_dash.add_trace(go.Scatter(x=df_Country.index,
                              y=df_Country.Recovery,
                              name='Recovery'
                             ),
                   row=2, col=1)
fig_dash.add_trace(go.Scatter(x=predict_Increase[-future_days:].reset_index(drop=True).index,
                              y=predict_Increase['y'][-future_days:].values,
                              name='Prediction_Increase'),
                   row=1, col=2)
fig_dash.add_trace(go.Scatter(x=predict_total[-future_days:].reset_index(drop=True).index,
                              y=predict_total['y'][-future_days:].values,
                              name='Prediction_Total'),
                   row=1, col=2)
fig_dash.add_trace(go.Scatter(x=predict_Death[-future_days:].reset_index(drop=True).index,
                              y=predict_Death['y'][-future_days:].values,
                              name='Prediction_Death'),
                   row=1, col=2)
fig_dash.add_trace(go.Scatter(x=predict_Recovery[-future_days:].reset_index(drop=True).index,
                              y=predict_Recovery['y'][-future_days:].values,
                              name='Prediction_Recovery'),
                   row=1, col=2)
fig_dash.update_layout(
    height=768, width=1440,
    plot_bgcolor='rgb(255,250,245)',
    showlegend = True,
)
plotly.offline.plot(fig_dash, filename='Coronavirus dash in '+Country+Province, show_link=False)

'Coronavirus dash in Russia.html'

# Данные по миру на текущий день

## Подготавливаем данные

In [18]:
df_last_world=pd.concat((df['Country/Region']+' '+df['Province/State'].replace(np.nan,''),
                         df.iloc[:,-1],df_deaths.iloc[:,-1],
                         df['Lat'],df['Long']),axis=1)
df_last_world.columns=['Country','Cases','Deaths','Lat','Long']
All_cases=df_last_world['Cases'].sum()
All_Deaths=df_last_world['Deaths'].sum()

df_last_world.Cases[df_last_world.Cases < 0] = 0
df_last_world.Deaths[df_last_world.Deaths < 0] = 0
df_last_world=df_last_world.sort_values(by='Cases',ascending=False)

## Рисуем мировую карту

In [19]:
df_last_world['text'] = df_last_world['Country'] + '<br>Cases: ' + (df_last_world['Cases']).astype(str)+'<br>Deaths:' + (df_last_world['Deaths']).astype(str)#+'<br>Recovery' + (df_last_world['Recovery']).astype(str)

tc=len(df_last_world)
limits=[(0,int(tc/20)),#топ 5% стран по количеству заражённых
        (int(tc/20+1),int(tc/10)), #5-10% стран 
        (int(tc/10+1),int(tc/3)), #10-33% стран 
        (int(tc/3+1),int(tc*2/3)), #33-66% стран
        (int(tc*2/3+1),int(tc))] #остальные стран

colors = ["purple","darkred","orange","pink","blue","magenta"]
scale = 200
fig_world = go.Figure()

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_last_world[lim[0]:lim[1]]
    fig_world.add_trace(go.Scattergeo(
        locationmode = 'country names',
        lon = df_sub['Long'],
        lat = df_sub['Lat'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['Cases']/scale,
            color = colors[i],
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1])))
fig_world.update_geos(
    projection_type="natural earth",
    visible=False, resolution=50,
    showcountries=True, countrycolor="RebeccaPurple",
    showland=True, landcolor="LightGreen",
    showocean=True, oceancolor="LightBlue",
    showlakes=True, lakecolor="Blue",
)
fig_world.update_layout(
        title_text = 'World coronavirus. Cases: '+str(All_cases)+' Deaths: '+str(All_Deaths),
        showlegend = False,
    )
#fig_world.show()
plotly.offline.plot(fig_world, filename='Coronavirus in the world', show_link=False)

'Coronavirus in the world.html'

import dash
import dash_core_components as dcc
import dash_html_components as html
app = dash.Dash()
app.layout = html.Div([
    html.Div([
        dcc.Graph(figure=fig_country)
    ],style={'width': '50%', 'display': 'inline-block'}),
    html.Div([
        dcc.Graph(figure=fig_world)
    ],style={'width': '50%', 'display': 'inline-block'})    
])
app.run_server(debug=True, use_reloader=False)

## Сравниваем топ стран по заражениям

In [55]:
df_countries_cases_top=top_countries(df=total_df,start=500,top=10)
plotly_df(df_countries_cases_top,title = 'Сравнение динамики заражений лидирующих стран',selected=Country)

## Сравниваем топ стран по смертям

In [56]:
df_countries_death_top=top_countries(df=df_deaths,start=10,top=10)
plotly_df(df_countries_death_top,title = 'Сравнение динамики смертей лидирующих стран',selected=Country)

## Сравниваем топ стран по смертям