In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from IPython.display import Image, HTML, display
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import datetime
import re
from pandas_profiling import ProfileReport
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

  import pandas.util.testing as tm


## Use API for Coronavirus Data

In [2]:
URL = "https://wuhan-coronavirus-api.laeyoung.endpoint.ainize.ai/jhu-edu/timeseries"
r = requests.get(url = URL)
data = r.json()
data

[{'provincestate': '',
  'countryregion': 'Afghanistan',
  'lastupdate': '2020-04-28T15:42:00.003Z',
  'location': {'lat': 33, 'lng': 65},
  'countrycode': {'iso2': 'AF', 'iso3': 'AFG'},
  'timeseries': {'1/22/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '1/23/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '1/24/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '1/25/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '1/26/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '1/27/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '1/28/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '1/29/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '1/30/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '1/31/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '2/1/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '2/2/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '2/3/20': {'confirmed': 0, 'deaths': 0, 'recovered': 0},
   '2/4/2

In [3]:
cols = ['Country','Province_State','Last_Update','Date','Latitude','Longitude', 'Confirmed', 'Deaths', 'Recovered']
corona = pd.DataFrame(columns=cols)

for i in range(len(data)):
    for j in data[i]['timeseries'].keys():
        corona = corona.append({'Country':data[i]['countryregion'],
                'Province_State':data[i]['provincestate'],
                'Last_Update':data[i]['lastupdate'],
                'Latitude':data[i]['location']['lat'],
                'Longitude':data[i]['location']['lng'],
                'Date': j,
                'Confirmed':data[i]['timeseries'][j].get('confirmed'),
                'Deaths':data[i]['timeseries'][j].get('deaths'),
                'Recovered':data[i]['timeseries'][j].get('recovered')}, ignore_index=True)
# display(HTML(corona.to_html()))

In [4]:
corona = corona.rename(columns={"Confirmed": "Cases"})

In [5]:
# latitude and longtitude are according to province
corona['Date'] = pd.to_datetime(corona['Date']).dt.date
corona['Last_Update'] = pd.to_datetime(corona['Last_Update']).dt.date
corona = corona[['Country', 'Province_State', 'Date', 'Latitude', 'Longitude', 'Cases', 'Deaths', 'Recovered']]
corona.sort_values(by=['Date', 'Cases'], ascending=[True, False], inplace=True)
corona = corona.reset_index(drop=True)
corona.head()

Unnamed: 0,Country,Province_State,Date,Latitude,Longitude,Cases,Deaths,Recovered
0,China,Hubei,2020-01-22,30.9756,112.271,444,17,28
1,China,Guangdong,2020-01-22,23.3417,113.424,26,0,0
2,China,Beijing,2020-01-22,40.1824,116.414,14,0,0
3,China,Zhejiang,2020-01-22,29.1832,120.093,10,0,0
4,China,Shanghai,2020-01-22,31.202,121.449,9,0,0


In [6]:
corona['Cases'] = pd.to_numeric(corona['Cases'])
corona['Deaths'] = pd.to_numeric(corona['Deaths'])
corona['Recovered'] = pd.to_numeric(corona['Recovered'])
corona['Latitude'] = pd.to_numeric(corona['Latitude'])
corona['Longitude'] = pd.to_numeric(corona['Longitude'])

In [7]:
# fill nan value with interpolation
corona["Cases"] = corona["Cases"].interpolate(method ='linear', limit_direction ='forward')
corona["Deaths"] = corona["Deaths"].interpolate(method ='linear', limit_direction ='forward') 

In [8]:
# check for missing values 
corona.isna().sum() 

Country           0
Province_State    0
Date              0
Latitude          0
Longitude         0
Cases             0
Deaths            0
Recovered         0
dtype: int64

## Useful functions

In [9]:
def evaluate(y_true, y_pred, print_cm=False):
    # calculate and display confusion matrix
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    if print_cm:
        print('Confusion matrix\n- x-axis is true labels (none, comp1, etc.)\n- y-axis is predicted labels')
        print(cm)

    # calculate precision, recall, and F1 score
    accuracy = float(np.trace(cm)) / np.sum(cm)
    precision = precision_score(y_true, y_pred, average=None, labels=labels)[1]
    recall = recall_score(y_true, y_pred, average=None, labels=labels)[1]
    f1 = 2 * precision * recall / (precision + recall)
    print("accuracy:", accuracy)
    print("precision:", precision)
    print("recall:", recall)
    print("f1 score:", f1)

In [10]:
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics import roc_curve, auc
%matplotlib inline
def plot_roc_curve(fprs, tprs):
    """Plot the Receiver Operating Characteristic from a list
    of true positive rates and false positive rates."""
    
   
    tprs_interp = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    f, ax = plt.subplots(figsize=(14,10))
    
    for i, (fpr, tpr) in enumerate(zip(fprs, tprs)):
        tprs_interp.append(np.interp(mean_fpr, fpr, tpr))
        tprs_interp[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        ax.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i+1, roc_auc))
        

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)
    

    mean_tpr = np.mean(tprs_interp, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)
    

    std_tpr = np.std(tprs_interp, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')
    

    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic')
    ax.legend(loc="lower right")
    plt.show()
    return (f, ax)

def compute_roc_auc(index, X, y):
    y_predict = clf.predict_proba(X.iloc[index])[:,1]
    fpr, tpr, thresholds = roc_curve(y.iloc[index], y_predict)
    auc_score = auc(fpr, tpr)
    y_true = y.iloc[index]
    return fpr, tpr, auc_score, y_true, y_predict

## Machine Learning

In [11]:
dfM = corona[['Country', 'Date', 'Cases', 'Deaths', 'Recovered']]

dfCategorical = ['Country', 'Date']
for feature in dfCategorical:
    dfM[feature] = dfM[feature].astype('category')
print(dfM.dtypes)
dfNumerical = ['Cases','Deaths','Recovered']
dfM[dfNumerical] = (dfM[dfNumerical] - dfM[dfNumerical].mean()) / dfM[dfNumerical].std()
y = dfM[['Cases']]
X = dfM[['Country', 'Date', 'Deaths', 'Recovered']]
X_one = pd.get_dummies(X, columns=dfCategorical)
X_one.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Country      category
Date         category
Cases         float64
Deaths        float64
Recovered       int64
dtype: object


Unnamed: 0,Deaths,Recovered,Country_Afghanistan,Country_Albania,Country_Algeria,Country_Andorra,Country_Angola,Country_Antigua and Barbuda,Country_Argentina,Country_Armenia,...,Date_2020-04-18 00:00:00,Date_2020-04-19 00:00:00,Date_2020-04-20 00:00:00,Date_2020-04-21 00:00:00,Date_2020-04-22 00:00:00,Date_2020-04-23 00:00:00,Date_2020-04-24 00:00:00,Date_2020-04-25 00:00:00,Date_2020-04-26 00:00:00,Date_2020-04-27 00:00:00
0,-0.090686,-0.115005,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.100659,-0.120201,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.100659,-0.120201,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.100659,-0.120201,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.100659,-0.120201,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_one = y.apply(lambda col: le.fit_transform(col))
y_one.head()
for feature in y_one.columns:
    y_one[feature] = y_one[feature].astype('category')
y_one.dtypes

Cases    category
dtype: object

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
clf = RandomForestClassifier()

cv = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
fprs, tprs, scores = [], [], []

def compute_roc_auc(index, X, y):
    y_predict = clf.predict_proba(X.iloc[index])[:,1]
    fpr, tpr, thresholds = roc_curve(y.iloc[index], y_predict)
    auc_score = auc(fpr, tpr)
    y_true = y.iloc[index]
    return fpr, tpr, auc_score, y_true, y_predict
    
for (train, test), i in zip(cv.split(X_one, y_one), range(5)):        #BURA
    clf.fit(X_one.iloc[train], y_one.iloc[train])                    #BURA
    _, _, auc_score_train, _, _, = compute_roc_auc(train, X_one, y_one)      #BURA
    fpr, tpr, auc_score, y_true, y_predict = compute_roc_auc(test, X_one, y_one)    #BURA
    scores.append((auc_score_train, auc_score))
    fprs.append(fpr)
    tprs.append(tpr)
    print(evaluate(y_true, y_predict.round(), print_cm=False))

plot_roc_curve(fprs, tprs);
pd.DataFrame(scores, columns=['AUC Train', 'AUC Test'])



MemoryError: could not allocate 801374208 bytes

In [None]:
import imblearn
oversample = imblearn.over_sampling.RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X_one, y_one)
print("Shape of data set of X before oversampling is: %s and %s"%(X_one.shape, y_one.shape))
print("Shapes of classes before oversampling is: %s and %s"%(len(y_one.loc[y_one.Cases == 1]), len(y_one.loc[y_one.Cases == 0])))
print("Shape of oversampled data set of X is: %s and %s"%(X_over.shape, y_over.shape))
print("Shapes of classes after oversampling is: %s and %s"%(len(y_over.loc[y_over.Cases == 1]), len(y_over.loc[y_over.Cases == 0])))
undersample = imblearn.under_sampling.RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(X_one, y_one)
print("Shape of data set of X before undersampling is: %s and %s"%(X_one.shape, y_one.shape))
print("Shapes of classes before undersampling is: %s and %s"%(len(y_one.loc[y_one.Cases == 1]), len(y_one.loc[y_one.Cases == 0])))
print("Shape of undersampled data set of X is: %s and %s"%(X_under.shape, y_under.shape))
print("Shapes of classes after undersampling is: %s and %s"%(len(y_under.loc[y_under.Cases == 1]), len(y_under.loc[y_under.Cases == 0])))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
clf = RandomForestClassifier()

cv = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
fprs, tprs, scores = [], [], []

def compute_roc_auc(index, X, y):
    y_predict = clf.predict_proba(X.iloc[index])[:,1]
    fpr, tpr, thresholds = roc_curve(y.iloc[index], y_predict)
    auc_score = auc(fpr, tpr)
    y_true = y.iloc[index]
    return fpr, tpr, auc_score, y_true, y_predict
    
for (train, test), i in zip(cv.split(X_over, y_over), range(3)):        
    clf.fit(X_over.iloc[train], y_over.iloc[train])                    
    _, _, auc_score_train, _, _, = compute_roc_auc(train, X_over, y_over)      
    fpr, tpr, auc_score, y_true, y_predict = compute_roc_auc(test, X_over, y_over)    
    scores.append((auc_score_train, auc_score))
    fprs.append(fpr)
    tprs.append(tpr)
    print(evaluate(y_true, y_predict.round(), print_cm=False))

plot_roc_curve(fprs, tprs);
pd.DataFrame(scores, columns=['AUC Train', 'AUC Test'])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
clf = RandomForestClassifier()

cv = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
fprs, tprs, scores = [], [], []

def compute_roc_auc(index, X, y):
    y_predict = clf.predict_proba(X.iloc[index])[:,1]
    fpr, tpr, thresholds = roc_curve(y.iloc[index], y_predict)
    auc_score = auc(fpr, tpr)
    y_true = y.iloc[index]
    return fpr, tpr, auc_score, y_true, y_predict
    
for (train, test), i in zip(cv.split(X_under, y_under), range(3)):        #BURA
    clf.fit(X_under.iloc[train], y_under.iloc[train])                    #BURA
    _, _, auc_score_train, _, _, = compute_roc_auc(train, X_under, y_under)      #BURA
    fpr, tpr, auc_score, y_true, y_predict = compute_roc_auc(test, X_under, y_under)    #BURA
    scores.append((auc_score_train, auc_score))
    fprs.append(fpr)
    tprs.append(tpr)
    print(evaluate(y_true, y_predict.round(), print_cm=False))

plot_roc_curve(fprs, tprs);
pd.DataFrame(scores, columns=['AUC Train', 'AUC Test'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.3, random_state=0)

rfc = RandomForestClassifier() 

param_grid = {
              "n_estimators": [100, 300],
              "max_depth": [None, 100],
              "min_samples_split": [2, 5],
              "min_samples_leaf": [1, 5, 10],
              'n_jobs' : [-1],  
             }

#cv = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)
grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=3)

grid_result = grid.fit(X_train, y_train)  #BURA

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))   
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
y_true, y_pred = y_test, grid.predict(X_test)
print(classification_report(y_true, y_pred))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.3, random_state=0)

rfc = RandomForestClassifier(n_estimators=300, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_jobs=-1) 
rf = rfc.fit(X_train, y_train) 
y_true, y_pred = y_test, rf.predict(X_test)
print(classification_report(y_true, y_pred))
print(evaluate(y_true, y_pred, print_cm=False))