In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the dataset from the notebook's input directory and preview the first rows.
csv_path = '../input/2460data/data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Predictor columns: year, week and the three dose columns.
feature_cols = ['year', 'week', 'first_dose_per', 'second_dose_per', 'unknown_dose_per']
X = df.loc[:, feature_cols]

# Targets are kept as single-column frames and cast to integers; the
# classifiers below treat each distinct integer value as a class label.
y_case = df.loc[:, ['case 1']].astype('int')
y_death = df.loc[:, ['death1']].astype('int')

In [None]:
# One-hot encode the country name and append the indicator columns to the features.
dum = pd.get_dummies(df[['Country name']])
# Drop indicator columns left over from an earlier run of this cell first, so
# re-executing the cell does not append a duplicate set of country columns.
X = pd.concat([X.drop(columns=dum.columns, errors='ignore'), dum], axis=1)

# Min-max scale 'year' and 'week' in a single fit. MinMaxScaler scales each
# column independently, so the values match two separate fits, and `scaler`
# stays fitted on both columns instead of only on the last one.
scaler = MinMaxScaler()
X[['year', 'week']] = scaler.fit_transform(X[['year', 'week']])
X.head()

In [None]:
# Replace every remaining missing feature value with 0.
X = X.fillna(0)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

Modelling for new cases

In [None]:
# Hold out 30% of the rows for evaluation of the new-cases models. The seed is
# fixed so the split, and every score computed from it below, is the same on
# each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y_case, test_size = 0.3, random_state = 42)

In [None]:
# Compare the 'gini' and 'entropy' split criteria on the new-cases target,
# leaving every other tree setting at its default.
# Author's note from a previous run: gini score is better

# scikit-learn metric functions take (y_true, y_pred) in that order. With
# average='weighted' the per-class scores are weighted by the true-label
# support, so the argument order changes the reported numbers.
clf_gini = DecisionTreeClassifier(criterion = 'gini')
clf_gini = clf_gini.fit(X_train, y_train)
y_pred_gini = clf_gini.predict(X_test)
accuracy_gini = clf_gini.score(X_test, y_test)  #accuracy
precision_gini = precision_score(y_test, y_pred_gini, average='weighted') #precision
recall_gini = recall_score(y_test, y_pred_gini, average='weighted') #recall
f1_gini = f1_score(y_test, y_pred_gini, average='weighted') #f1

clf_entro = DecisionTreeClassifier(criterion = 'entropy')
clf_entro = clf_entro.fit(X_train, y_train)
y_pred_entro = clf_entro.predict(X_test)
accuracy_entro = clf_entro.score(X_test, y_test)  #accuracy
precision_entro = precision_score(y_test, y_pred_entro, average='weighted') #precision
recall_entro = recall_score(y_test, y_pred_entro, average='weighted') #recall
f1_entro = f1_score(y_test, y_pred_entro, average='weighted') #f1

print(f'gini accuracy: {accuracy_gini}, entropy accuracy: {accuracy_entro}')
print(f'gini precision: {precision_gini}, entropy precision: {precision_entro}')
print(f'gini recall: {recall_gini}, entropy recall: {recall_entro}')
print(f'gini f1: {f1_gini}, entropy f1: {f1_entro}')

In [None]:
# Sweep max_depth over 1..20 for a gini tree on the new-cases target and
# compare accuracy on the training and test splits.
# Author's note from a previous run: the best max_depth is 10
depths = range(1, 21)
train_scores = []
test_scores = []
for depth in depths:
    clf = DecisionTreeClassifier(criterion = 'gini', max_depth = depth).fit(X_train, y_train)
    train_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))

# Highest test accuracy reached over the sweep.
print(max(test_scores))
plt.plot(depths, train_scores, color='red', label='train')
plt.plot(depths, test_scores, color='blue', label='test')
plt.xticks(depths)
plt.legend()
plt.show()

In [None]:
# Sweep min_samples_split over 2..10 for the new-cases target.
# max_depth is held at 10, the depth this notebook uses for its final
# new-cases model; 7 is the depth used in the deaths section.
# Author's note from a previous run: the best min_samples_split is 6
# (re-check the plot after any change to the split or the depth).
train_list = []
test_list = []
for i in range(2,11):
    clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, min_samples_split = i)
    clf = clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    train_list.append(train_score)
    test_score = clf.score(X_test, y_test)
    test_list.append(test_score)

# Highest test accuracy reached over the sweep.
print(max(test_list))
plt.plot(range(2,11), train_list, color='red', label='train')
plt.plot(range(2,11), test_list, color='blue', label='test')
plt.xticks(range(2,11))
plt.legend()
plt.show()

In [None]:
# Sweep min_samples_leaf over 1..10 for the new-cases target.
# max_depth and min_samples_split are held at 10 and 6, the values this
# notebook uses for its final new-cases model; 7 and 8 are the values used
# in the deaths section.
# Author's note from a previous run: the best min_samples_leaf is 2
# (re-check the plot after any change to the split or the fixed settings).
train_list = []
test_list = []
for i in range(1,11):
    clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 6, min_samples_leaf = i)
    clf = clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    train_list.append(train_score)
    test_score = clf.score(X_test, y_test)
    test_list.append(test_score)

# Highest test accuracy reached over the sweep.
print(max(test_list))
plt.plot(range(1,11), train_list, color='red', label='train')
plt.plot(range(1,11), test_list, color='blue', label='test')
plt.xticks(range(1,11))
plt.legend()
plt.show()

In [None]:
# Grid search over criterion, depth, split size and leaf size for the
# new-cases target, scored with 10-fold cross-validation on the training split.
param_grid = {
    'criterion': ('gini', 'entropy'),
    'max_depth': list(range(5, 21)),
    'min_samples_split': list(range(5, 10)),
    'min_samples_leaf': list(range(1, 5)),
}
clf = DecisionTreeClassifier()
GS = GridSearchCV(clf, param_grid, cv=10).fit(X_train, y_train)
print(GS.best_params_)
print(GS.best_score_)

In [None]:
# Fit the tree with the settings chosen for new cases and plot the importance
# the fitted tree assigns to each feature column.
clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 6)
clf = clf.fit(X_train, y_train)
weight = clf.feature_importances_
# One bar per fitted feature: the x positions follow the length of the
# importance vector instead of assuming exactly 35 feature columns.
plt.bar(range(1, len(weight) + 1), weight)

In [None]:
# Evaluate the fitted new-cases tree on the held-out split.
y_pred = clf.predict(X_test)
accuracy = clf.score(X_test, y_test)  #accuracy
# scikit-learn metric functions take (y_true, y_pred) in that order. With
# average='weighted' the per-class scores are weighted by the true-label
# support, so the argument order changes the reported numbers.
precision = precision_score(y_test, y_pred, average='weighted') #precision
recall = recall_score(y_test, y_pred, average='weighted') #recall
f1 = f1_score(y_test, y_pred, average='weighted') #f1

print(f'accuracy: {accuracy}, precision: {precision}, recall: {recall}, f1-measure: {f1}')

Modelling for deaths

In [None]:
# Hold out 30% of the rows for evaluation of the deaths models. The seed is
# fixed so the split, and every score computed from it below, is the same on
# each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y_death, test_size = 0.3, random_state = 42)

In [None]:
# Compare the 'gini' and 'entropy' split criteria on the deaths target,
# leaving every other tree setting at its default.
# Author's note from a previous run: gini score is better

# scikit-learn metric functions take (y_true, y_pred) in that order. With
# average='weighted' the per-class scores are weighted by the true-label
# support, so the argument order changes the reported numbers.
clf_gini = DecisionTreeClassifier(criterion = 'gini')
clf_gini = clf_gini.fit(X_train, y_train)
y_pred_gini = clf_gini.predict(X_test)
accuracy_gini = clf_gini.score(X_test, y_test)  #accuracy
precision_gini = precision_score(y_test, y_pred_gini, average='weighted') #precision
recall_gini = recall_score(y_test, y_pred_gini, average='weighted') #recall
f1_gini = f1_score(y_test, y_pred_gini, average='weighted') #f1

clf_entro = DecisionTreeClassifier(criterion = 'entropy')
clf_entro = clf_entro.fit(X_train, y_train)
y_pred_entro = clf_entro.predict(X_test)
accuracy_entro = clf_entro.score(X_test, y_test)  #accuracy
precision_entro = precision_score(y_test, y_pred_entro, average='weighted') #precision
recall_entro = recall_score(y_test, y_pred_entro, average='weighted') #recall
f1_entro = f1_score(y_test, y_pred_entro, average='weighted') #f1

print(f'gini accuracy: {accuracy_gini}, entropy accuracy: {accuracy_entro}')
print(f'gini precision: {precision_gini}, entropy precision: {precision_entro}')
print(f'gini recall: {recall_gini}, entropy recall: {recall_entro}')
print(f'gini f1: {f1_gini}, entropy f1: {f1_entro}')

In [None]:
# Sweep max_depth over 1..20 for a gini tree on the deaths target and
# compare accuracy on the training and test splits.
# Author's note from a previous run: the best max_depth is 7
depths = range(1, 21)
train_scores = []
test_scores = []
for depth in depths:
    clf = DecisionTreeClassifier(criterion = 'gini', max_depth = depth).fit(X_train, y_train)
    train_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))

# Highest test accuracy reached over the sweep.
print(max(test_scores))
plt.plot(depths, train_scores, color='red', label='train')
plt.plot(depths, test_scores, color='blue', label='test')
plt.xticks(depths)
plt.legend()
plt.show()

In [None]:
# Sweep min_samples_split over 2..10 for the deaths target with max_depth
# held at 7, and compare accuracy on the training and test splits.
# Author's note from a previous run: the best min_samples_split is 8
split_sizes = range(2, 11)
train_scores = []
test_scores = []
for split_size in split_sizes:
    clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 7, min_samples_split = split_size).fit(X_train, y_train)
    train_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))

# Highest test accuracy reached over the sweep.
print(max(test_scores))
plt.plot(split_sizes, train_scores, color='red', label='train')
plt.plot(split_sizes, test_scores, color='blue', label='test')
plt.xticks(split_sizes)
plt.legend()
plt.show()

In [None]:
# Sweep min_samples_leaf over 1..10 for the deaths target with max_depth held
# at 7 and min_samples_split at 8, comparing train and test accuracy.
# Author's note from a previous run: the best min_samples_leaf is 1
leaf_sizes = range(1, 11)
train_scores = []
test_scores = []
for leaf_size in leaf_sizes:
    clf = DecisionTreeClassifier(
        criterion = 'gini', max_depth = 7, min_samples_split = 8, min_samples_leaf = leaf_size
    ).fit(X_train, y_train)
    train_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))

# Highest test accuracy reached over the sweep.
print(max(test_scores))
plt.plot(leaf_sizes, train_scores, color='red', label='train')
plt.plot(leaf_sizes, test_scores, color='blue', label='test')
plt.xticks(leaf_sizes)
plt.legend()
plt.show()

In [None]:
# Grid search over criterion, depth, split size and leaf size for the deaths
# target, scored with 10-fold cross-validation on the training split.
param_grid = {
    'criterion': ('gini', 'entropy'),
    'max_depth': list(range(5, 21)),
    'min_samples_split': list(range(5, 10)),
    'min_samples_leaf': list(range(1, 5)),
}
clf = DecisionTreeClassifier()
GS = GridSearchCV(clf, param_grid, cv=10).fit(X_train, y_train)
print(GS.best_params_)
print(GS.best_score_)

In [None]:
# Fit the deaths tree and plot the importance the fitted tree assigns to each
# feature column. max_depth = 7 and min_samples_split = 8 are the values this
# notebook's deaths sweeps settle on; 10 and 6 are the new-cases settings.
# NOTE(review): confirm against the grid-search output for deaths.
clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 7, min_samples_split = 8)
clf = clf.fit(X_train, y_train)
weight = clf.feature_importances_
# One bar per fitted feature: the x positions follow the length of the
# importance vector instead of assuming exactly 35 feature columns.
plt.bar(range(1, len(weight) + 1), weight)

In [None]:
# Evaluate the fitted deaths tree on the held-out split.
y_pred = clf.predict(X_test)
accuracy = clf.score(X_test, y_test)  #accuracy
# scikit-learn metric functions take (y_true, y_pred) in that order. With
# average='weighted' the per-class scores are weighted by the true-label
# support, so the argument order changes the reported numbers.
precision = precision_score(y_test, y_pred, average='weighted') #precision
recall = recall_score(y_test, y_pred, average='weighted') #recall
f1 = f1_score(y_test, y_pred, average='weighted') #f1

print(f'accuracy: {accuracy}, precision: {precision}, recall: {recall}, f1-measure: {f1}')