In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [None]:
# Load the case records; the path is resolved relative to the notebook's working directory.
csv_path = "cases_malaysia.csv"
data = pd.read_csv(csv_path)
data  # display the loaded frame

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
data.shape

In [None]:
# Rows dated from 1 Jan 2022 to 30 Jun 2022, both ends inclusive.
# 'date' is compared as an ISO-formatted string, so lexical order equals calendar order.
data.loc[data['date'].between('2022-01-01', '2022-06-30')]


In [None]:
# Column-wise totals over the 1 Jan 2022 - 30 Jun 2022 window (both ends inclusive).
in_window = data['date'].between('2022-01-01', '2022-06-30')
data[in_window].sum()


In [None]:
# (rows, columns) of the 1 Jan 2022 - 30 Jun 2022 subset.
window_rows = data[data['date'].between('2022-01-01', '2022-06-30')]
window_rows.shape

In [None]:
# Keep the January-June 2022 subset as `data_new`; the later cells (correlation, modelling) start from it.
first_half_2022 = data['date'].between('2022-01-01', '2022-06-30')
data_new = data[first_half_2022]

In [None]:
# Display the January-June 2022 subset so it can be checked by eye against the full `data` frame
data_new

In [None]:
# Plot a correlation heat map of the numeric columns of the January-June 2022 subset.
plt.figure(figsize=(35,35))
# `data_new` still contains the string 'date' column. Keep only numeric/bool columns so
# corr() does not fail on pandas versions that no longer drop non-numeric columns silently.
cor = data_new.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.PiYG, vmin=-1, vmax=1)
plt.show()

In [None]:
# Each row of `data_new` is one dated record, so len() counts records, not users.
total_records = len(data_new)
Total_Users = total_records  # original name kept so any existing reference still resolves
print('There are', total_records, 'total records')
# Summary statistics of the January-June 2022 subset
data_new.describe()

In [None]:
# Aggregate the January-June 2022 rows per calendar date: one output row per distinct
# 'date', with every other column summed. 'date' stays a regular column (as_index=False).
rows_by_date = data_new.groupby('date', as_index=False)
data_date_6months = rows_by_date.sum()
data_date_6months

In [None]:
# Split the per-date totals into one DataFrame per month (January to June 2022).
def _rows_between(frame, first_day, last_day):
    """Return the rows of `frame` whose 'date' string lies in [first_day, last_day], inclusive."""
    return frame[frame['date'].between(first_day, last_day)]


data_date_jan = _rows_between(data_date_6months, '2022-01-01', '2022-01-31')
data_date_feb = _rows_between(data_date_6months, '2022-02-01', '2022-02-28')
data_date_mar = _rows_between(data_date_6months, '2022-03-01', '2022-03-31')
data_date_apr = _rows_between(data_date_6months, '2022-04-01', '2022-04-30')
data_date_may = _rows_between(data_date_6months, '2022-05-01', '2022-05-31')
data_date_jun = _rows_between(data_date_6months, '2022-06-01', '2022-06-30')

In [None]:
# Collapse each month's daily rows into a single row of column totals.
def _month_total(daily_rows, month_number):
    """Sum every column of `daily_rows` into one row whose 'date' value is `month_number`.

    `assign` returns a new frame, so the caller's DataFrame keeps its real dates.
    Aliasing the slice and writing to its 'date' column would overwrite them instead.
    """
    relabelled = daily_rows.assign(date=month_number)
    return relabelled.groupby('date', as_index=False).sum()


data_jan = _month_total(data_date_jan, 1)  # January
data_feb = _month_total(data_date_feb, 2)  # February
data_mar = _month_total(data_date_mar, 3)  # March
data_apr = _month_total(data_date_apr, 4)  # April
data_may = _month_total(data_date_may, 5)  # May
data_jun = _month_total(data_date_jun, 6)  # June

In [None]:
# Stack the six monthly totals into one table (one row per month).
# The totals can then be compared with the per-date table to confirm no cases were lost.
monthly_rows = [data_jan, data_feb, data_mar, data_apr, data_may, data_jun]
data_6months = (
    pd.concat(monthly_rows, ignore_index=True)
    .rename(columns={'date': 'month'})  # the 'date' column now holds the month number
)
data_6months

In [None]:
# Plot a correlation heat map of the per-date totals (January to June 2022).
plt.figure(figsize=(35,35))
# 'date' is a string column in `data_date_6months`. Keep only numeric/bool columns so
# corr() does not fail on pandas versions that no longer drop non-numeric columns silently.
cor = data_date_6months.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.PiYG, vmin=-1, vmax=1)
plt.show()

In [None]:
# Correlation heat map of the monthly totals (one row per month, January to June 2022).
plt.figure(figsize=(25, 25))
cor = data_6months.corr()
sns.heatmap(
    data=cor,
    annot=True,          # print the coefficient inside every cell
    cmap=plt.cm.PiYG,
    vmin=-1,
    vmax=1,              # pin the colour scale to the full correlation range
)
plt.show()

In [None]:
# Summary statistics of the per-date totals for January to June 2022
data_date_6months.describe()

In [None]:
# Build the modelling table: the per-date totals without the 'date' column,
# which is not used as a feature by the K-nearest-neighbours model.
data_181 = data_date_6months.drop(columns="date")
data_181

In [None]:
# K-nearest neighbours needs scaled data, so every column is rescaled with min-max scaling
# (values end up between 0 and 1).
msc = MinMaxScaler()
scaled_values = msc.fit_transform(data_181)  # fit_transform returns a NumPy array, not a DataFrame
data_181 = pd.DataFrame(scaled_values, columns=data_181.columns)

In [None]:
# Features are every column except the regression target 'cases_active'.
knn_target = 'cases_active'
x_cols = [col for col in data_181.columns if col != knn_target]

x_data = data_181[x_cols]      # features
y_data = data_181[knn_target]  # target

# Hold out 30% of the rows for testing (a single 70:30 split with a fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.3, random_state=42
)

In [None]:
# Mean of the scaled target and mean of its complement (1 - target).
target_mean = np.mean(y_data)
target_mean, np.mean(1 - y_data)

In [None]:
from sklearn.neighbors import KNeighborsRegressor  # regressor, because the target is continuous
start_time = time.time()

# Candidate neighbourhood sizes to try: k = 1 .. 19
param = dict(n_neighbors=np.arange(1, 20))

# Base estimator: predicts the output from a local average of the nearest neighbours
knn = KNeighborsRegressor()

# Search every candidate k with 5-fold cross-validation on the training split
knn_cv = GridSearchCV(estimator=knn, param_grid=param, cv=5)
knn_cv.fit(x_train, y_train)

# Report the selected k and the score it achieved
print("Best K-neighbor: " , knn_cv.best_params_)
print("Best score achieved: " , knn_cv.best_score_)

In [None]:
# Evaluate the grid-searched model on the held-out test split
y_pred = knn_cv.predict(x_test)
# NOTE(review): score() here is the estimator's default metric; for a KNeighborsRegressor
# that should be R^2 rather than a classification accuracy - confirm against the sklearn docs
score = knn_cv.score(x_test, y_test)

def accuracy(real, predict):
    """Return the fraction of positions at which `predict` equals `real` exactly.

    Parameters
    ----------
    real : array-like
        Ground-truth values.
    predict : array-like
        Predicted values, same length as `real`.

    Returns
    -------
    float
        Number of exact matches divided by the number of elements in `real`.

    Notes
    -----
    Values are compared position by position after conversion to NumPy arrays,
    so any pandas index on the inputs is ignored.
    """
    # Compare the arguments themselves, not whatever the notebook-level names
    # `y_data` / `y_pred` happen to be bound to when the function is called.
    real_values = np.asarray(real)
    predicted_values = np.asarray(predict)
    return np.sum(real_values == predicted_values) / float(real_values.shape[0])

# NOTE(review): `score` comes from knn_cv.score(x_test, y_test), so the "Accuracy" label
# presumably reports the regressor's default metric (R^2) - confirm before quoting it
print("Accuracy score of y_pred:", score)
# Raw predictions for the test rows
print(y_pred)

In [None]:
# Fit a K-nearest-neighbours regressor for every k from 1 to 20 and record accuracy() for each.
# Every model is fitted on, and scored against, the full x_data / y_data.
score_list = []
for k in range(1, 21):  # k = 1 .. 20 (the upper bound of range() is exclusive)
    knn = KNeighborsRegressor(n_neighbors=k).fit(x_data, y_data)
    y_pred = knn.predict(x_data)  # rebinds the notebook-level `y_pred` on every pass
    score = accuracy(y_data, y_pred)
    score_list.append((k, score))

score_data = pd.DataFrame(score_list, columns=['k', 'accuracy'])

# Plot styling
sns.set_context('talk')
sns.set_style('ticks')
sns.set_palette('dark')

# Accuracy as a function of k, with one tick per tested k
ax = score_data.set_index('k').plot()
ax.set_xlabel('k')
ax.set_ylabel('accuracy')
ax.set_xticks(range(1, 21));

**Linear Regression**

In [None]:
# Drop the 'date' column, which is not used as a feature. errors='ignore' keeps this cell
# re-runnable: on a second run 'date' is already gone and a plain drop would raise KeyError.
data_new = data_new.drop(columns=['date'], errors='ignore')

# Target and feature columns for the linear-regression experiments
y_col = 'cases_active'
x_cols = [col for col in data_new.columns if col != y_col]

X_data = data_new[x_cols]  # features: every remaining column except the target
y_data = data_new[y_col]   # target

# Single 70:30 hold-out split. The seed makes the reported errors repeatable and
# matches the seed used for the KNN split earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.3, random_state=42
)

In [None]:
# Fit an ordinary linear regression on the unscaled training features
lr = LinearRegression().fit(X_train, y_train)

# Predictions for both splits
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Mean squared error on the training and test splits (printed in that order)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print(mse_train, mse_test)

**LR with scaler**

In [None]:
# Refit the linear regression once per feature scaler and collect the test predictions.
lr = LinearRegression()

scalers = {'Standard': StandardScaler(),
           'Minmax': MinMaxScaler(),
           'Maxabs': MaxAbsScaler()}

predList = []  # test-set predictions, in the same order as `scalers`
for label, scaler in scalers.items():
    trainingset = X_train.copy()
    testset = X_test.copy()

    # Learn the scaling statistics from the training split only, then apply that same
    # transform to the test split. Re-fitting on the test split would put train and test
    # features on different scales and leak test-set statistics into the evaluation.
    trainingset[x_cols] = scaler.fit_transform(trainingset[x_cols])
    testset[x_cols] = scaler.transform(testset[x_cols])

    lr.fit(trainingset, y_train)

    test_pred = lr.predict(testset)
    predList.append(test_pred)

    mse = mean_squared_error(y_test, test_pred)

    print(label,"Scaler - ",mse)

In [None]:
# Scatter every model's test predictions against the ground truth
ax = plt.axes()

# Identity line: a perfect prediction would sit exactly on it
ax.plot(y_test, y_test, c="black", alpha=0.5)

# Unscaled model (default colour), then the three scaled variants in predList order
ax.scatter(y_test, y_test_pred, alpha=0.5)
for position, colour in enumerate(("red", "yellow", "green")):
    ax.scatter(y_test, predList[position], alpha=0.5, c=colour)

ax.set_xlabel('Ground truth')
ax.set_ylabel('Predictions')
ax.set_title('Linear Regression');

In [None]:
# Draw a 2x2 grid with one panel per model: the identity line (black) and the
# least-squares line through (ground truth, prediction) in red.
x = y_test  # ground truth is on the horizontal axis of every panel

# Panel 1 is the unscaled model; panels 2-4 follow the order of `scalers` / `predList`.
panels = [('No scaler', y_test_pred)]
panels += [(label, predList[position]) for position, label in enumerate(scalers)]

for slot, (panel_title, panel_pred) in enumerate(panels):
    ax = plt.subplot(221 + slot)
    # Hide tick labels; only the slope relative to the identity line matters here
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])

    ax.plot(x, x, c="black", alpha=0.3)

    # Degree-1 polynomial fit: slope `a`, intercept `b`
    a, b = np.polyfit(x, panel_pred, 1)
    ax.plot(x, a*x + b, c="red", alpha=0.5)
    ax.title.set_text(panel_title)

In [None]:
#Define test function
def LRTest(data, target, test_size=0.2, random_state=None):
    """Fit linear regression with and without feature scaling on one random split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the target column.
    target : str
        Name of the target column; every other column is used as a feature.
    test_size : float, default 0.2
        Fraction of rows held out for testing (passed to ``train_test_split``).
    random_state : int or None, default None
        Seed for the split. ``None`` draws a different split on every call.

    Returns
    -------
    list of float
        Test-set mean squared errors in the order
        ``[no scaling, Standard, Minmax, Maxabs]``.
    """
    x_cols = [col for col in data.columns if col != target]
    X_data = data[x_cols]
    y_data = data[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=test_size, random_state=random_state
    )

    mseList = []

    # Baseline: unscaled features
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    mseList.append(mean_squared_error(y_test, lr.predict(X_test)))

    # Same model, once per feature scaler
    scalers = {'Standard': StandardScaler(),
               'Minmax': MinMaxScaler(),
               'Maxabs': MaxAbsScaler()}

    for scaler in scalers.values():
        trainingset = X_train.copy()
        testset = X_test.copy()

        # Fit the scaler on the training split only and reuse it for the test split,
        # so both splits share one scale and no test statistics leak into the model.
        trainingset[x_cols] = scaler.fit_transform(trainingset[x_cols])
        testset[x_cols] = scaler.transform(testset[x_cols])

        lr.fit(trainingset, y_train)
        mseList.append(mean_squared_error(y_test, lr.predict(testset)))

    return mseList

In [None]:
# Repeat the random-split experiment and average the test MSE of each variant.
n_runs = 100

# Running sums of the test MSE per variant
nsTotal=0   # no scaling
stdTotal=0  # Standard scaler
mmTotal=0   # Minmax scaler
maTotal=0   # Maxabs scaler

for i in range (n_runs):
    print("Test ",i+1)
    mseList=LRTest(data_new,'cases_active')
    print("MSE for no scaling",mseList[0])
    print("MSE for Standard scaling",mseList[1])
    print("MSE for Minmax scaling",mseList[2])
    print("MSE for Maxabs scaling",mseList[3])

    nsTotal+=mseList[0]
    stdTotal+=mseList[1]
    mmTotal+=mseList[2]
    maTotal+=mseList[3]

# Divide by the number of runs. The loop index ends at n_runs - 1, so dividing by it
# would overstate every average.
print("Average MSE for no scaling: ",nsTotal/n_runs)
print("Average MSE for Standard scaling: ",stdTotal/n_runs)
print("Average MSE for Minmax scaling: ",mmTotal/n_runs)
print("Average MSE for Maxabs scaling: ",maTotal/n_runs)