In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

import math
import seaborn as sns
import multiprocessing
from multiprocessing import Process

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [2]:
class Result:
    def __init__(self, dataset_type, random_state, test_size, algorithm_name, fit_time,
                 figure, r_square, mea, mse, rmse, k):
        self.dataset_type = dataset_type
        self.random_state = random_state
        self.test_size = test_size
        self.algorithm_name = algorithm_name
        self.fit_time = fit_time
        self.figure = figure
        self.r_square = r_square
        self.mea = mea
        self.mse = mse
        self.rmse = rmse
        self.k = k
    
    def print_object(self):
        print(f"""
        Algorithm: {self.algorithm_name}
        Dataset: {self.dataset_type}
        Test Size: {self.test_size}
        Random State: {self.random_state}
        Fit Time: {self.fit_time}
        Figure Path: {self.figure}
        R Square: {self.r_square}
        MEA: {self.mea}
        MSE: {self.mse}
        RMSE: {self.rmse}
        K: {self.k}\n\n
        """)

In [3]:
results1 = multiprocessing.Manager().list()
results2 = multiprocessing.Manager().list()
results3 = multiprocessing.Manager().list()

In [4]:
def runInParallel(*fns):
    proc = []
    for fn in fns:
        p = Process(target=fn)
        p.start()
        proc.append(p)
    for p in proc:
        p.join()

In [5]:
def standard_scaler(dataset):
    columns = dataset.columns.to_list()
    scaler = StandardScaler() 
    scaled = scaler.fit_transform(dataset.to_numpy())
    return pd.DataFrame(scaled, columns=columns)

In [6]:
def min_max_scaler(dataset):
    columns = dataset.columns.to_list()
    scaler = MinMaxScaler() 
    scaled = scaler.fit_transform(dataset.to_numpy())
    return pd.DataFrame(scaled, columns=columns)

In [7]:
data1 = pd.read_csv('process.csv')
data2 = standard_scaler(data1)
data3 = min_max_scaler(data1)

In [8]:
def plotGraph(y_test, y_pred, regressorName, figName):
    plt.figure(figsize=(10,10))
    plt.scatter(y_test,y_pred, c='crimson')
    plt.yscale('log')
    plt.xscale('log')

    p1 = max(max(y_pred), max(y_test))
    p2 = min(min(y_pred), min(y_test))
    plt.plot([p1, p2], [p1, p2], 'b-')
    plt.xlabel('True Values', fontsize=15)
    plt.ylabel('Predictions', fontsize=15)
    plt.axis('equal')
    plt.savefig(figName, format="svg")
    plt.close()

In [9]:
def split_test(rand, size, dataset):
    y = dataset['new_cases'].values
    x = dataset.drop(['new_cases'], axis=1).values
    return train_test_split(x, y, random_state=rand, test_size=size)

In [10]:
def liner_test(name, rand, size, X_train, X_test, y_train, y_test):
        start = datetime.datetime.now()
        
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        
        end = datetime.datetime.now()
        
        figureName = f"LN-{name}-{rand}-{size}"
        plotGraph(y_test, y_pred, 'Linear Regression', figureName)

        r_square = (1 - ((1 - metrics.r2_score(y_test, y_pred)) * (len(y_test) - 1)) / (len(y_test) - X_train.shape[1] - 1))
        mea = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
        
        res = Result(name, rand, size, "LinearRegression", (end-start), figureName, r_square, mea, mse, rmse, 0)
        results1.append(res)

In [11]:
def rand_forest(name, rand, size, X_train, X_test, y_train, y_test):
    start = datetime.datetime.now()
    
    rf = RandomForestRegressor(n_estimators = 1000 - rand, random_state = rand + 10)
    rf.fit(X_train, y_train);
    y_pred = rf.predict(X_test)
    
    end = datetime.datetime.now()
    
    figureName = f"RF-{name}-{rand}-{size}"
    plotGraph(y_test, y_pred, 'Random Forest', figureName)
    r_square = (1 - ((1 - metrics.r2_score(y_test, y_pred)) * (len(y_test) - 1)) / (len(y_test) - X_train.shape[1] - 1))
    mea = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    
    res = Result(name, rand, size, "RandomForest", (end-start), figureName, r_square, mea, mse, rmse, 0)
    results2.append(res)

In [12]:
def knn_regression(name, rand, size, X_train, X_test, y_train, y_test):
    start = datetime.datetime.now()
    for K in range(20):
        K = K+1
        
        clf = KNeighborsRegressor(n_neighbors = K)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        end = datetime.datetime.now()
        
        figureName = f"RF-{name}-{rand}-{size}-{K}"
        plotGraph(y_test, y_pred, f"KNN with K {K}", figureName)
        
        r_square = (1 - ((1 - metrics.r2_score(y_test, y_pred)) * (len(y_test) - 1)) / (len(y_test) - X_train.shape[1] - 1))
        mea = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
        
        res = Result(name, rand, size, "KNN", (end-start), figureName, r_square, mea, mse, rmse, K)
        results3.append(res)

In [13]:
def test1():
    start = datetime.datetime.now()
    first_time = True
    for i in range(0, 11):        
        if first_time:
            rand = 0
            size = 0.05
            first_time = False
        else:
            rand = i * 5
            size = i * 0.05
        
        X_train, X_test, y_train, y_test = split_test(rand, size, data1)
        liner_test('Original', rand, size, X_train, X_test, y_train, y_test)
        rand_forest('Original', rand, size, X_train, X_test, y_train, y_test)
        knn_regression('Original', rand, size, X_train, X_test, y_train, y_test)
        
    end = datetime.datetime.now()
    print(f"\nTest1 Time: ", end-start)

In [14]:
def test2():
    start = datetime.datetime.now()
    first_time = True
    for i in range(0, 11):
        if first_time:
            rand = 0
            size = 0.05
            first_time = False
        else:
            rand = i * 5
            size = i * 0.05
        
        X_train, X_test, y_train, y_test = split_test(rand, size, data2)
        liner_test('Standard Scaler', rand, size, X_train, X_test, y_train, y_test)
        rand_forest('Standard Scaler', rand, size, X_train, X_test, y_train, y_test)
        knn_regression('Standard Scaler', rand, size, X_train, X_test, y_train, y_test)
        
    end = datetime.datetime.now()
    print(f"\nTest2 Time: ", end-start)

In [15]:
def test3():
    start = datetime.datetime.now()
    first_time = True
    
    for i in range(0, 11):        
        if first_time:
            rand = 0
            size = 0.05
            first_time = False
        else:
            rand = i * 5
            size = i * 0.05
            
        rand = int(rand)
        X_train, X_test, y_train, y_test = split_test(rand, size, data3)
        liner_test('MinMax Scaler', rand, size, X_train, X_test, y_train, y_test)
        rand_forest('MinMax Scaler', rand, size, X_train, X_test, y_train, y_test)
        knn_regression('MinMax Scaler', rand, size, X_train, X_test, y_train, y_test)
        
    end = datetime.datetime.now()
    print(f"\nTest3 Time: ", end-start)

In [16]:
start = datetime.datetime.now()
print(f"Parallel start at {start}")
runInParallel(test1, test2, test3)
end = datetime.datetime.now()
print(f"\n\nParallel time: ", end-start)

Parallel start at 2021-09-20 13:02:31.311882

Test2 Time:  0:55:57.357720

Test1 Time:  0:56:46.425211

Test3 Time:  0:56:52.686900


Parallel time:  0:56:52.963106


In [17]:
for item in results2:
    results3.append(item)
for item in results1:
    results3.append(item)

In [18]:
results = list(results3)

In [33]:
results.sort(key=lambda x: x.r_square, reverse=True)
for i in range(0, 11):
    print(f"Position {i}:")
    results[i].print_object()

Position 0:

        Algorithm: RandomForest
        Dataset: Original
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:05:44.010615
        Figure Path: RF-Original-0-0.05
        R Square: 0.9897803986681057
        MEA: 830.7430192307692
        MSE: 5773648.723257692
        RMSE: 2402.84180154618
        K: 0


        
Position 1:

        Algorithm: RandomForest
        Dataset: MinMax Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:05:43.657389
        Figure Path: RF-MinMax Scaler-0-0.05
        R Square: 0.989761264286866
        MEA: 0.0020010982974572444
        MSE: 3.371845847702826e-05
        RMSE: 0.005806759722687711
        K: 0


        
Position 2:

        Algorithm: RandomForest
        Dataset: Standard Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:05:44.157249
        Figure Path: RF-Standard Scaler-0-0.05
        R Square: 0.9897119651150071
        MEA: 0.03315663830361251
        MSE

In [34]:
results.sort(key=lambda x: x.mea, reverse=False)
for i in range(0, 11):
    print(f"Position {i}:")
    results[i].print_object()

Position 0:

        Algorithm: RandomForest
        Dataset: MinMax Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:05:43.657389
        Figure Path: RF-MinMax Scaler-0-0.05
        R Square: 0.989761264286866
        MEA: 0.0020010982974572444
        MSE: 3.371845847702826e-05
        RMSE: 0.005806759722687711
        K: 0


        
Position 1:

        Algorithm: KNN
        Dataset: MinMax Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:00:02.287624
        Figure Path: RF-MinMax Scaler-0-0.05-2
        R Square: 0.9871701292210329
        MEA: 0.002251687739432621
        MSE: 4.2251648762777155e-05
        RMSE: 0.006500126826668627
        K: 2


        
Position 2:

        Algorithm: RandomForest
        Dataset: MinMax Scaler
        Test Size: 0.25
        Random State: 25
        Fit Time: 0:04:07.975943
        Figure Path: RF-MinMax Scaler-25-0.25
        R Square: 0.9646185248451241
        MEA: 0.0024116355924535

In [37]:
results.sort(key=lambda x: x.mse, reverse=False)
for i in range(0, 11):
    print(f"Position {i}:")
    results[i].print_object()

Position 0:

        Algorithm: RandomForest
        Dataset: MinMax Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:05:43.657389
        Figure Path: RF-MinMax Scaler-0-0.05
        R Square: 0.989761264286866
        MEA: 0.0020010982974572444
        MSE: 3.371845847702826e-05
        RMSE: 0.005806759722687711
        K: 0


        
Position 1:

        Algorithm: KNN
        Dataset: MinMax Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:00:02.287624
        Figure Path: RF-MinMax Scaler-0-0.05-2
        R Square: 0.9871701292210329
        MEA: 0.002251687739432621
        MSE: 4.2251648762777155e-05
        RMSE: 0.006500126826668627
        K: 2


        
Position 2:

        Algorithm: KNN
        Dataset: MinMax Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:00:09.444304
        Figure Path: RF-MinMax Scaler-0-0.05-6
        R Square: 0.9828831869625648
        MEA: 0.0024576181034257297
     

In [38]:
results.sort(key=lambda x: x.rmse, reverse=False)
for i in range(0, 11):
    print(f"Position {i}:")
    results[i].print_object()

Position 0:

        Algorithm: RandomForest
        Dataset: MinMax Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:05:43.657389
        Figure Path: RF-MinMax Scaler-0-0.05
        R Square: 0.989761264286866
        MEA: 0.0020010982974572444
        MSE: 3.371845847702826e-05
        RMSE: 0.005806759722687711
        K: 0


        
Position 1:

        Algorithm: KNN
        Dataset: MinMax Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:00:02.287624
        Figure Path: RF-MinMax Scaler-0-0.05-2
        R Square: 0.9871701292210329
        MEA: 0.002251687739432621
        MSE: 4.2251648762777155e-05
        RMSE: 0.006500126826668627
        K: 2


        
Position 2:

        Algorithm: KNN
        Dataset: MinMax Scaler
        Test Size: 0.05
        Random State: 0
        Fit Time: 0:00:09.444304
        Figure Path: RF-MinMax Scaler-0-0.05-6
        R Square: 0.9828831869625648
        MEA: 0.0024576181034257297
     