# **HW1: Regression** 
In *assignment 1*, you need to finish:

1.  Basic Part: Implement the regression model to predict the number of dengue cases


> *   Step 1: Split Data
> *   Step 2: Preprocess Data
> *   Step 3: Implement Regression
> *   Step 4: Make Prediction
> *   Step 5: Train Model and Generate Result

2.  Advanced Part: Implement a regression model that predicts the number of dengue cases in a different way than the basic part

# 1. Basic Part (60%)
In the first part, you need to implement a regression model that predicts the number of dengue cases.

Please save the prediction result in a csv file **hw1_basic.csv**


## Import Packages

> Note: You **cannot** import any other package in the basic part

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import math
import random

## Global attributes
Define the global attributes

In [2]:
# File locations used by the basic part.
input_dataroot, output_dataroot = 'hw1_basic_input.csv', 'hw1_basic.csv'

# Placeholders that are rebound later in the notebook:
#   input_datalist  -- the rows of the input CSV (loaded as a numpy array)
#   output_datalist -- the predictions, one row per week in the form
#                      ['epiweek', 'CityA', 'CityB', 'CityC'] (10 rows expected)
input_datalist, output_datalist = [], []

You can add your own global attributes here


In [3]:
# Working state for the basic part.
# For every city (a, b, c): *_x holds temperatures, *_y holds case counts.
split_data = []  # input rows that contain no empty ('') field

# City A
a_train_x, a_train_y = [], []
a_test_x, a_test_y = [], []
a_predict_x, a_predict_y = [], []

# City B
b_train_x, b_train_y = [], []
b_test_x, b_test_y = [], []
b_predict_x, b_predict_y = [], []

# City C
c_train_x, c_train_y = [], []
c_test_x, c_test_y = [], []
c_predict_x, c_predict_y = [], []

## Load the Input File
First, load the basic input file **hw1_basic_input.csv**

Input data would be stored in *input_datalist*

In [4]:
# Load every row of the input CSV (header row included) into a numpy array.
with open(input_dataroot, newline='') as csvfile:
    input_datalist = np.array([*csv.reader(csvfile)])

## Implement the Regression Model

> Note: It is recommended to use the functions we defined, you can also define your own functions


### Step 1: Split Data
Split the data in *input_datalist* into a training dataset and a validation dataset



In [5]:
def SplitData():
    """Split the cleaned input rows into train / test / predict sets per city.

    Reads the module-level ``input_datalist`` (its first row is the header)
    and rebuilds ``split_data`` and every ``{a,b,c}_{train,test,predict}_{x,y}``
    list from scratch. Because nothing is appended to earlier state, the
    function can be called again (for example when the notebook cell is
    re-run after ``PreprocessData`` turned the lists into numpy arrays)
    without duplicating rows or failing on ``.append``.

    Row layout used here: columns 1-3 are read as the x value (temperature)
    of city A/B/C and columns 4-6 as the matching y value (cases).
    """
    global a_train_x, a_train_y, a_test_x, a_test_y, a_predict_x, a_predict_y
    global b_train_x, b_train_y, b_test_x, b_test_y, b_predict_x, b_predict_y
    global c_train_x, c_train_y, c_test_x, c_test_y, c_predict_x, c_predict_y

    # Skip row 0 (column names) and drop every row with an empty ('') field.
    # Slice assignment refills the existing list object instead of rebinding it.
    split_data[:] = [list(row) for row in input_datalist[1:] if '' not in row]

    # MAPE(A, B, C) measured for different training windows:
    #   [-60:-15] => 51.63%, 6.24%, 26.49%  => 27.6
    #   [-60:-20] => 46.03%, 18.43%, 31.80% => 31.6
    #   [-50:-15] => 46.10%, 4.71%, 12.20%  => 20.6 <=== chosen
    #   [-50:-20] => 39.44%, 17.32%, 18.78% => 24.6
    #   [-40:-15] => 58.18%, 6.24%, 11.55%  => 25
    #   [-40:-20] => 52.10%, 17.93%, 10.45% => 26.3
    train_rows = split_data[-50:-15]
    test_rows = split_data[-15:-10]     # held out for evaluation
    predict_rows = split_data[-10:]     # the rows to predict, not to test on

    def column(rows, index, cast):
        # One column of `rows`, converted with `cast`.
        return [cast(row[index]) for row in rows]

    a_train_x, a_train_y = column(train_rows, 1, float), column(train_rows, 4, int)
    b_train_x, b_train_y = column(train_rows, 2, float), column(train_rows, 5, int)
    c_train_x, c_train_y = column(train_rows, 3, float), column(train_rows, 6, int)

    a_test_x, a_test_y = column(test_rows, 1, float), column(test_rows, 4, int)
    b_test_x, b_test_y = column(test_rows, 2, float), column(test_rows, 5, int)
    c_test_x, c_test_y = column(test_rows, 3, float), column(test_rows, 6, int)

    a_predict_x, a_predict_y = column(predict_rows, 1, float), column(predict_rows, 4, int)
    b_predict_x, b_predict_y = column(predict_rows, 2, float), column(predict_rows, 5, int)
    c_predict_x, c_predict_y = column(predict_rows, 3, float), column(predict_rows, 6, int)

### Step 2: Preprocess Data
Handle the unreasonable data
> Hint: Outliers and missing data can be handled by removing the affected rows or by filling in replacement values derived from statistics (for example, the mean or median)  

In [6]:
def PreprocessData():
    """Tame outliers in the training sets, then convert every set to a numpy array.

    Per city, a training x value at or beyond either bound is replaced with
    a fixed fallback, and a training y value at or above a limit is reduced
    (scaled by 0.3 for city A, lowered by 40 for cities B and C). Test and
    prediction sets are converted to arrays without being modified.
    """
    global a_train_x, a_train_y, a_test_x, a_test_y, a_predict_x, a_predict_y
    global b_train_x, b_train_y, b_test_x, b_test_y, b_predict_x, b_predict_y
    global c_train_x, c_train_y, c_test_x, c_test_y, c_predict_x, c_predict_y

    def clamp_temperatures(readings, low, high, fallback):
        # Values at or outside [low, high] are swapped for `fallback`.
        return [fallback if (t <= low or t >= high) else t for t in readings]

    def lower_peaks(counts, limit, reduction):
        # Counts at or above `limit` are reduced by `reduction`.
        return [n - reduction if n >= limit else n for n in counts]

    # City A: x bounds 15/35 -> 25, y values >= 100 are scaled down to 30%.
    a_train_x = np.array(clamp_temperatures(a_train_x, 15, 35, 25))
    a_train_y = np.array([int(n * 0.3) if n >= 100 else n for n in a_train_y])

    # City B: x bounds 15/30 -> 22, y values >= 60 lose 40.
    b_train_x = np.array(clamp_temperatures(b_train_x, 15, 30, 22))
    b_train_y = np.array(lower_peaks(b_train_y, 60, 40))

    # City C: x bounds 17/40 -> 25, y values >= 80 lose 40.
    c_train_x = np.array(clamp_temperatures(c_train_x, 17, 40, 25))
    c_train_y = np.array(lower_peaks(c_train_y, 80, 40))

    # The remaining sets are only converted.
    a_test_x, a_test_y = np.array(a_test_x), np.array(a_test_y)
    b_test_x, b_test_y = np.array(b_test_x), np.array(b_test_y)
    c_test_x, c_test_y = np.array(c_test_x), np.array(c_test_y)

    a_predict_x, a_predict_y = np.array(a_predict_x), np.array(a_predict_y)
    b_predict_x, b_predict_y = np.array(b_predict_x), np.array(b_predict_y)
    c_predict_x, c_predict_y = np.array(c_predict_x), np.array(c_predict_y)


### Step 3: Implement Regression
> Hint: You can use Matrix Inversion, or Gradient Descent to finish this part




In [7]:
def Regression():
    """Fit one polynomial per city on the training data and print the result.

    City A is fitted with a degree-2 polynomial, cities B and C with a
    straight line.

    Returns:
        tuple: ``(model_a, model_b, model_c)``, the coefficient arrays
        produced by ``np.polyfit`` (highest power first).
    """
    degrees = {"a": 2, "b": 1, "c": 1}

    model_a = np.polyfit(a_train_x, a_train_y, degrees["a"])
    model_b = np.polyfit(b_train_x, b_train_y, degrees["b"])
    model_c = np.polyfit(c_train_x, c_train_y, degrees["c"])

    # The assignment requires the coefficients to be printed.
    print("city a model = ", model_a)
    print("city b model = ", model_b)
    print("city c model = ", model_c, "\n")

    return model_a, model_b, model_c

### Step 4: Make Prediction
Make predictions for the testing dataset and store the values in *output_datalist*

In [8]:
def MakePrediction(model_a, model_b, model_c):
    """Evaluate each city's polynomial on its prediction inputs.

    The module-level ``a_predict_y`` / ``b_predict_y`` / ``c_predict_y`` are
    replaced with numpy arrays of the predicted values, each truncated
    toward zero with ``int``.

    Args:
        model_a, model_b, model_c: polynomial coefficients for city A, B
            and C, in the order expected by ``np.polyval``.
    """
    global a_predict_y, b_predict_y, c_predict_y

    def truncated_forecast(coefficients, inputs):
        # Evaluate the polynomial, then cut every value down to an integer.
        estimates = np.polyval(coefficients, inputs)
        return np.array(list(map(int, estimates)))

    a_predict_y = truncated_forecast(model_a, a_predict_x)
    b_predict_y = truncated_forecast(model_b, b_predict_x)
    c_predict_y = truncated_forecast(model_c, c_predict_x)

In [9]:
# Score the fitted models against the held-out test rows.
def Test(model_a, model_b, model_c):
    """Print the mean absolute percentage error (MAPE) of each city's model.

    Each polynomial is evaluated on the city's ``*_test_x`` values, the
    results are truncated with ``int`` and compared with ``*_test_y``.

    Args:
        model_a, model_b, model_c: polynomial coefficients for city A, B
            and C, in the order expected by ``np.polyval``.
    """
    def mape(actual, predict):
        relative_error = (actual - predict) / actual
        return np.abs(relative_error).mean() * 100

    def score(coefficients, inputs, truth):
        # `truth` is the reference answer, `estimate` is what the model says.
        estimate = np.array(list(map(int, np.polyval(coefficients, inputs))))
        return mape(truth, estimate)

    print(f"A: mape = {score(model_a, a_test_x, a_test_y):.2f}%")
    print(f"B: mape = {score(model_b, b_test_x, b_test_y):.2f}%")
    print(f"C: mape = {score(model_c, c_test_x, c_test_y):.2f}%")

In [10]:
def Plot():
    """Show one scatter plot per city.

    Training points are drawn in blue, test points in red and the
    prediction points in orange.
    """
    panels = (
        ("city a", (a_train_x, a_train_y), (a_test_x, a_test_y), (a_predict_x, a_predict_y)),
        ("city b", (b_train_x, b_train_y), (b_test_x, b_test_y), (b_predict_x, b_predict_y)),
        ("city c", (c_train_x, c_train_y), (c_test_x, c_test_y), (c_predict_x, c_predict_y)),
    )
    colours = ('blue', 'red', 'orange')

    for heading, *series in panels:
        plt.title(heading)
        for (xs, ys), colour in zip(series, colours):
            plt.scatter(xs, ys, color=colour)
        plt.show()

### Step 5: Train Model and Generate Result

> Notice: **Remember to output the coefficients of the model here**, otherwise 5 points would be deducted
* If your regression model is *3x^2 + 2x^1 + 1*, your output would be: 
```
3 2 1
```





In [11]:
# Run the basic pipeline end to end. Every step works on the module-level
# lists filled in by the step before it, so the call order matters.
SplitData()
PreprocessData()
model_a, model_b, model_c = Regression()    # also prints the fitted coefficients
MakePrediction(model_a, model_b, model_c)
Test(model_a, model_b, model_c)
# Optional scatter plots of the train / test / predicted points (disabled):
#Plot()

city a model =  [  -0.48105859   24.01307842 -260.79126575]
city b model =  [-0.57694117 36.11129474]
city c model =  [ -3.71290268 138.97913453] 

A: mape = 46.10%
B: mape = 4.71%
C: mape = 12.20%


## Write the Output File
Write the prediction to output csv
> Format: 'epiweek', 'CityA', 'CityB', 'CityC'

In [12]:
# Pair the first column of the last ten kept rows with the three city
# predictions and write one CSV row per week.
week_of_year = [record[0] for record in split_data[-10:]]
output_datalist = list(zip(week_of_year, a_predict_y, b_predict_y, c_predict_y))

with open(output_dataroot, 'w', newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_datalist)

# 2. Advanced Part (35%)
In the second part, you need to implement the regression in a different way than the basic part to help your predictions for the number of dengue cases

We provide you with two files **hw1_advanced_input1.csv** and **hw1_advanced_input2.csv** that can help you in this part

Please save the prediction result in a csv file **hw1_advanced.csv** 


In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import math
import random
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# ---- Module-level state for the advanced part ----

# File locations.
basic_input_dataroot = 'hw1_basic_input.csv'
advanced_input_dataroot1, advanced_input_dataroot2 = 'hw1_advanced_input1.csv', 'hw1_advanced_input2.csv'
output_dataroot = 'hw1_advanced.csv'

# Raw tables (filled when the CSV files are read) and the final output rows.
basic_input_datalist, advanced_input_datalist1, advanced_input_datalist2 = [], [], []
output_datalist = []

# Features per city: y = cases, x1 = temperature, x2 = precipitation,
# x3 = number of houses per km^2.
split_data = []        # rows that contain no empty ('') field
empty_entry_idx = []   # positions of the rows that were dropped

a_housesperKm2, b_housesperKm2, c_housesperKm2 = 0, 0, 0

# City A
a_train_x1, a_train_x2, a_train_x3, a_train_y = [], [], [], []
a_test_x1, a_test_x2, a_test_x3, a_test_y = [], [], [], []
a_predict_x1, a_predict_x2, a_predict_x3, a_predict_y = [], [], [], []

# City B
b_train_x1, b_train_x2, b_train_x3, b_train_y = [], [], [], []
b_test_x1, b_test_x2, b_test_x3, b_test_y = [], [], [], []
b_predict_x1, b_predict_x2, b_predict_x3, b_predict_y = [], [], [], []

# City C
c_train_x1, c_train_x2, c_train_x3, c_train_y = [], [], [], []
c_test_x1, c_test_x2, c_test_x3, c_test_y = [], [], [], []
c_predict_x1, c_predict_x2, c_predict_x3, c_predict_y = [], [], [], []

# Placeholders until Regression() stores the fitted models here.
model_a, model_b, model_c = 0, 0, 0

# Load each input CSV (header row included) into its own numpy array.
with open(basic_input_dataroot, newline='') as csvfile:
    basic_input_datalist = np.array([*csv.reader(csvfile)])

with open(advanced_input_dataroot1, newline='') as csvfile:
    advanced_input_datalist1 = np.array([*csv.reader(csvfile)])

with open(advanced_input_dataroot2, newline='') as csvfile:
    advanced_input_datalist2 = np.array([*csv.reader(csvfile)])

def SplitBasicData():
    """Build the x1 / y lists of every city from the basic input table.

    Reads the module-level ``basic_input_datalist`` (its first row is the
    header). Rows without any empty ('') field are stored in ``split_data``;
    the positions of the dropped rows, counted from the first data row, are
    stored in ``empty_entry_idx``.

    All of that state, and every ``{a,b,c}_{train,test,predict}_{x1,y}``
    list, is rebuilt from scratch on each call, so running the function
    again does not duplicate rows or dropped-row positions.

    Row layout used here: columns 1-3 are read as x1 of city A/B/C and
    columns 4-6 as the matching y value.
    """
    global a_train_x1, a_train_y, a_test_x1, a_test_y, a_predict_x1, a_predict_y
    global b_train_x1, b_train_y, b_test_x1, b_test_y, b_predict_x1, b_predict_y
    global c_train_x1, c_train_y, c_test_x1, c_test_y, c_predict_x1, c_predict_y

    # Skip row 0 (column names); separate complete rows from incomplete ones.
    kept_rows, dropped_positions = [], []
    for idx, row in enumerate(basic_input_datalist[1:]):
        if '' in row:
            dropped_positions.append(idx)
        else:
            kept_rows.append(list(row))

    # Slice assignment refills the existing list objects instead of rebinding them.
    split_data[:] = kept_rows
    empty_entry_idx[:] = dropped_positions

    # MAPE(A, B, C) measured for different training windows:
    #   [-60:-15] => 53.45%, 6.05%, 19.90%  => 26
    #   [-60:-20] => 49.54%, 18.10%, 24.58% => 30.3
    #   [-50:-15] => 46.54%, 6.42%, 19.43%  => 23.6 <=== chosen
    #   [-50:-20] => 46.80%, 18.30%, 16.72% => 26.6
    #   [-40:-15] => 54.63%, 11.89%, 15.31% => 26.6
    #   [-40:-20] => 65.73%, 20.82%, 14.43% => 33
    train_rows = kept_rows[-50:-15]
    test_rows = kept_rows[-15:-10]      # held out for evaluation
    predict_rows = kept_rows[-10:]      # the rows to predict, not to test on

    def column(rows, index, cast):
        # One column of `rows`, converted with `cast`.
        return [cast(row[index]) for row in rows]

    a_train_x1, a_train_y = column(train_rows, 1, float), column(train_rows, 4, int)
    b_train_x1, b_train_y = column(train_rows, 2, float), column(train_rows, 5, int)
    c_train_x1, c_train_y = column(train_rows, 3, float), column(train_rows, 6, int)

    a_test_x1, a_test_y = column(test_rows, 1, float), column(test_rows, 4, int)
    b_test_x1, b_test_y = column(test_rows, 2, float), column(test_rows, 5, int)
    c_test_x1, c_test_y = column(test_rows, 3, float), column(test_rows, 6, int)

    a_predict_x1, a_predict_y = column(predict_rows, 1, float), column(predict_rows, 4, int)
    b_predict_x1, b_predict_y = column(predict_rows, 2, float), column(predict_rows, 5, int)
    c_predict_x1, c_predict_y = column(predict_rows, 3, float), column(predict_rows, 6, int)


def SplitAdvancedData():
    """Build the x2 and x3 feature lists of every city.

    * ``split_data`` is rebound to the data rows of
      ``advanced_input_datalist1`` (header dropped) with the rows listed in
      ``empty_entry_idx`` removed, so the rows line up with the ones kept
      by ``SplitBasicData``.
    * The ``*_x2`` lists are refilled in place from columns 1-3 of those
      rows, so calling the function again does not accumulate values.
    * ``{a,b,c}_housesperKm2`` is read from column 24 of rows 1-3 of
      ``advanced_input_datalist2``; each ``*_x3`` list repeats that single
      value once per row of the matching train / test / predict split.
    """
    global split_data
    global a_housesperKm2, b_housesperKm2, c_housesperKm2
    global a_train_x3, a_test_x3, a_predict_x3
    global b_train_x3, b_test_x3, b_predict_x3
    global c_train_x3, c_test_x3, c_predict_x3

    split_data = list(advanced_input_datalist1[1:])    # reset
    # Delete from the highest position down so the remaining positions stay
    # valid; set() guards against a position that was recorded twice.
    for idx in sorted(set(empty_entry_idx), reverse=True):
        split_data.pop(idx)

    train_rows = split_data[-50:-15]
    test_rows = split_data[-15:-10]     # held out for evaluation
    predict_rows = split_data[-10:]     # the rows to predict, not to test on

    def column(rows, index):
        # One column of `rows` as floats.
        return [float(row[index]) for row in rows]

    # Slice assignment replaces the contents of the existing lists.
    a_train_x2[:] = column(train_rows, 1)
    b_train_x2[:] = column(train_rows, 2)
    c_train_x2[:] = column(train_rows, 3)

    a_test_x2[:] = column(test_rows, 1)
    b_test_x2[:] = column(test_rows, 2)
    c_test_x2[:] = column(test_rows, 3)

    a_predict_x2[:] = column(predict_rows, 1)
    b_predict_x2[:] = column(predict_rows, 2)
    c_predict_x2[:] = column(predict_rows, 3)

    # NOTE(review): column 24 is presumably the houses-per-km^2 figure and
    # rows 1-3 presumably city A/B/C -- confirm against the CSV header.
    a_housesperKm2 = float(advanced_input_datalist2[1][24])
    b_housesperKm2 = float(advanced_input_datalist2[2][24])
    c_housesperKm2 = float(advanced_input_datalist2[3][24])

    # Size the constant x3 feature from the actual splits rather than from a
    # fixed row count, so x1 / x2 / x3 always have matching lengths.
    n_train, n_test, n_predict = len(train_rows), len(test_rows), len(predict_rows)

    a_train_x3, a_test_x3, a_predict_x3 = (
        [a_housesperKm2] * n_train, [a_housesperKm2] * n_test, [a_housesperKm2] * n_predict)
    b_train_x3, b_test_x3, b_predict_x3 = (
        [b_housesperKm2] * n_train, [b_housesperKm2] * n_test, [b_housesperKm2] * n_predict)
    c_train_x3, c_test_x3, c_predict_x3 = (
        [c_housesperKm2] * n_train, [c_housesperKm2] * n_test, [c_housesperKm2] * n_predict)
    
def SplitData():
    """Fill every feature list of the advanced part.

    SplitBasicData runs first because it records which rows were dropped;
    SplitAdvancedData then uses that record to drop the same rows.
    """
    for stage in (SplitBasicData, SplitAdvancedData):
        stage()

def PreprocessData():
    """Tame outliers in the training x1 / y lists, in place.

    Per city, an x1 value at or beyond either bound is replaced with a
    fixed fallback and a y value at or above a limit is lowered by a fixed
    amount. Only the training lists are touched.
    """
    def reset_extremes(readings, low, high, fallback):
        # Overwrite values at or outside [low, high] with `fallback`.
        for pos, reading in enumerate(readings):
            if reading <= low or reading >= high:
                readings[pos] = fallback

    def lower_peaks(counts, limit, reduction):
        # Lower counts at or above `limit` by `reduction`.
        for pos, count in enumerate(counts):
            if count >= limit:
                counts[pos] = count - reduction

    # City A: x1 bounds 15/35 -> 25, y values >= 125 lose 75.
    reset_extremes(a_train_x1, 15, 35, 25)
    lower_peaks(a_train_y, 125, 75)

    # City B: x1 bounds 15/30 -> 22, y values >= 60 lose 40.
    reset_extremes(b_train_x1, 15, 30, 22)
    lower_peaks(b_train_y, 60, 40)

    # City C: x1 bounds 15/40 -> 25, y values >= 80 lose 20.
    reset_extremes(c_train_x1, 15, 40, 25)
    lower_peaks(c_train_y, 80, 20)

def Regression():
    """Fit one linear model per city on (x1, x2, x3) and print its parameters.

    The fitted estimators are stored in the module-level ``model_a``,
    ``model_b`` and ``model_c``.
    """
    global model_a, model_b, model_c

    def fit_city(x1, x2, x3, targets):
        # One sample per row: (x1, x2, x3).
        samples = list(zip(x1, x2, x3))
        estimator = linear_model.LinearRegression()  # linear regression object
        estimator.fit(samples, targets)
        return estimator

    model_a = fit_city(a_train_x1, a_train_x2, a_train_x3, a_train_y)
    print("model A: coef = ", model_a.coef_, "intercept = ", model_a.intercept_)

    model_b = fit_city(b_train_x1, b_train_x2, b_train_x3, b_train_y)
    print("model B: coef = ", model_b.coef_, "intercept = ", model_b.intercept_)

    model_c = fit_city(c_train_x1, c_train_x2, c_train_x3, c_train_y)
    print("model C: coef = ", model_c.coef_, "intercept = ", model_c.intercept_)

def MakePrediction():
    """Predict the prediction rows of every city with its fitted model.

    The module-level ``a_predict_y`` / ``b_predict_y`` / ``c_predict_y`` are
    replaced with plain lists of the predictions, each truncated toward
    zero with ``int``.
    """
    global a_predict_y, b_predict_y, c_predict_y

    def forecast(model, x1, x2, x3):
        # One sample per row: (x1, x2, x3).
        samples = list(zip(x1, x2, x3))
        return [int(value) for value in model.predict(samples)]

    a_predict_y = forecast(model_a, a_predict_x1, a_predict_x2, a_predict_x3)
    b_predict_y = forecast(model_b, b_predict_x1, b_predict_x2, b_predict_x3)
    c_predict_y = forecast(model_c, c_predict_x1, c_predict_x2, c_predict_x3)

def Test():
    """Print the mean absolute percentage error (MAPE) of every city's model.

    Each model predicts its city's test rows and the result is compared
    with ``*_test_y``.

    NOTE(review): the predictions are scored as returned by the model,
    without the ``int`` truncation that MakePrediction applies.
    """
    def mape(actual, predict):
        relative_error = (actual - predict) / actual
        return np.abs(relative_error).mean() * 100

    def score(model, x1, x2, x3, truth):
        # `truth` is the reference answer; the model output is the estimate.
        samples = list(zip(x1, x2, x3))
        return mape(truth, model.predict(samples))

    print(f"MAPE(A) = {score(model_a, a_test_x1, a_test_x2, a_test_x3, a_test_y):.2f}%")
    print(f"MAPE(B) = {score(model_b, b_test_x1, b_test_x2, b_test_x3, b_test_y):.2f}%")
    print(f"MAPE(C) = {score(model_c, c_test_x1, c_test_x2, c_test_x3, c_test_y):.2f}%")

# Run the advanced pipeline end to end. Every step works on the module-level
# state left behind by the step before it, so the call order matters.
SplitData()
PreprocessData()
Regression()
MakePrediction()
Test()

# Pair the first column of the last ten kept rows with the three city
# predictions.
week_of_year = [record[0] for record in split_data[-10:]]
output_datalist = list(zip(week_of_year, a_predict_y, b_predict_y, c_predict_y))

# Write one CSV row per week to the output file.
with open(output_dataroot, 'w', newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_datalist)

model A: coef =  [-0.91526857 -0.15460752  0.00528221] intercept =  55.65772237222846
model B: coef =  [-0.49186969 -0.17731504 -0.00057258] intercept =  36.25035847666443
model C: coef =  [-2.89837988  0.8911305  -0.11469951] intercept =  122.0325296288359
MAPE(A) = 46.54%
MAPE(B) = 6.42%
MAPE(C) = 19.43%


# Report *(5%)*

Report should be submitted as a pdf file **hw1_report.pdf**

*   Briefly describe the difficulty you encountered 
*   Summarize your work and your reflections 
*   No more than one page






# Save the Code File
Please save your code and submit it as an ipynb file! (**hw1.ipynb**)