# F06 G05 2D project 2021

Install libraries:
- pandas
- numpy
- scipy
- seaborn
- matplotlib
- tk        (Testing purpose)
- scipy     (Testing purpose)

In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\RY\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


Import required libraries:
- Display graphs
- numpy, matplotlib and seaborn for graphing

In [2]:
from IPython.display import display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

From cohort and homework, we will be using similar functions such as:
- Maximum and minimum normalisation.
- Convertion of x and y axis to numpy array
- Data splitting
- r^2 score
- Mean square error
- Prediction

In [24]:
def normalize_z(df):
    dfout = (df - df.min(axis=0)) / (df.max(axis=0) - df.min(axis=0))
    return dfout


def prepare_x(df_x):
    xAxis = df_x.to_numpy()
    array1 = np.ones((df_x.shape[0], 1))
    return np.concatenate((array1, xAxis), axis=1)


def prepare_y(df_y):
    return df_y.to_numpy()


def predict(df_x, beta):
    x = prepare_x(normalize_z(df_x))
    return predict_norm(x, beta)


def predict_norm(X, beta):
    return np.matmul(X, beta)


def split_data(df_x, df_y, random_state=None, test_size=0.5):
    np.random.seed(random_state)
    feature_index = df_x.index

    test_index = np.random.choice(feature_index, size=int(test_size * len(df_x)), replace=False)
    train_index = [x for x in range(len(feature_index)) if x not in test_index]

    df_x_train = df_x.loc[train_index, :]
    df_x_test = df_x.loc[test_index, :]
    df_y_train = df_y.loc[train_index, :]
    df_y_test = df_y.loc[test_index, :]
    return df_x_train, df_x_test, df_y_train, df_y_test


def r2_score(y, ypred):
    ybar = np.mean(y)
    SStot = np.sum((y - ybar) ** 2)
    SSres = np.sum((y - ypred) ** 2)
    return 1 - SSres / SStot


def mean_squared_error(target, pred):
    n = target.shape[0]
    s = np.sum((target - pred) ** 2)
    return (1 / n) * s


def gradient_descent(X, y, beta, alpha, num_iters):
    m = X.shape[0]
    J_storage = np.zeros((num_iters, 1))
    for n in range(num_iters):
        yp = np.matmul(X, beta)
        error = yp - y
        delta = np.matmul(X.T, error)
        beta = beta - (alpha/m)*delta
        J_storage[n] = compute_cost(X,y,beta)
    return beta, J_storage


def compute_cost(X, y, beta):
    size = X.shape[0]
    yp = np.matmul(X, beta)
    error = yp - y
    J = (1/(2*size))*np.matmul(error.T, error)
    return J


In [88]:
dataFrame = pd.read_csv("2Ddata6.csv")
columns = dataFrame.columns

yVal = pd.DataFrame(dataFrame.loc[:, "total_deaths"].fillna(0))
xVal = pd.DataFrame(columns=[])

xVal["Location"] = pd.DataFrame(dataFrame["location"])
xVal["Days"] = pd.DataFrame(dataFrame["Days"].fillna(0))
xVal["Vaccinated"] = pd.DataFrame(dataFrame["total_vaccinations"].fillna(0))
xVal["GDP"] = pd.DataFrame(dataFrame["gdp_per_capita"].fillna(0))
xVal["Population"] = pd.DataFrame(dataFrame["population_density"].fillna(0))
xVal["HDI"] = pd.DataFrame(dataFrame["human_development_index"].fillna(0))
xVal["Facilities"] = pd.DataFrame(dataFrame["hospital_beds_per_thousand"].fillna(0))


xVal_train, xVal_test, yVal_train, yVal_test = split_data(xVal, yVal, 70, 0.3)

columns = ["Days","Vaccinated","GDP","Population","HDI","Facilities"]
xtrain = xVal_train[columns]
xtest = xVal_test[columns]

xMVal = normalize_z(xtrain)

xAxis = prepare_x(xMVal)
yAxis = prepare_y(yVal_train)

iterations = 10500
alpha = 0.05
beta = np.zeros((xAxis.shape[1], 1))

beta, dummy = gradient_descent(xAxis, yAxis, beta, alpha, iterations)

prediction = pd.DataFrame(predict(xtest, beta))

for x in columns:
    sns.set()

    scat0 = sns.scatterplot(x=xtest[x], y=prediction[0])
    scat1 = sns.scatterplot(x=xtest[x], y=yVal_test["total_deaths"])

    scat0.set_ylabel("Deaths")
    scat0.set_xlabel(x)
    plt.show()

print("r^2 Value:", r2_score(prepare_y(yVal_test), prediction))
# display(beta)
# display(xtest.columns)

country = {}
for i in xVal_test['Location']:
    if i not in country.keys():
        country[i] = xVal_test.loc[(xVal_test['Location'] == i),:]

r^2 Value: 0    0.417818
dtype: float64
{'Japan':      Location  Days  Vaccinated       GDP  Population    HDI  Facilities
1236    Japan   156     1288566  40113.06     347.778  0.919       13.05
1138    Japan    58           0  40113.06     347.778  0.919       13.05
1355    Japan   275    91697753  40113.06     347.778  0.919       13.05
1200    Japan   120       31785  40113.06     347.778  0.919       13.05
1253    Japan   173     2977156  40113.06     347.778  0.919       13.05
...       ...   ...         ...       ...         ...    ...         ...
1267    Japan   187     5044329  40113.06     347.778  0.919       13.05
1293    Japan   213    18512075  40113.06     347.778  0.919       13.05
1305    Japan   225    30348520  40113.06     347.778  0.919       13.05
1335    Japan   255    69975928  40113.06     347.778  0.919       13.05
1401    Japan   321   149882538  40113.06     347.778  0.919       13.05

[105 rows x 7 columns], 'Taiwan':      Location  Days  Vaccinated      GD

In [69]:
y = beta[0][0]
pData = { \
    "Days": 1000, \
    "Vaccinated": 1000000, \
    "GDP": 50000, \
    "Population_Density": 7000, \
    "Human_Development_Index": 0.928, \
    "Facilities": 2.2
    }
# pData = pd.DataFrame([pData])
# pData = normalize_z(pData)
for index, x in enumerate(pData):
    y += pData[x]*beta[index+1][0]

print("Predicted deaths:", y)

Predicted deaths: -40215720677.33754
