In [1]:
import pandas as pd
import numpy as np
import math as mt

In [2]:
# Load the datasets
mobility_df = pd.read_csv("changes-visitors-covid_final.csv")
cases_df = pd.read_csv("covid-data.csv")
mobility_df

Unnamed: 0,Entity,Code,Day,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces
0,Afghanistan,AFG,2020-02-17,0.000,4.000,1.333,5.667,1.000,-6.333
1,Afghanistan,AFG,2020-02-18,0.750,4.250,0.750,6.500,1.500,-3.250
2,Afghanistan,AFG,2020-02-19,0.400,4.000,0.800,5.200,1.400,-1.600
3,Afghanistan,AFG,2020-02-20,0.000,3.833,0.833,5.000,1.000,-0.333
4,Afghanistan,AFG,2020-02-21,0.000,4.571,0.714,5.286,1.429,0.571
...,...,...,...,...,...,...,...,...,...
61945,Zimbabwe,ZWE,2021-05-28,32.286,57.429,-0.286,32.714,40.286,11.857
61946,Zimbabwe,ZWE,2021-05-29,33.143,58.143,-0.714,33.429,41.571,11.714
61947,Zimbabwe,ZWE,2021-05-30,34.714,60.000,-1.286,34.143,42.571,12.000
61948,Zimbabwe,ZWE,2021-05-31,35.143,61.000,-1.857,34.286,42.571,13.143


In [3]:
# Extract the relevant columns from the datasets
mobility_df = mobility_df.loc[mobility_df['Entity'] == 'India'][["Day", "retail_and_recreation", "grocery_and_pharmacy", "residential", "transit_stations", "parks", "workplaces"]]
cases_df = cases_df.loc[cases_df['iso_code'] == "IND"][["date", "new_cases"]]

In [4]:
# Merge the two datasets on the date column
d = pd.merge(cases_df, mobility_df, left_on="date", right_on="Day")
d
d = d[['new_cases', 'retail_and_recreation', 'grocery_and_pharmacy', 'residential', 'transit_stations', 'parks', 'workplaces']]
d

Unnamed: 0,new_cases,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces
0,0.0,0.667,1.667,0.000,2.000,3.000,3.000
1,0.0,0.500,1.750,0.000,2.000,3.250,3.000
2,0.0,0.400,1.800,0.200,1.800,2.800,3.200
3,0.0,0.500,2.000,0.000,2.333,3.167,3.333
4,0.0,-0.143,1.714,0.714,1.429,3.571,0.143
...,...,...,...,...,...,...,...
466,173790.0,-61.714,-25.000,24.143,-49.143,-41.000,-45.429
467,165553.0,-61.286,-24.429,23.714,-48.714,-40.000,-44.571
468,152734.0,-61.143,-24.714,23.714,-49.000,-39.143,-44.286
469,127510.0,-60.143,-23.429,23.286,-48.286,-38.000,-43.429


In [5]:
def min_max_normalize(df):
    normalized_df=(df-df.min())/(df.max()-df.min())
    return normalized_df

df = min_max_normalize(d)
df

Unnamed: 0,new_cases,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces
0,0.000000,1.000000,0.769060,0.000000,0.995580,0.989411,0.964006
1,0.000000,0.998070,0.770030,0.000000,0.995580,0.993119,0.964006
2,0.000000,0.996914,0.770614,0.006863,0.992925,0.986445,0.966806
3,0.000000,0.998070,0.772951,0.000000,1.000000,0.991888,0.968668
4,0.000000,0.990638,0.769609,0.024500,0.988000,0.997879,0.924007
...,...,...,...,...,...,...,...
466,0.419592,0.279032,0.457428,0.828432,0.316687,0.336863,0.285994
467,0.399705,0.283979,0.464100,0.813712,0.322382,0.351694,0.298006
468,0.368755,0.285632,0.460770,0.813712,0.318585,0.364404,0.301996
469,0.307855,0.297189,0.475786,0.799025,0.328063,0.381355,0.313995


In [6]:
df = df.sample(frac=1)
df
train = df.iloc[:400]
test = df.iloc[401:]

# Cart Algorithm

In [7]:
class Node:
    def __init__(self, feature=None, value=None, left=None, right=None, prediction=None):
        self.feature = feature
        self.value = value
        self.left = left
        self.right = right
        self.prediction = prediction

def mean_squared_error(y_pred, y_true):
    return ((y_pred - y_true) ** 2).mean()

def find_best_split(data, target, features):
    best_feature = (None,None)
    best_mse = float('inf')
    for feature in features:
        values = data[feature].sort_values().values
        for i in range(len(values) - 1):
            split = (values[i] + values[i+1]) / 2
            left = data[data[feature] < split]
            right = data[data[feature] >= split]
            if len(left) == 0 or len(right) == 0:
                continue
#             mse = (mean_squared_error(left[target], left[target].mean()) + mean_squared_error(right[target], right[target].mean())) 
            msel = mean_squared_error(left[target], left[target].mean())
            mser = mean_squared_error(right[target], right[target].mean())
            mse = (len(left)*msel + len(right)*mser)/(len(left) + len(right))
            if mse < best_mse:
                best_mse = mse
                best_feature = (feature, split)
    return best_feature

def predict(node, data):
    y_pred = []
    for index, row in data.iterrows():
        current_node = node
        while current_node.left and current_node.right:
            if row[current_node.feature] < current_node.value:
                current_node = current_node.left
            else:
                current_node = current_node.right
        y_pred.append(current_node.prediction)
    return y_pred

def rmse(y_pred, y_true):
    return np.sqrt(mean_squared_error(y_pred, y_true))

def evaluate_model(node, data, target):
    y_pred = predict(node, data)
    y_true = data[target].values
    mse = mean_squared_error(y_pred, y_true)
    rmse_value = rmse(y_pred, y_true)
    print("MSE: ", mse)
    print("RMSE: ", rmse_value)

def build_tree(node, data, features, target, max_depth=3, depth=0):
    if depth >= max_depth or data.shape[0] < 5:
        node.value = np.mean(data[target])
        node.prediction = np.mean(data[target])
        return
    best_feature = find_best_split(data, target, features)
    node.feature, node.value = best_feature
    node.left = Node()
    node.right = Node()
    feature_name, feature_median = best_feature
    if feature_name == None:
        return
    left_data = data[data[feature_name] < feature_median]
    right_data = data[data[feature_name] >= feature_median]
    build_tree(node.left, left_data, features, target, max_depth, depth + 1)
    build_tree(node.right, right_data, features, target, max_depth, depth + 1)


In [8]:
def print_tree(node, indent=""):
    if node is None:
        return
    if node.prediction is not None:
        print(indent + str(node.prediction))
        return

    print(indent + str(node.feature) + " < " + str(node.value))
    print_tree(node.left, indent + "|  ")
    print_tree(node.right, indent + "|  ")

### Part A

In [9]:
root = Node()
features = ['new_cases',]
target = 'retail_and_recreation'
build_tree(root, train, features, target, 3, 0)
print_tree(root)

new_cases < 5.6737520159927374e-05
|  new_cases < 2.052208175997373e-05
|  |  new_cases < 1.2071812799984547e-06
|  |  |  0.9472964148675511
|  |  |  0.9828552936592324
|  |  new_cases < 5.190879503993355e-05
|  |  |  0.9245989552031805
|  |  |  0.9452348481346216
|  new_cases < 0.020615034718533612
|  |  new_cases < 0.00032231740175958743
|  |  |  0.46989274652119645
|  |  |  0.08597026522202135
|  |  new_cases < 0.3196084869672709
|  |  |  0.5580790492838017
|  |  |  0.3751657448695264


In [10]:
target = 'retail_and_recreation'
evaluate_model(root, test, target)

MSE:  0.025377669313007015
RMSE:  0.15930370150441267


### Part B

In [11]:
root2 = Node()
features2 = ['retail_and_recreation',
#             'grocery_and_pharmacy', 'residential', 'transit_stations', 'parks', 'workplaces'
           ]
target2 = 'new_cases'
build_tree(root2, train, features2, target2, 5, 0)
print_tree(root2)

retail_and_recreation < 0.6141937497110628
|  retail_and_recreation < 0.22949701816836945
|  |  retail_and_recreation < 0.11970089223799177
|  |  |  retail_and_recreation < 0.10567010309278352
|  |  |  |  retail_and_recreation < 0.0065993250427626785
|  |  |  |  |  0.003917906844234985
|  |  |  |  |  0.005875954880392479
|  |  |  |  retail_and_recreation < 0.11144306781933334
|  |  |  |  |  0.0007870821945589925
|  |  |  |  |  0.004876408780553758
|  |  |  retail_and_recreation < 0.16097845684434375
|  |  |  |  retail_and_recreation < 0.1510736905367297
|  |  |  |  |  0.015481375607212184
|  |  |  |  |  0.007480902392150424
|  |  |  |  retail_and_recreation < 0.22123919374971104
|  |  |  |  |  0.020366489506107265
|  |  |  |  |  0.011626362907665118
|  |  retail_and_recreation < 0.283152651287504
|  |  |  retail_and_recreation < 0.23692270352734496
|  |  |  |  retail_and_recreation < 0.23444940132217648
|  |  |  |  |  0.6462403546215728
|  |  |  |  |  0.6287386404241553
|  |  |  |  ret

In [12]:
target2 = 'new_cases'
evaluate_model(root2, test, target2)

MSE:  0.04302149269647173
RMSE:  0.2074162305521719


### Part C 

In [13]:
root3 = Node()
features3 = ['retail_and_recreation','grocery_and_pharmacy', 'residential', 'transit_stations', 'parks', 'workplaces']
target3 = 'new_cases'
build_tree(root3, train, features3, target3, 5, 0)
print_tree(root3)

residential < 0.5784236351782589
|  workplaces < 0.6570042560340483
|  |  retail_and_recreation < 0.32938837779113306
|  |  |  grocery_and_pharmacy < 0.6485824802505493
|  |  |  |  transit_stations < 0.4105571263589662
|  |  |  |  |  0.09854655373888185
|  |  |  |  |  0.1303242730354332
|  |  |  |  parks < 0.1822017559470843
|  |  |  |  |  0.025443035529759433
|  |  |  |  |  0.04777125363361565
|  |  |  residential < 0.38480938818927357
|  |  |  |  grocery_and_pharmacy < 0.822202355910812
|  |  |  |  |  0.030396824630361096
|  |  |  |  |  0.09848411228717395
|  |  |  |  parks < 0.4957584386308358
|  |  |  |  |  0.1747794497296824
|  |  |  |  |  0.4230026856491167
|  |  grocery_and_pharmacy < 0.9716145468143785
|  |  |  grocery_and_pharmacy < 0.8071740756322161
|  |  |  |  retail_and_recreation < 0.6818917294623457
|  |  |  |  |  0.0318768288796392
|  |  |  |  |  2.2741737661906372e-05
|  |  |  |  residential < 0.35538551281611364
|  |  |  |  |  0.03139569169072981
|  |  |  |  |  0.0475

In [14]:
evaluate_model(root2, test, target3)

MSE:  0.04302149269647173
RMSE:  0.2074162305521719


# C4.5

In [15]:
def standard_error(column, mean):
    if len(column) <= 1:
        return np.nan
    return np.sqrt(np.sum((column - mean)**2)/(len(column) - 1))

def find_best_split_c45(data, target, features):
    best_feature = (None,None,None)
    best_sd = float('inf')
    for feature in features:
        values = data[feature].sort_values().values
        for i in range(len(values) - 1):
            split = (values[i] + values[i+1]) / 2
            left = data[data[feature] < split]
            right = data[data[feature] >= split]
            if len(left) == 0 or len(right) == 0:
                continue
            sdl = standard_error(left[target], left[target].mean())
            sdr = standard_error(right[target], right[target].mean())
            sd = (len(left)*sdl + len(right)*sdr)/(len(left) + len(right))
            if sd < best_sd:
                best_sd = sd
                best_feature = (feature, split, sd)
    return best_feature

def build_tree_c45(node, data, features, target, max_depth=3, depth=0, cvlimit = 0.1):
    if depth >= max_depth or data.shape[0] < 5:
        node.value = np.mean(data[target])
        node.prediction = np.mean(data[target])
        return
    best_feature = find_best_split_c45(data, target, features)
    node.feature, node.value, sd = best_feature
    node.left = Node()
    node.right = Node()
    feature_name, feature_median, sd = best_feature
    if feature_name is None:
        return
    if data[target].mean() == 0:
        cv = 0
    else:
        cv = (sd/data[target].mean())*100

    if cv < cvlimit:
        node.value = np.mean(data[target])
        node.prediction = np.mean(data[target])
        return
    left_data = data[data[feature_name] < feature_median]
    right_data = data[data[feature_name] >= feature_median]
    build_tree_c45(node.left, left_data, features, target, max_depth, depth + 1, cvlimit)
    build_tree_c45(node.right, right_data, features, target, max_depth, depth + 1, cvlimit)

### Part A

In [21]:
rootx = Node()
features = ['new_cases']
target = 'retail_and_recreation'
build_tree_c45(rootx, train, features, target, 4, 0, 0.1)
print_tree(rootx)

new_cases < 5.6737520159927374e-05
|  new_cases < 1.2071812799984547e-06
|  |  None < None
|  |  |  None < None
|  |  |  None < None
|  |  new_cases < 1.3278994079983003e-05
|  |  |  0.9881612808777526
|  |  |  new_cases < 3.259389455995828e-05
|  |  |  |  0.9398721741944431
|  |  |  |  0.9262487864638711
|  new_cases < 0.020615034718533612
|  |  new_cases < 0.00032231740175958743
|  |  |  new_cases < 9.053859599988411e-05
|  |  |  |  0.26306766523353825
|  |  |  |  0.5585320670730499
|  |  |  new_cases < 0.012799743111823617
|  |  |  |  0.06197178253002528
|  |  |  |  0.15967989063315216
|  |  new_cases < 0.11777743440176924
|  |  |  new_cases < 0.027900373743324287
|  |  |  |  0.4438218077670714
|  |  |  |  0.607481931795417
|  |  |  new_cases < 0.44754555902150717
|  |  |  |  0.4974221937813421
|  |  |  |  0.36086784857068244


In [23]:
target = 'retail_and_recreation'
evaluate_model(rootx, test, target)

KeyError: None

### Part B

In [24]:
rootx2 = Node()
features2 = ['retail_and_recreation',
#             'grocery_and_pharmacy', 'residential', 'transit_stations', 'parks', 'workplaces'
           ]
target2 = 'new_cases'
build_tree_c45(rootx2, train, features2, target2, 5, 0, 0.1)
print_tree(rootx2)

retail_and_recreation < 0.6141937497110628
|  retail_and_recreation < 0.22949701816836945
|  |  retail_and_recreation < 0.11970089223799177
|  |  |  retail_and_recreation < 0.00907840599140122
|  |  |  |  retail_and_recreation < 0.0065993250427626785
|  |  |  |  |  0.003917906844234985
|  |  |  |  |  0.005064125469593518
|  |  |  |  retail_and_recreation < 0.08338148952891684
|  |  |  |  |  0.006658122122597192
|  |  |  |  |  0.004191936994794634
|  |  |  retail_and_recreation < 0.16097845684434375
|  |  |  |  retail_and_recreation < 0.1510736905367297
|  |  |  |  |  0.015481375607212184
|  |  |  |  |  0.007480902392150424
|  |  |  |  retail_and_recreation < 0.22123919374971104
|  |  |  |  |  0.020366489506107265
|  |  |  |  |  0.011626362907665118
|  |  retail_and_recreation < 0.283152651287504
|  |  |  retail_and_recreation < 0.23692270352734496
|  |  |  |  retail_and_recreation < 0.23444940132217648
|  |  |  |  |  0.6462403546215728
|  |  |  |  |  0.6287386404241553
|  |  |  |  reta

In [25]:
target2 = 'new_cases'
evaluate_model(rootx2, test, target2)

MSE:  0.0432052958561202
RMSE:  0.2078588363676661


### Part C 

In [26]:
rootx3 = Node()
features3 = ['retail_and_recreation','grocery_and_pharmacy', 'residential', 'transit_stations', 'parks', 'workplaces']
target3 = 'new_cases'
build_tree_c45(rootx3, train, features3, target3, 5, 0, 0.1)
print_tree(rootx3)

residential < 0.5661736952269841
|  workplaces < 0.6800064400515204
|  |  grocery_and_pharmacy < 0.874783807787594
|  |  |  parks < 0.5868704395799964
|  |  |  |  retail_and_recreation < 0.32938837779113306
|  |  |  |  |  0.0580231950493764
|  |  |  |  |  0.15920018475293582
|  |  |  |  residential < 0.32352537487561334
|  |  |  |  |  0.07345818806918597
|  |  |  |  |  0.03790819627801868
|  |  |  residential < 0.39705932814054834
|  |  |  |  transit_stations < 0.7841384253912628
|  |  |  |  |  0.09978319024211227
|  |  |  |  |  0.22364274884145088
|  |  |  |  retail_and_recreation < 0.628230317599741
|  |  |  |  |  0.5988657324693134
|  |  |  |  |  0.432025432895207
|  |  retail_and_recreation < 0.8007662613841247
|  |  |  workplaces < 0.7550050400403203
|  |  |  |  residential < 0.25735167964862915
|  |  |  |  |  0.031133981256268716
|  |  |  |  |  0.049261607450400945
|  |  |  |  residential < 0.26471193768657997
|  |  |  |  |  0.045156305832134196
|  |  |  |  |  0.13131683472929478

In [27]:
target3 = 'new_cases'
evaluate_model(rootx3, test, target3)

MSE:  0.004220392713252612
RMSE:  0.0649645496655877
