In [None]:
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# category (including DeprecationWarning and FutureWarning emitted by the
# libraries used below) is hidden for the rest of the notebook.
warnings.filterwarnings("ignore")

# Load data

In [None]:
import re
import csv
import numpy as np

In [None]:
def load_data(path):
    """Read a comma-separated file into a header row and an array of rows.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file. Its first row is treated as the header.

    Returns
    -------
    header : list of str
        The fields of the first row.
    data : numpy.ndarray
        All remaining rows, converted with ``np.asarray``. Values are kept
        exactly as the strings read from the file; no type conversion is done.

    Raises
    ------
    StopIteration
        If the file contains no rows at all (there is no header to read).
    """
    # The csv module expects files opened with newline="" so that it can do
    # its own newline handling; otherwise line breaks embedded in quoted
    # fields are rewritten by the text layer before the reader sees them.
    with open(path, newline="") as file:
        reader = csv.reader(file, delimiter=",")
        # The first row holds the column names.
        header = next(reader)
        # Each remaining row is already a list of strings.
        data = np.asarray(list(reader))
    return header, data

In [None]:
# Read the training set: `header` receives the first CSV row (column names),
# `data` the remaining rows as an array of strings.
header, data = load_data("data/training.csv")

# Data preprocessing

### Remove empty values

In [None]:
def print_sample(header, data):
    """Print the first row of ``data``, one line per column.

    Each line has the form ``(index) name : value`` where ``name`` comes from
    ``header`` and ``value`` from ``data[0]``. Pairing stops at the shorter of
    the two sequences.
    """
    template = "({:^2d}) {:30} : {}"
    first_row = data[0]
    for position, pair in enumerate(zip(header, first_row)):
        name, cell = pair
        print(template.format(position, name, cell))

In [None]:
def print_feature(header, data, max_feature=5):
    """Print the most frequent values of every column of ``data``.

    For each column a banner with the column name is printed, followed by up
    to ``max_feature`` lines of the form ``value : count (share)``, where
    ``share`` is the count divided by the number of rows in ``data``.

    Parameters
    ----------
    header : sequence of str
        Column names; ``header[k]`` labels column ``k`` of ``data``.
    data : numpy.ndarray
        Two-dimensional array with one row per sample.
    max_feature : int, optional
        Maximum number of distinct values listed per column (default 5).
        A value of zero or less lists none.
    """
    n_samples = data.shape[0]
    rule = "-" * 50
    for n_feature, feature in enumerate(data.T):
        values, counts = np.unique(feature, return_counts=True)
        # Most frequent first; equal counts are ordered by value, descending.
        ranked = sorted(zip(counts, values), reverse=True)
        print(rule)
        print("{}".format(header[n_feature]))
        print(rule)
        for rank, (count, value) in enumerate(ranked):
            # ">=" so that exactly max_feature lines are printed, not one more.
            if rank >= max_feature:
                break
            print("{:10} : {:10} ({:5.1%})".format(value, count, count / n_samples))

In [None]:
# List the most frequent values of every column, with their share of the
# samples, to see which columns are dominated by empty entries.
print_feature(header, data)

We can remove the features that have no value in more than 50% of the samples. We keep Weather because it is discrete and can easily be replaced with a one-hot vector.

In [None]:
# Drop the entries at positions 7, 9 and 10 from the header and the matching
# columns (axis 1) from the data so that both stay aligned.
header = np.delete(header, [7, 9, 10])
data = np.delete(data, [7, 9, 10], axis=1)

### Convert Date (feature 0)

In [None]:
def convert_date(data, index):
    """Split the date column of every row into its separate components.

    The string in column ``index`` is split on ``-``, space and ``:``; the
    resulting pieces take the place of that column, with the columns before
    and after it left unchanged.

    Parameters
    ----------
    data : iterable of sequences of str
        Rows to convert, e.g. a two-dimensional array of strings.
    index : int
        Position of the date column within each row. Negative values count
        from the end of the row.

    Returns
    -------
    numpy.ndarray
        The converted rows, built with ``np.asarray``.
    """
    # A character class avoids an empty alternative in the pattern: since
    # Python 3.7 re.split also splits on empty matches, which would cut the
    # date between every single character.
    separator = re.compile(r"[- :]")
    new_data = []
    for row in data:
        cells = list(row)
        # Normalise negative positions so the slices below stay correct.
        position = index if index >= 0 else index + len(cells)
        parts = separator.split(cells[position])
        new_data.append(cells[:position] + parts + cells[position + 1:])
    return np.asarray(new_data)

In [None]:
# Expand column 0 (the date) of every row into its components.
data = convert_date(data, 0)
# Replace the single date name with five component names so the header keeps
# matching the data; assumes every date splits into exactly five parts
# (year, month, day, hour, minute) - TODO confirm against the file.
header = ["Year", "Month", "Day", "Hour", "Minute"] + list(header[1:])

### Convert Weather (feature 12)

In [None]:
# Show the distinct values found in column 12 (the Weather feature according
# to the section heading).
print(set(data.T[12]))

In [None]:
# Print the first row next to the column names to check the current layout.
print_sample(header, data)

# Data analysis & visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
