In [None]:
import warnings
# Silence every warning for the rest of the session: no category, module or
# message filter is given, so nothing is let through.
warnings.filterwarnings("ignore")

# Load data

In [None]:
import re
import csv
import itertools
import numpy as np

In [None]:
def load_data(path, limit=None, encoding=None):
    """Read a comma-separated file into a header row and an array of rows.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.
    limit : int or None, optional
        Maximum number of rows to read after the header line. ``None``
        (the default) reads every remaining row.
    encoding : str or None, optional
        Text encoding handed to ``open``. ``None`` keeps the platform
        default, which is what happens when the argument is omitted.

    Returns
    -------
    header : list of str
        Fields of the first line of the file.
    data : numpy.ndarray
        The rows that follow the header; every field is kept as a string.

    Raises
    ------
    StopIteration
        If the file has no first line to use as header (raised by ``next``).
    """
    # newline='' is what the csv module asks for: the reader then handles
    # line endings itself, so a quoted field containing "\r\n" is preserved.
    with open(path, newline='', encoding=encoding) as file:
        reader = csv.reader(file, delimiter=',')
        header = next(reader)
        # csv.reader already yields each row as a list of strings.
        data = np.asarray(list(itertools.islice(reader, limit)))
    return header, data

In [None]:
# Load the whole training file (no row limit is passed).
header, data = load_data("data/training.csv")

In [None]:
# Dimensions of the array that was just loaded.
print(data.shape)

# Data preprocessing

### Remove features

In [None]:
def print_sample(header, data):
    """Print the first row of ``data`` next to its column names.

    One line is written per column: its position, its name padded to 30
    characters, and the value found in ``data[0]``.
    """
    first_row = data[0]
    template = "({:^2d}) {:30} : {}"
    position = 0
    for name, field in zip(header, first_row):
        print(template.format(position, name, field))
        position += 1

In [None]:
def print_feature(header, data, max_feature=5):
    """Print the most frequent values of every column of ``data``.

    For each column a banner with the column name and its number of distinct
    values is printed, followed by at most ``max_feature`` lines of the form
    ``value : count (share of rows)``, most frequent first.

    Parameters
    ----------
    header : sequence of str
        Column names, indexed like the columns of ``data``.
    data : numpy.ndarray
        Two-dimensional array with one row per sample.
    max_feature : int, optional
        Upper bound on the number of values listed per column.
    """
    n_samples = data.shape[0]
    for n_feature, feature in enumerate(data.T):
        values, counts = np.unique(feature, return_counts=True)
        # Reverse-sorting (count, value) pairs puts the most frequent values
        # first; equal counts are ordered by value, largest first.
        by_frequency = sorted(zip(counts, values), reverse=True)
        print("-" * 50)
        print("{} ({})".format(header[n_feature], len(values)))
        print("-" * 50)
        # Slicing guarantees that exactly max_feature lines (or fewer) are
        # printed; a negative bound prints nothing.
        for count, value in by_frequency[:max(max_feature, 0)]:
            print("{:10} : {:10} ({:5.1%})".format(value, count,
                                                   count / n_samples))

In [None]:
# Summarise every column with the default number of most frequent values.
print_feature(header, data)

We can remove the features that have no value in over 50% of the samples. We keep Weather because it is discrete and can easily be replaced by a one-hot vector. We also need to remove Withdrawals, which is not available in the test data.

In [None]:
# Column positions to drop; the same positions are removed from the data
# array (along its column axis) and from the header.
columns_to_drop = [7, 9, 10, 14]
data = np.delete(data, columns_to_drop, axis=1)
header = np.delete(header, columns_to_drop)

### Convert Date (feature 0)

In [None]:
def convert_date(data, index):
    """Expand the date column of every row into separate string fields.

    The value at ``index`` is split on ``-``, space and ``:`` and the last
    piece is dropped. For example ``"2015-04-15 13:00"`` becomes
    ``["2015", "04", "15", "13"]``.

    Parameters
    ----------
    data : iterable of sequences of str
        One sequence per sample.
    index : int
        Position of the date field inside each row.

    Returns
    -------
    numpy.ndarray
        Rows in which the date field is replaced by its pieces; the columns
        before and after ``index`` are kept unchanged.
    """
    # A character class rather than an alternation ending in an empty branch:
    # since Python 3.7 re.split also splits on empty matches, and such a
    # pattern cuts the date into single characters.
    separators = re.compile(r"[- :]")
    new_data = []
    for row in data:
        date_fields = separators.split(row[index])[:-1]
        # Keep the leading columns too, the same way convert_weather does.
        new_data.append(
            list(row[:index]) + date_fields + list(row[index + 1:]))
    return np.asarray(new_data)

In [None]:
# The date is the first column of each row.
index_date = 0
data = convert_date(data, index_date)
# Four names take the place of the single date column name; the names that
# follow it are kept as they are.
header = ["Year", "Month", "Day", "Hour"] + list(header[index_date + 1:])

### Convert Weather (feature 12)

In [None]:
def convert_weather(data, weather, index):
    """Replace the weather column by one 0/1 indicator per label.

    A label's indicator is 1 when the label occurs, ignoring case, as a
    substring of the row's weather text, and 0 otherwise. Because the match
    is a substring test, a short label is also set by any longer text that
    contains it.

    Parameters
    ----------
    data : iterable of sequences of str
        One sequence per sample.
    weather : iterable of str
        Labels to look for; one indicator is emitted per label, in order.
    index : int
        Position of the weather text inside each row.

    Returns
    -------
    numpy.ndarray
        Rows in which the weather field is replaced by the indicators; the
        columns before and after ``index`` are kept unchanged.
    """
    # Lowercase the labels once instead of once per row.
    labels = [label.lower() for label in weather]
    new_data = []
    for row in data:
        observed = row[index].lower()
        flags = [1 if label in observed else 0 for label in labels]
        new_data.append(list(row[:index]) + flags + list(row[index + 1:]))
    return np.asarray(new_data)

In [None]:
index_weather = 11
# Distinct labels found in the weather column once each cell is split on
# commas; shown as the cell output.
set(l for s in set(data.T[index_weather]) for l in s.split(","))

In [None]:
# Labels that each become one 0/1 column. convert_weather matches them as
# case-insensitive substrings, so e.g. 'Pluie' is also set for 'Pluie forte'.
weather = [
    'Orages', 'Brouillard', 'Bruine', 'Généralement dégagé',
    'Généralement nuageux', 'Pluie', 'Pluie modérée', 'Pluie forte', 'Dégagé',
    'Nuageux', 'Neige'
]
data = convert_weather(data, weather, index_weather)
# Mirror the column change in the header: one name per label replaces the
# single weather column name.
header = list(header[:index_weather]) + weather + list(
    header[index_weather + 1:])

### Convert data type to float

In [None]:
# samples with at least one missing value
missing = [d for d in data if '' in d]
print(len(missing))

# number of class 1 with missing value
print(sum(['1' in d[-1] for d in missing]))

Let's remove the samples with missing values, since only one hundred of them have label 1.

In [None]:
def convert_type(data):
    """Turn rows of numeric strings into a float array.

    Rows containing an empty string are left out. In the remaining rows a
    decimal comma is rewritten as a decimal point before each field is
    parsed with ``float``.
    """
    rows = []
    for sample in data:
        if "" in sample:
            continue
        rows.append([float(field.replace(",", ".")) for field in sample])
    return np.asarray(rows)

In [None]:
# Drop the rows that contain an empty field and parse every remaining field
# as a float.
data = convert_type(data)

In [None]:
# Show the first converted row, one column per line.
print_sample(header, data)

In [None]:
# Dimensions of the array after the conversion to float.
print(data.shape)

# Data analysis & visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def plot_features(header, data):
    """Draw one histogram per column of ``data`` on a four-column grid.

    Parameters
    ----------
    header : sequence of str
        Column names, used as panel titles.
    data : numpy.ndarray
        Two-dimensional array with one row per sample; each column gets its
        own panel.
    """
    cols = 4
    n_features = data.shape[1]
    # Ceiling division gives just enough rows, and at least one so that
    # plt.subplots always receives a valid grid.
    n = max(1, -(-n_features // cols))
    # squeeze=False keeps the axes array two-dimensional even when a single
    # row is enough; otherwise a 1-D array would come back.
    fig, ax = plt.subplots(
        nrows=n, ncols=cols, figsize=(cols * 6, n * 6), squeeze=False)
    # Row-major flattening visits the panels left to right, top to bottom.
    panels = ax.ravel()
    used = 0
    for panel, name, feature in zip(panels, header, data.T):
        panel.hist(feature)
        panel.set_title(name)
        used += 1
    # Hide the panels of the last row that received no histogram.
    for panel in panels[used:]:
        panel.set_visible(False)
    plt.show()

In [None]:
# Histogram of every column of the converted array.
plot_features(header, data)

# Logistic regression

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# The last column is the target; every column before it is a feature.
x, y = data[:, :-1], data[:, -1]

In [None]:
# Split first so that the held-out rows play no part in the preprocessing.
# The fixed random_state makes the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, stratify=y, random_state=42)

# Learn each feature's min/max from the training rows only, then apply that
# same scaling to the held-out rows. `x` itself is left unscaled.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Class 1 is given six times the weight of class 0 during fitting.
model = LogisticRegression(max_iter=9999, class_weight={0: 1, 1: 6})
model.fit(x_train, y_train)
# Per-sample class probabilities for the test rows; the cells below read
# column 1 of each row.
pred = model.predict_proba(x_test)

In [None]:
# Sweep the decision threshold from 0.00 to 0.99 and keep the best F1 score
# seen on the test rows; thresholds that yield no positive prediction are
# skipped.
best_f1 = -1
for i in range(100):
    threshold = i / 100
    y_pred = [int(proba[1] > threshold) for proba in pred]
    if 1 in y_pred:
        best_f1 = max(best_f1, f1_score(y_test, y_pred))
print(best_f1)

In [None]:
# F1 score when the second probability column is cut at 0.5.
y_pred = [int(proba[1] > 0.5) for proba in pred]
print(f1_score(y_test, y_pred))

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

# Small fully connected binary classifier: two hidden ReLU layers followed by
# a single sigmoid output.
# NOTE: this rebinds `model`, so the logistic regression fitted earlier is no
# longer reachable under that name.
model = Sequential()
# The input width follows the feature matrix instead of a hard-coded 24, so
# the network keeps working when preprocessing adds or drops a column.
model.add(Dense(64, activation='relu', input_shape=(x_train.shape[1], )))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam())

# Class 1 is given ten times the weight of class 0 during training.
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=5,
    verbose=1,
    class_weight={
        0: 1,
        1: 10
    })
pred = model.predict(x_test)
# Same threshold sweep as for the logistic regression: try 0.00 to 0.99 and
# keep the best F1 score, skipping thresholds with no positive prediction.
best_f1 = -1
for i in range(0, 100):
    y_pred = [1 if y > i / 100 else 0 for y in pred]
    if 1 in y_pred:
        best_f1 = max(best_f1, f1_score(y_test, y_pred))
print(best_f1)