All imports 
----

In [11]:
import os
import numpy as np
import pandas as pd

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

Useful functions
---

In [1]:
class AttributeDict(dict):
    """Dictionary whose items can also be accessed as attributes.

    ``d.key`` reads ``d["key"]``, ``d.key = v`` writes ``d["key"] = v`` and
    ``del d.key`` removes the item. Accessing or deleting a missing key
    through attribute syntax raises ``AttributeError`` (rather than
    ``KeyError``) so that ``hasattr(d, name)`` and
    ``getattr(d, name, default)`` behave as they do for ordinary objects.
    """

    def __getattr__(self, name):
        # Python only calls this hook after normal attribute lookup has
        # failed, so real dict methods (``keys``, ``items``, ...) still win.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    # Attribute assignment stores the value as a dictionary item.
    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

def score_f1(y_true, y_pred, threshold):
    """Return the F1 score of ``y_pred`` against ``y_true`` after thresholding.

    Both inputs are turned into boolean labels with ``value > threshold``
    (strictly greater), and the two label sets are passed to
    ``sklearn.metrics.f1_score``.
    """
    true_above = y_true > threshold
    pred_above = y_pred > threshold
    return sklearn.metrics.f1_score(true_above, pred_above)

def score_regression(y_true, y_pred, thresholds=(500, 1400, 5000, 10000)):
    """Average thresholded F1 score of a regression prediction.

    For every value in ``thresholds`` the targets and predictions are
    binarised by ``score_f1`` and an F1 score is computed; the mean of those
    scores is returned.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values, forwarded unchanged to ``score_f1``.
    thresholds : iterable of numbers, optional
        Cut-off values to evaluate. Defaults to ``(500, 1400, 5000, 10000)``.

    Raises
    ------
    ValueError
        If ``thresholds`` is empty (the mean would otherwise be NaN).
    """
    thresholds = tuple(thresholds)
    if not thresholds:
        raise ValueError("thresholds must contain at least one value")
    return np.mean([score_f1(y_true, y_pred, th) for th in thresholds])

Load the datasets into DataFrames and split off a validation set
---

In [5]:
datasets_path = os.path.join(os.curdir, "datasets")

# Fixed seed so the shuffled train/validation split, and every score computed
# from it, is identical on each Restart Kernel -> Run All.
RANDOM_SEED = 42
# Fraction of the labelled rows kept for training; the rest is validation.
TRAIN_FRACTION = .8

# Labelled data: X1.csv is read with its header row, Y1.csv has no header.
X_labelled = pd.read_csv(os.path.join(datasets_path, 'X1.csv'))
y_labelled = pd.read_csv(os.path.join(datasets_path, 'Y1.csv'), header=None)

X_train, X_val, y_train, y_val = train_test_split(
    X_labelled,
    y_labelled,
    train_size=TRAIN_FRACTION,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# One container for all splits; the test split has features only (no y).
dataset = AttributeDict(
    train=AttributeDict(
        X=X_train,
        y=y_train
    ),
    validation=AttributeDict(
        X=X_val,
        y=y_val
    ),
    test=AttributeDict(
        X=pd.read_csv(os.path.join(datasets_path, 'X2.csv')),
        y=None
    )
)

Linear regression on the 58 train features
---

In [6]:
# Raw NumPy views of the training split used to fit the model.
train_features = dataset.train.X.values
train_targets = dataset.train.y.values

linear_regression_cf = LinearRegression()
# Kept as the last expression so the cell still displays the fitted estimator.
linear_regression_cf.fit(train_features, train_targets)

LinearRegression()

Inference and scoring on the validation and train sets
---

In [13]:
# Predict on both splits so train and validation results can be compared.
y_val_pred = linear_regression_cf.predict(dataset.validation.X.values)
y_train_pred = linear_regression_cf.predict(dataset.train.X.values)


# Mean thresholded F1 score (see score_regression); higher is better.
val_score = score_regression(dataset.validation.y.values, y_val_pred)
train_score = score_regression(dataset.train.y.values, y_train_pred)

print(f"Score on the validation set : {val_score:.2e}")
print(f"Score on the train set      : {train_score:.2e}")


# Mean squared error is an error measure (lower is better), not an accuracy,
# so it is named and labelled as MSE.
val_mse = mean_squared_error(dataset.validation.y.values, y_val_pred)
train_mse = mean_squared_error(dataset.train.y.values, y_train_pred)

# Previous variable names kept as aliases in case later cells still read them.
val_acc, train_acc = val_mse, train_mse

print(f"MSE on the validation set : {val_mse:.2e}")
print(f"MSE on the train set      : {train_mse:.2e}")

Score on the validation set : 4.84e-01
Score on the train set      : 4.91e-01
Accuracy on the validation set : 1.72e+17
Accuracy on the train set      : 1.53e+08
