In [1]:
import sys
# Fail early, with an explanation, on interpreters that are too old for this notebook.
assert sys.version_info >= (3, 7), (
    "Python 3.7+ is required (datetime.datetime.fromisoformat is used); "
    "found {}.{}".format(sys.version_info[0], sys.version_info[1])
)

Note: Python 3.7 or later is required in order to use `datetime.datetime.fromisoformat()`.

In [2]:
import csv 
import numpy as np
from functions import split
from functions import pipeline
from functions import load_data
from functions import compute_f1
from functions import corr_matrix
from functions import plot_feature
from functions import print_sample
from functions import convert_date
from functions import convert_type
from functions import print_feature
from functions import remove_missing
from functions import delete_feature
from functions import sort_by_station
from functions import convert_one_hot
from functions import convert_weather
from functions import sort_by_duration
from functions import feature_output_corr
from functions import normalization_feature
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Loading and Preprocessing :

* path : (STRING) path of the file to load.
* limit : (INT) maximum number of examples to load.
* delete_features : (LIST) names of the features to remove.
* cvrt_date : (BOOLEAN) whether to convert the date.
* weather : (LIST) weather categories to consider. All others will be dropped.
* one_hot_features : (LIST) names of the features to convert into one-hot vectors.
* norm_features : (LIST) names of the features to normalize.
* missing_features : (LIST) features whose missing values are to be replaced.
* missing_values : (LIST) values with which to replace the missing values.

### Training file:

In [3]:
# Weather categories handed to the pipeline as `weather`; per the parameter
# notes above, every other category is dropped.
weather_kept = [
    "Orages",
    "Brouillard",
    "Bruine",
    "Généralement dégagé",
    "Généralement nuageux",
    "Pluie",
    "Pluie modérée",
    "Pluie forte",
    "Dégagé",
    "Nuageux",
    "Neige",
]

# Load and preprocess the training file (no feature normalisation requested).
header, x, y, label = pipeline(
    path="data/training.csv",
    norm_features=[],
    weather=weather_kept,
)

Data loaded (3.2s)
Visility indicator deleted (3.8s)
hmdx deleted (4.0s)
Wind Chill deleted (5.3s)
Date splited in Year/Month/Day/Hour/Weekday (5.9s)
Weekday converted in one-hot vector (11.1s)
Weather converted (14.3s)
Replace missing values (0.2s)
Remove samples with missing values (0.5s)
Data converted to float (8.1s)
Sort data according to station code (0.4s)
split data into x, y, and label (35.6s)


In [4]:
# Regroup the training data per station. The modelling cell indexes
# x_stations / y_stations / label_stations with the position of each entry of
# `stations`, so the four results are used as parallel sequences.
header, stations, x_stations, y_stations, label_stations = sort_by_station(header, x, y, label)

### Test file:

In [5]:
# Load and preprocess the test file. With test=True only a header and the
# feature rows are unpacked (no y / label).
# NOTE(review): unlike the training call, no `weather` list is passed here, so
# pipeline's default applies — confirm both files keep the same categories.
header_test, x_test = pipeline(path="data/test.csv", norm_features=[], test=True)

Data loaded (0.6s)
Visility indicator deleted (1.2s)
hmdx deleted (1.2s)
Wind Chill deleted (2.1s)
Date splited in Year/Month/Day/Hour/Weekday (1.5s)
Weekday converted in one-hot vector (2.7s)
Weather converted (2.9s)
Replace missing values (0.0s)
Remove samples with missing values (0.1s)
Data converted to float (1.3s)
Sort data according to station code (0.1s)


In [6]:
# Regroup the test rows per station (no y / label for the test file).
# NOTE(review): this rebinds the global `header` (previously the training
# header) to the header returned for the test data; the modelling cell later
# passes `header` to sort_by_duration together with training data — confirm
# both headers are identical, or keep them under separate names.
header, stations_test, x_test_stations  = sort_by_station(header_test, x_test)

In [7]:
# Training and test data must list the same stations in the same order,
# because the modelling cell pairs them by position.
# Written as a statement (not `assert(...)`) so that adding a message can never
# turn the condition into an always-true tuple.
assert stations == stations_test, "training and test station lists differ"

# One classification model per station

In [10]:
# Column positions used to rebuild the submission id from a preprocessed test row.
yi = header_test.index("Year")  # NOTE: not used below, the year is hard-coded in the id
mi = header_test.index("Month")
di = header_test.index("Day")
hi = header_test.index("Hour")
si = header_test.index("Station Code")

# When True, a per-station decision threshold is tuned on a held-out validation
# split; when False the classifier's own `predict` is used.
COMPUTE_THRESHOLD = True
# Share of each station's time-sorted samples used for fitting while the
# threshold is tuned; the remaining samples form the validation set.
TRAIN_FRACTION = 0.8
# Fixed seed so the random forest yields the same predictions on every run.
RANDOM_SEED = 42

# newline="" follows the csv module's recommendation, so the writer's
# lineterminator is written as-is on every platform.
with open("data/results.csv", "w", newline="") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(["id","volume"])

    for i, _station in enumerate(stations):
        print ("\r{}/{}".format(i+1, len(stations)), end="")
        # Nothing to predict when the test set has no recording for this station.
        # len() is used so the check works for lists and numpy arrays alike.
        if len(x_test_stations[i]) == 0:
            continue

        # MODELS ALREADY TESTED
        # model = LogisticRegression(max_iter=9999, class_weight="balanced", solver="lbfgs")
        # model = SVC(kernel="linear",  class_weight="balanced")
        # model = LogisticRegression(penalty='l1', max_iter=9999, class_weight="balanced", solver="saga", n_jobs=-1)
        # model = AdaBoostClassifier(LogisticRegression(max_iter=9999, class_weight="balanced", solver="lbfgs", n_jobs=-1), n_estimators=100)
        model = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight="balanced",
                                       random_state=RANDOM_SEED)

        # Stays None when no threshold is requested or none can be estimated.
        threshold = None

        if COMPUTE_THRESHOLD:
            # sort by time, so the validation set is the most recent part
            _x, _y, _label = sort_by_duration(header, x_stations[i], y_stations[i], label_stations[i])
            # create validation set
            # (named split_idx / best_f1 so the imported `split` and `f1_score` stay usable)
            split_idx = int(TRAIN_FRACTION * len(_x))
            x_train, x_valid = _x[:split_idx], _x[split_idx:]
            label_train, label_valid = _label[:split_idx], _label[split_idx:]

            # Column 1 of predict_proba only exists when the training part
            # contains both classes; otherwise fall back to plain predict().
            if len(np.unique(label_train)) >= 2:
                # Fit on the training part ONLY: fitting on all samples would let
                # the model see the validation data and bias the threshold.
                model = model.fit(x_train, label_train)
                proba_valid = list(zip(*model.predict_proba(x_valid)))[1]
                # compute best threshold
                best_f1, threshold = compute_f1(proba_valid, label_valid)

        # (re-)train the model on every sample of the station
        model = model.fit(x_stations[i],  label_stations[i])

        # predict labels
        if threshold is not None:
            proba_test = list(zip(*model.predict_proba(x_test_stations[i])))[1]
            label_test = [1 if p > threshold else 0 for p in proba_test]
        else:
            label_test = model.predict(x_test_stations[i])

        # write prediction in file
        # (the inner loop no longer reuses `i`, the station index of the outer loop)
        for row, pred in zip(x_test_stations[i], label_test):
            d = "2016-{:02d}-{:02d}_{:02d}:00_{:4d}".format(int(row[mi]),int(row[di]),int(row[hi]),int(row[si]))
            writer.writerow([d, str(bool(pred))])
print("")
print("Done!")

182/182
Done!
