In [1]:
import sys
assert sys.version_info >= (3, 7)


Note that you need python 3.7 to have use datetime.datetime.fromisoformat()

In [2]:
import csv
import numpy as np
from functions import split
from functions import pipeline
from functions import load_data
from functions import compute_f1
from functions import corr_matrix
from functions import plot_feature
from functions import print_sample
from functions import convert_date
from functions import convert_type
from functions import print_feature
from functions import remove_missing
from functions import delete_feature
from functions import sort_by_station
from functions import convert_one_hot
from functions import convert_weather
from functions import sort_by_duration
from functions import feature_output_corr
from functions import normalization_feature
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier


# Loading and Preprocessing :

* path : (STRING) path of the file to load.
* limit : (INT) limit the number of example to load.
* delete_features : (LIST) feature names to remove.
* cvrt_date : (BOOLEAN) convert the data
* weather : (LIST) weather to consider. All other will be dropped.
* one_hot_features : (LIST) feature names to convert in one-hot vector.
* norm_features : (LIST) feature names to normalize in one-hot vector
* missing_features (LIST) feature which missing values are to replace 
* missing_values   (LIST) value with which to replace the missing values

### Training file:

In [3]:
header, x, y, label = pipeline(path="data/training.csv",
                               norm_features=[],
                               one_hot_features=[],
                               weather_coef = [0, 5, 4, 9, 8, 2, 3, 0, 10, 7, 0])


Data loaded (19.4s)
Visility indicator deleted (30.3s)
hmdx deleted (26.9s)
Wind Chill deleted (26.7s)
Date splited in Year/Month/Day/Hour/Weekday (24.4s)
Weather converted (22.3s)
Weather rescaled (96.9s)
Replace missing values (0.2s)
Remove samples with missing values (0.4s)
Data converted to float (9.1s)
Sort data according to station code (0.6s)
split data into x, y, and label (32.0s)


In [4]:
header, stations, x_stations, y_stations, label_stations = sort_by_station(
    header, x, y, label)


### Test file:

In [5]:
header_test, x_test = pipeline(path="data/test.csv",
                               norm_features=[],
                               one_hot_features=[],
                               test=True,
                               weather_coef = [0, 5, 4, 9, 8, 2, 3, 0, 10, 7, 0])


Data loaded (0.7s)
Visility indicator deleted (1.4s)
hmdx deleted (1.4s)
Wind Chill deleted (2.3s)
Date splited in Year/Month/Day/Hour/Weekday (1.7s)
Weather converted (3.6s)
Weather rescaled (22.4s)
Replace missing values (0.0s)
Remove samples with missing values (0.1s)
Data converted to float (1.4s)
Sort data according to station code (0.1s)


In [6]:
header, stations_test, x_test_stations = sort_by_station(header_test, x_test)


In [7]:
assert (stations == stations_test)


# Model logistic par station

In [8]:
def fill_missing_value(header, data):
    ind_row = 0
    ind_col = header.index("Weather_Coef")
    for ind, d in enumerate(data):
        if d[ind_col] == -1:
            continue
        else:
            for inter in range(ind_row+1, ind):
                alpha = (inter-ind_row)/(ind-ind_row)
                data[inter][ind_col] = (1-alpha)*data[ind_row][ind_col]+alpha*data[ind][ind_col]
            ind_row = ind
    return data

In [9]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 400, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 30, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5]
# Method of selecting samples for training each tree
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
unique_model = True

if unique_model:
    
    _x, _y, _label = sort_by_duration(header, x, y, label)
    split = int(0.8 * len(_x))

    model = RandomizedSearchCV(estimator = RandomForestClassifier(class_weight="balanced", n_jobs=1), 
                                   param_distributions = random_grid, 
                                   n_iter = 50, 
                                   cv = zip([range(split)],[range(split,len(_x))]), 
                                   verbose=2, 
                                   n_jobs = -1)

    if "Weather_Coef" in header:
        _x = fill_missing_value(header, _x)

    # train model
    model = model.fit(_x, _label)
    print(model.best_params_)

else:

    best_params_stations = []

    for i in range(len(stations)):

        _x, _y, _label = sort_by_duration(header, x_stations[i],y_stations[i], label_stations[i])
        split = int(0.8 * len(_x))

        model = RandomizedSearchCV(estimator = RandomForestClassifier(class_weight="balanced", n_jobs=-1), 
                                       param_distributions = random_grid, 
                                       n_iter = 50, 
                                       cv = zip([range(split)],[range(split,len(_x))]), 
                                       verbose=0, 
                                       n_jobs = -1)

        if "Weather_Coef" in header:
            _x = fill_missing_value(header, _x)

        # train model
        model = model.fit(_x, _label)
        best_params_stations.append(model.best_params_)
        print("\r{}/{}".format(i+1, len(stations)),end="")


In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 400, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 30, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5]
# Method of selecting samples for training each tree
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}



Fitting 1 folds for each of 50 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 43.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 80.2min finished


{'n_estimators': 146, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 22}


In [13]:
yi = header_test.index("Year")
mi = header_test.index("Month")
di = header_test.index("Day")
hi = header_test.index("Hour")
si = header_test.index("Station Code")

COMPUTE_THRESHOLD = True
f1_train, f1_val = [], []

with open("data/results.csv", "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(["id", "volume"])

    for i, s in enumerate(stations):
        print("\r{}/{} ({:.2f}, {:.2f})".format(
            i + 1, len(stations),
            np.mean(f1_train) if f1_train or not COMPUTE_THRESHOLD else 0,
            np.mean(f1_val) if f1_val or not COMPUTE_THRESHOLD else 0),
              end="")
        # if empty (no recording for that station in test set)
        if not x_test_stations[i]:
            continue

        # MODELS ALREADY TESTED
#         model = LogisticRegression(max_iter=9999, class_weight="balanced", solver="lbfgs")
#         model = SVC(kernel="linear",  class_weight="balanced")
#         model = LogisticRegression(penalty='l1', max_iter=9999, class_weight="balanced", solver="saga", n_jobs=-1)
#         model = AdaBoostClassifier(LogisticRegression(max_iter=9999, class_weight="balanced", solver="lbfgs", n_jobs=-1), n_estimators=100)
#         model = RandomForestClassifier(n_estimators=200, max_depth=None, n_jobs=-1, class_weight="balanced")
#         model = RandomForestClassifier(n_estimators=best_params_stations[i]['n_estimators'],
#                                        max_features=best_params_stations[i]['max_features'],
#                                        max_depth=best_params_stations[i]['max_depth'],
#                                        min_samples_split=best_params_stations[i]['min_samples_split'],
#                                        min_samples_leaf=best_params_stations[i]['min_samples_leaf'],
#                                        n_jobs=-1, class_weight="balanced")
        model = RandomForestClassifier(n_estimators=146,
                                       max_features='log2',
                                       max_depth=22,
                                       min_samples_split=3,
                                       min_samples_leaf=5,
                                       n_jobs=-1, class_weight="balanced")


        if COMPUTE_THRESHOLD:
            # sort by time
            _x, _y, _label = sort_by_duration(header, x_stations[i],
                                              y_stations[i], label_stations[i])
            
            if "Weather_Coef" in header:
                _x = fill_missing_value(header, _x)
            
            # create validation set
            split = int(0.8 * len(_x))
            x_train, x_valid = _x[:split], _x[split:]
            y_train, y_valid = _y[:split], _y[split:]
            label_train, label_valid = _label[:split], _label[split:]
            # train model
            model = model.fit(x_train, label_train)
    
            # predict the probabilities for train and validation set
            proba_train = list(zip(*model.predict_proba(x_train)))[1]
            proba_valid = list(zip(*model.predict_proba(x_valid)))[1]
            
            # compute best threshold on the validation set
            f1_score, threshold = compute_f1(proba_valid, label_valid)
            # print running average of f1-score for both train and eval
            f1_val.append(f1_score)
            f1_train.append(
                np.mean([
                    int(int(p > threshold) == l)
                    for p, l in zip(proba_train, label_train)
                ]))
            
            # re-train model
            model = model.fit(_x, _label)

            # predict labels
            _x_test, _, _ = sort_by_duration(header_test, x_test_stations[i])
            
            if "Weather_Coef" in header_test:
                _x_test = fill_missing_value(header_test, _x_test)
    
            proba_test = list(zip(*model.predict_proba(_x_test)))[1]
            label_test = [1 if p > threshold else 0 for p in proba_test]
        else:
            # train model
            model = model.fit(_x, _label)
            # predict labels
            label_test = model.predict(_x_test)

        # write prediction in file
        for i, (e, p) in enumerate(zip(x_test_stations[i], label_test)):
            d = "2016-{:02d}-{:02d}_{:02d}:00_{:4d}".format(
                int(e[mi]), int(e[di]), int(e[hi]), int(e[si]))
            writer.writerow([d, str(bool(p))])
print("")
print("Done!")


182/182 (0.96, 0.50)
Done!
