In [1]:
import sys

# Fail fast, with an explicit message, when the interpreter is too old:
# a bare assert would only raise an empty AssertionError.
assert sys.version_info >= (3, 7), (
    "Python 3.7 or later is required (datetime.datetime.fromisoformat is used)"
)

Note that you need Python 3.7 or later to use `datetime.datetime.fromisoformat()`.

In [2]:
import csv 
import numpy as np
from functions import split
from functions import pipeline
from functions import load_data
from functions import compute_f1
from functions import corr_matrix
from functions import plot_feature
from functions import print_sample
from functions import convert_date
from functions import convert_type
from functions import print_feature
from functions import remove_missing
from functions import delete_feature
from functions import convert_one_hot
from functions import convert_weather
from functions import feature_output_corr
from functions import normalization_feature
from sklearn.linear_model import LogisticRegression

# Loading and Preprocessing :

* path : (STRING) path of the file to load.
* limit : (INT) maximum number of examples to load.
* delete_features : (LIST) names of the features to remove.
* cvrt_date : (BOOLEAN) whether to convert the date.
* weather : (LIST) weather conditions to keep; all others are dropped.
* one_hot_features : (LIST) names of the features to convert to one-hot vectors.
* norm_features : (LIST) names of the features to normalize.
* missing_features : (LIST) features whose missing values are to be replaced.
* missing_values : (LIST) values with which to replace the missing values.

### Training file:

In [3]:
# Load the training file through the preprocessing pipeline; only `path` is
# passed, so every other pipeline option keeps its default value.
# The call returns the column names, the feature rows, the targets and the labels.
header, x, y, label = pipeline(path="data/training.csv")

Data loaded (9.9s)
Visility indicator deleted (8.2s)
hmdx deleted (7.3s)
Wind Chill deleted (9.6s)
Date splited in Year/Month/Day/Hour/Weekday (11.9s)
Weekday converted in one-hot vector (21.6s)
Weather converted (36.8s)
Replace missing values (0.3s)
Remove samples with missing values (1.7s)
Data converted to float (27.7s)
Temperature (°C) normalized (28.6s)
Drew point (°C) normalized (29.1s)
Relativite humidity (%) normalized (28.1s)
wind direction (10s deg) normalized (27.3s)
Wind speed (km/h) normalized (26.3s)
Pressure at the station (kPa) normalized (31.3s)
Visibility (km) normalized (29.8s)
Sort data according to station code (1.2s)
split data into x, y, and label (79.6s)


In [4]:
# Group the training samples by station so that one model can be fitted per station.
index = header.index("Station Code")

# Unique station codes, read row by row (no need to transpose the whole dataset
# just to extract a single column).
stations = list({row[index] for row in x})

# Station code -> position in `stations`. Built once so that each sample is
# dispatched with a constant-time lookup instead of rescanning the station list.
station_slot = {code: slot for slot, code in enumerate(stations)}

x_stations = [[] for _ in stations]
y_stations = [[] for _ in stations]
label_stations = [[] for _ in stations]

for _x, _y, _label in zip(x, y, label):
    s = station_slot[_x[index]]
    x_stations[s].append(_x)
    y_stations[s].append(_y)
    label_stations[s].append(_label)

### Test file:

In [4]:
# Load the test file through the same pipeline; with test=True the call
# returns only the column names and the feature rows (no targets or labels).
header_test, x_test = pipeline(path="data/test.csv", test=True)

Data loaded (1.6s)
Visility indicator deleted (3.1s)
hmdx deleted (2.6s)
Wind Chill deleted (2.5s)
Date splited in Year/Month/Day/Hour/Weekday (4.6s)
Weekday converted in one-hot vector (4.3s)
Weather converted (7.8s)
Replace missing values (0.1s)
Remove samples with missing values (0.4s)
Data converted to float (6.6s)
Temperature (°C) normalized (6.7s)
Drew point (°C) normalized (8.6s)
Relativite humidity (%) normalized (6.5s)
wind direction (10s deg) normalized (6.6s)
Wind speed (km/h) normalized (6.4s)
Pressure at the station (kPa) normalized (6.8s)
Visibility (km) normalized (6.7s)
Sort data according to station code (0.3s)


In [6]:
# Group the test samples by station, using the same station order as `stations`
# so that position i matches the training data of the same station.
index = header_test.index("Station Code")

# Station code -> position in `stations`, built once for constant-time lookups.
test_station_slot = {code: slot for slot, code in enumerate(stations)}
x_test_stations = [[] for _ in stations]

for _x in x_test:
    s = test_station_slot.get(_x[index])
    if s is None:
        # No training data (hence no model) exists for this station: fail with
        # an explicit message instead of an opaque "x is not in list".
        raise ValueError(
            "Station code {!r} appears in the test set but not in the training set".format(_x[index])
        )
    x_test_stations[s].append(_x)

# Logistic model per station

In [7]:
# Column positions (in the test header) of the fields used to build a submission id.
# The year is not looked up: it is hard-coded to 2016 in the id format below.
mi = header_test.index("Month")
di = header_test.index("Day")
hi = header_test.index("Hour")
si = header_test.index("Station Code")

# newline="" is what the csv module expects for files handed to csv.writer:
# the writer then fully controls line endings (here '\n' on every platform).
with open("data/results.csv", "w", newline="") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(["id","volume"])

    # The three per-station lists share the same ordering, so they are walked together.
    for station_x, station_labels, station_x_test in zip(x_stations, label_stations, x_test_stations):
        # if empty (no recording for that station in test set)
        if not station_x_test:
            continue

        # One classifier per station; class 1 is weighted six times more than class 0.
        model = LogisticRegression(max_iter=9999, class_weight={0: 1, 1: 6}, solver="lbfgs")
        model.fit(station_x, station_labels)
        pred = model.predict(station_x_test)

        # Dedicated loop names: the previous inner loop re-bound the outer index `i`.
        for e, p in zip(station_x_test, pred):
            d = "2016-{:02d}-{:02d}_{:02d}:00_{:4d}".format(int(e[mi]),int(e[di]),int(e[hi]),int(e[si]))
            writer.writerow([d, str(bool(p))])

# To continue

### Deleting Outliers

In [None]:
# Turn the training header, features, targets and labels into NumPy arrays
# so that the following cells can use array indexing on them.
header, x, y, label = (np.asarray(obj) for obj in (header, x, y, label))

In [None]:
# 99th percentile of the target values.
q99 = np.quantile(y, 0.99)
print("Quantile at 0.99:", q99)

# The ten largest target values, in decreasing order.
top_ten = sorted(y, reverse = True)[:10]
print("10 Highest values:", top_ten)

In [None]:
# Positions of the samples whose target is strictly below 50; everything else
# is treated as an outlier and dropped from the features, labels and targets.
non_out_ind = np.nonzero(y < 50)
x = x[non_out_ind]
label = label[non_out_ind]
y = y[non_out_ind]

#### Distribution of the withdrawals:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the distribution of y on an explicitly created 6x4 inch figure.
fig, ax = plt.subplots(figsize=(6, 4), dpi=75)
sns.distplot(y, ax=ax)
plt.show()

To reduce the tail of the distribution, we apply the square root function to $y$:

In [None]:
# NOTE: y is overwritten in place, so running this cell twice applies the
# square root twice.
y = np.sqrt(y)

# First 80% of the rows for training, last 20% for validation.
# The boundary is named `split_index` (not `split`) so that it does not
# overwrite the `split` function imported from `functions`.
# NOTE(review): the pipeline log reports that the rows are sorted by station code,
# so this positional split presumably validates on stations that are not in
# the training part - confirm that this is intended.
split_index = int(x.shape[0] * 0.8)
x_train, x_valid = x[:split_index], x[split_index:]
y_train, y_valid = y[:split_index], y[split_index:]
label_train, label_valid = label[:split_index], label[split_index:]

### Forward Feature Selection 

In [None]:
import itertools
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

def fit_linear_reg(X,Y):
    """Fit a linear regression (with intercept) of Y on X.

    Returns a tuple (RSS, R_squared), both evaluated on the data used for fitting:
    the residual sum of squares and the value of the model's score() method.
    """
    regressor = LinearRegression(fit_intercept = True)
    regressor.fit(X, Y)
    fitted_values = regressor.predict(X)
    # Mean squared error times the number of samples gives the residual sum of squares.
    residual_sum_of_squares = len(Y) * mean_squared_error(Y, fitted_values)
    return residual_sum_of_squares, regressor.score(X, Y)


#Initialization variables
Y = y_train
X = x_train
k = 25
# k = len(header)

remaining_features = list(range(len(header)))
# Never run more selection rounds than there are candidate features: an extra
# round would have nothing left to pick.
k = min(k, len(remaining_features))

features = []
RSS_list, R_squared_list = [np.inf], [np.inf] #Due to 1 indexing of the loop...
features_list = dict()

# Greedy forward selection: each round adds the single feature that gives the
# lowest training RSS when combined with the features already selected.
for i in range(1,k+1):
    best_RSS = np.inf
    # Reset every round so that a pick from a previous round can never be re-used.
    best_R_squared = None
    best_feature = None

    for candidate in remaining_features:
        # The candidate column comes first, followed by the already selected ones.
        candidate_RSS, candidate_R_squared = fit_linear_reg(X[:, [candidate] + features], Y)

        if candidate_RSS < best_RSS:
            best_RSS = candidate_RSS
            best_R_squared = candidate_R_squared
            best_feature = candidate

    if best_feature is None:
        # No candidate produced an RSS below infinity (e.g. NaN results): stop
        # here instead of recording a meaningless round.
        break

    #Updating variables for next loop
    features.append(best_feature)
    remaining_features.remove(best_feature)

    #Saving values for plotting
    RSS_list.append(best_RSS)
    R_squared_list.append(best_R_squared)
    features_list[i] = features.copy()

In [None]:
# Keep, in every stored subset, only the features that also exist in the test header.
# Iterating over the dict keys covers all subsets; range(1, len(features_list))
# stopped one short and left the last subset unfiltered.
for i in features_list:
    features_list[i] = [f for f in features_list[i] if header[f] in header_test]

# NOTE: RSS_list and R_squared_list are not recomputed here, so the figures
# printed below describe the subsets as they were before the filtering above.
print('Forward stepwise subset selection')
# Show at most the first 19 subsets, without running past the last available one.
for i in range(1, min(20, len(features_list) + 1)):
    print('Number of features :', len(features_list[i]))
    print('Features :', features_list[i])
    print('RSS :', round(RSS_list[i]))
    print('R Squared :', R_squared_list[i])
    print("")

Features that we choose to keep :

In [None]:
# Use the feature subset stored under key 10 of the forward selection results.
features_keep = features_list[10]

# Restrict the training and validation matrices to the selected columns.
x_train, x_valid = (part[:, features_keep] for part in (x_train, x_valid))

### Linear regression on Withdrawals:

In [None]:
# Fit a linear regression on the training split (fit() returns the fitted model).
model_linear = LinearRegression().fit(x_train, y_train)

# Predictions on the validation split, shared by the two metrics computed below.
y_pred = model_linear.predict(x_valid)

# Value of the model's own score() method on the validation split
# (the coefficient of determination, for a scikit-learn regressor).
valid_score = model_linear.score(x_valid, y_valid)
print('Score: ', valid_score)

# Mean squared error between the validation targets and the predictions.
valid_mse = mean_squared_error(y_valid, y_pred)
print("Mean squared error: %.2f" % valid_mse)

# r2_score of the same predictions: 1 is a perfect prediction.
valid_r2 = r2_score(y_valid, y_pred)
print('Variance score: %.2f' % valid_r2)

### Logistic regression between Withdrawals and Volume:

In [None]:
from sklearn.linear_model import LogisticRegression

# Classifier that maps a single target value to the label; class 1 is weighted
# six times more than class 0. fit() returns the fitted model.
model_logit = LogisticRegression(max_iter=9999, class_weight={0: 1, 1: 6}).fit(
    y_train.reshape(-1,1), label_train
)

# Score of the classifier when it is fed the true validation targets ...
true_inputs = y_valid.reshape(-1,1)
print(model_logit.score(true_inputs, label_valid))

# ... and when it is fed the targets predicted by the linear model.
predicted_inputs = model_linear.predict(x_valid).reshape(-1,1)
print(model_logit.score(predicted_inputs, label_valid))

## Creation of the result file:

In [None]:
# Positions, in the test header, of the features kept for training.
# The lookup goes through a plain list so that it works whether header_test is a
# list or an array, and a missing name raises an explicit ValueError.
test_columns = list(header_test)
features_keep_test = [test_columns.index(name)
                      for name in header[features_keep]]

# x_test is converted here because column indexing with [:, ...] needs a NumPy
# array (np.asarray is a no-op when it already is one).
x_test_f = np.asarray(x_test)[:, features_keep_test]


In [None]:
# Two-stage prediction on the test set: the linear model estimates the target,
# then the logistic model turns that estimate (as a single-column matrix) into a label.
target_pred_test = model_linear.predict(x_test_f)
label_pred = model_logit.predict(target_pred_test.reshape(-1,1))

In [None]:
# Integer column positions of the id fields in the test header.
# Comparing the header with `==` inside np.where only works on an array (on a
# list it yields no index at all), and even then it returns index arrays rather
# than plain integers; list.index gives a usable integer in both cases.
# The year is not looked up: it is hard-coded to 2016 in the id format below.
test_columns = list(header_test)
mi = test_columns.index("Month")
di = test_columns.index("Day")
hi = test_columns.index("Hour")
si = test_columns.index("Station Code")

results = [["id","volume"]]

# One output row per test sample, pairing each sample with its predicted label.
for element, predicted in zip(x_test, label_pred):
    string_element = "2016-{:02d}-{:02d}_{:02d}:00_{:4d}".format(int(element[mi]),
                                                                 int(element[di]),
                                                                 int(element[hi]),
                                                                 int(element[si]))
    results.append([string_element, str(bool(predicted))])

In [None]:
# Sum of the predicted labels on the validation features, then on the test features.
for feature_matrix in (x_valid, x_test_f):
    predicted_labels = model_logit.predict(model_linear.predict(feature_matrix).reshape(-1,1))
    print(sum(predicted_labels))

In [None]:
import csv 

# Write the header and all prediction rows to the results file.
# newline="" is what the csv module expects for files handed to csv.writer:
# the writer then fully controls line endings (here '\n' on every platform).
with open("data/results.csv", "w", newline="") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(results)