In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
np.set_printoptions(threshold=300)

In [2]:
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from implement import modeling
from preprocessing import dataprocessing, split_categories
from select_features import best_set_of_features
from other import plot_train_test
import math

## Load and process data

Load data

In [3]:
# Read train.csv and test.csv in full (sub_sample=False). load_csv_data returns
# three values; for the training file all three are kept.
y_train, x_train, ids_train = load_csv_data('train.csv', sub_sample=False)
# For the test file the first returned value is discarded.
_,      x_test,   ids_test  = load_csv_data('test.csv',  sub_sample=False)

Create categories

In [4]:
# split_categories returns eight values, unpacked here as four per-category
# arrays (cat_k_*) followed by four index objects (idx_k_*). The same split is
# applied to the training and the test inputs.
# NOTE(review): the criterion used to form the four categories lives in
# preprocessing.split_categories and is not visible here.
cat_0_tri, cat_1_tri, cat_2_tri, cat_3_tri, idx_0_tr, idx_1_tr, idx_2_tr, idx_3_tr = split_categories(x_train)
cat_0_tei, cat_1_tei, cat_2_tei, cat_3_tei, idx_0_te, idx_1_te, idx_2_te, idx_3_te = split_categories(x_test)

# Select the training labels that belong to each category with the training indices.
y_train_0 = y_train[idx_0_tr]
y_train_1 = y_train[idx_1_tr] 
y_train_2 = y_train[idx_2_tr]
y_train_3 = y_train[idx_3_tr]

Preprocess the data, create features

In [10]:
# Build the processed train/test feature matrices for category 0.
# The same options are used for all four categories and are repeated verbatim in
# the settings line written by keep_features_number; keep them in sync.
trx_0, tex_0 = dataprocessing(
    cat_0_tri,
    cat_0_tei,
    degree=5,
    adddegree=True,
    inv=True,
    frac=False,
    sqroot=True,
    sqrootpos=True,
    cbroot=True,
    comb=True,
    comb3=False,
    trigo=True,
    expo=False,
    hyperb=False,
    combtrigo=False,
)

In [11]:
# Build the processed train/test feature matrices for category 1.
# The same options are used for all four categories and are repeated verbatim in
# the settings line written by keep_features_number; keep them in sync.
trx_1, tex_1 = dataprocessing(
    cat_1_tri,
    cat_1_tei,
    degree=5,
    adddegree=True,
    inv=True,
    frac=False,
    sqroot=True,
    sqrootpos=True,
    cbroot=True,
    comb=True,
    comb3=False,
    trigo=True,
    expo=False,
    hyperb=False,
    combtrigo=False,
)

In [12]:
# Build the processed train/test feature matrices for category 2.
# The same options are used for all four categories and are repeated verbatim in
# the settings line written by keep_features_number; keep them in sync.
trx_2, tex_2 = dataprocessing(
    cat_2_tri,
    cat_2_tei,
    degree=5,
    adddegree=True,
    inv=True,
    frac=False,
    sqroot=True,
    sqrootpos=True,
    cbroot=True,
    comb=True,
    comb3=False,
    trigo=True,
    expo=False,
    hyperb=False,
    combtrigo=False,
)

In [13]:
# Build the processed train/test feature matrices for category 3.
# The same options are used for all four categories and are repeated verbatim in
# the settings line written by keep_features_number; keep them in sync.
trx_3, tex_3 = dataprocessing(
    cat_3_tri,
    cat_3_tei,
    degree=5,
    adddegree=True,
    inv=True,
    frac=False,
    sqroot=True,
    sqrootpos=True,
    cbroot=True,
    comb=True,
    comb3=False,
    trigo=True,
    expo=False,
    hyperb=False,
    combtrigo=False,
)

## Best features selection

In [15]:
# Settings passed positionally, in this order, to best_set_of_features for every
# category in the cells below.
# NOTE(review): meanings are inferred from the names only — presumably the number
# of lambda values tried, the number of folds, the number of cross-validation
# repetitions, and the lower/upper bounds of the lambda search range (possibly
# exponents of 10). Confirm against select_features.best_set_of_features.
num_intervals_lambda = 30
nb_fold = 5
nb_crossvalid = 5
min_range = -5
max_range = 0

In [None]:
# Run feature selection for category 0; the three returned sequences are
# indexed by the cells further down to pick features, lambda and accuracy.
features_0, lambdas_0, accuracies_0 = best_set_of_features(
    trx_0,
    y_train_0,
    num_intervals_lambda,
    nb_fold,
    nb_crossvalid,
    min_range,
    max_range,
)

ok


In [None]:
# Run feature selection for category 1; the three returned sequences are
# indexed by the cells further down to pick features, lambda and accuracy.
features_1, lambdas_1, accuracies_1 = best_set_of_features(
    trx_1,
    y_train_1,
    num_intervals_lambda,
    nb_fold,
    nb_crossvalid,
    min_range,
    max_range,
)

In [None]:
# Run feature selection for category 2; the three returned sequences are
# indexed by the cells further down to pick features, lambda and accuracy.
features_2, lambdas_2, accuracies_2 = best_set_of_features(
    trx_2,
    y_train_2,
    num_intervals_lambda,
    nb_fold,
    nb_crossvalid,
    min_range,
    max_range,
)

In [None]:
# Run feature selection for category 3; the three returned sequences are
# indexed by the cells further down to pick features, lambda and accuracy.
features_3, lambdas_3, accuracies_3 = best_set_of_features(
    trx_3,
    y_train_3,
    num_intervals_lambda,
    nb_fold,
    nb_crossvalid,
    min_range,
    max_range,
)

In [None]:
# Keep, for each category, the third-from-last lambda returned by the selection.
# NOTE(review): why index -3 is the right entry depends on what
# best_set_of_features appends to its result lists — confirm there.
lamb_0 = lambdas_0[-3]
lamb_1 = lambdas_1[-3]
lamb_2 = lambdas_2[-3]
lamb_3 = lambdas_3[-3]

In [None]:
# Keep every returned feature except the last entry of each sequence.
# NOTE(review): the lambda cell indexes with -3 while this drops only one
# trailing entry — confirm both match the layout returned by best_set_of_features.
feat_0 = features_0[:-1]
feat_1 = features_1[:-1]
feat_2 = features_2[:-1]
feat_3 = features_3[:-1]

In [None]:
# Keep all accuracies except the last three entries of each sequence.
# NOTE(review): this is a slice (a sequence of values), whereas lamb_k is the
# single element at index -3 — confirm whether accuracies_k[-3] was intended
# for the value reported alongside lamb_k.
accuracy_0 = accuracies_0[:-3]
accuracy_1 = accuracies_1[:-3]
accuracy_2 = accuracies_2[:-3]
accuracy_3 = accuracies_3[:-3]

In [None]:
# Restrict each category's training matrix to its selected columns.
trx_fast_0 = trx_0[:,feat_0]
trx_fast_1 = trx_1[:,feat_1]
trx_fast_2 = trx_2[:,feat_2]
trx_fast_3 = trx_3[:,feat_3]

# Apply the same column selection to the test matrices so that train and test
# columns stay aligned for the weights fitted later.
tex_fast_0 = tex_0[:,feat_0]
tex_fast_1 = tex_1[:,feat_1]
tex_fast_2 = tex_2[:,feat_2]
tex_fast_3 = tex_3[:,feat_3]

In [None]:
# Show the selected column indices of the four categories, separated by newlines.
print(feat_0,"\n",feat_1,"\n",feat_2,"\n",feat_3)

## Save the selected features for the report

In [None]:
def keep_features_number(feat_0, lamb_0, accuracy_0, name):
    """Write one category's selected features, lambda and accuracy to a text file.

    The file starts with three header lines (the file name, a fixed note and the
    preprocessing settings), followed by one "<feature>," line per selected
    feature, a blank line, the lambda and the accuracy value(s).

    Parameters
    ----------
    feat_0 : iterable
        Selected features; each item is written with "%s" on its own line.
    lamb_0 : float
        Regularisation value, written with "%5.9f".
    accuracy_0 : float or sequence of float
        Accuracy value(s). A scalar gives a single "Accuracy" line; a sequence
        (for example a slice of the accuracies list) gives one line per value
        instead of failing in the "%5.9f" format.
    name : str
        Path of the output file. It is overwritten if it already exists.

    NOTE(review): a later cell of this notebook defines another function with the
    same name and a different meaning for its arguments; running that cell
    replaces this definition in the kernel.
    """
    with open(name, 'w') as f:
        # Each header item ends with a newline so that the file name, the note,
        # the settings and the first feature do not run together on one line.
        f.write("%s\n" % name)
        f.write("For report all simple ones\n")
        # Settings used by the dataprocessing calls of this notebook (kept as text
        # for the report; update it if those calls change).
        f.write("degree = 5, adddegree = True, inv = True, frac = False, sqroot = True, sqrootpos = True, cbroot = True, comb = True, comb3 = False, trigo=True, expo = False, hyperb=False,combtrigo=False\n")
        for item in feat_0:
            f.write("%s,\n" % item)
        f.write("\n")

        f.write(" Lambda = %5.9f \n" % lamb_0)
        # np.ravel turns a scalar into a one-element array and flattens any
        # sequence, so both kinds of input are written the same way.
        for value in np.ravel(accuracy_0):
            f.write(" Accuracy = %5.9f \n" % value)

In [None]:
# Write category 0's selected features, lambda and accuracy to its report file.
keep_features_number(feat_0, lamb_0, accuracy_0, "categorie_0_for_report.txt")

In [None]:
# Write category 1's selected features, lambda and accuracy to its report file.
keep_features_number(feat_1, lamb_1, accuracy_1, "categorie_1_for_report.txt")

In [None]:
# Write category 2's selected features, lambda and accuracy to its report file.
keep_features_number(feat_2, lamb_2, accuracy_2, "categorie_2_for_report.txt")

In [None]:
# Write category 3's selected features, lambda and accuracy to its report file.
keep_features_number(feat_3, lamb_3, accuracy_3, "categorie_3_for_report.txt")

## Fit the weights w on the selected features

In [39]:
# Fit one set of weights per category on its selected training columns, using
# that category's chosen lambda. Only the first returned value (the weights) is
# kept; the second is discarded.
# NOTE(review): ridge_regression is not imported in any cell of this notebook
# (the import cell only takes `modeling` from `implement`), so this cell raises
# NameError after "Restart Kernel -> Run All". Import it from the module that
# defines it in the import cell.
w_best_0, _ = ridge_regression(y_train_0, trx_fast_0, lamb_0)
w_best_1, _ = ridge_regression(y_train_1, trx_fast_1, lamb_1)
w_best_2, _ = ridge_regression(y_train_2, trx_fast_2, lamb_2)
w_best_3, _ = ridge_regression(y_train_3, trx_fast_3, lamb_3)

## Prediction with the best model

Create prediction for each category

In [40]:
# Predict the labels of each category's test rows with that category's weights
# and its column-selected test matrix.
y_0_te = predict_labels(w_best_0,tex_fast_0)
y_1_te = predict_labels(w_best_1,tex_fast_1)
y_2_te = predict_labels(w_best_2,tex_fast_2)
y_3_te = predict_labels(w_best_3,tex_fast_3)

Reassemble the predictions in the original order of the test rows

In [41]:
# order_tab[i] is the index attached to the i-th entry of the concatenated
# per-category predictions (same category order in both concatenations).
# NOTE(review): this assumes idx_k_te are integer row-index arrays into x_test
# (not boolean masks) — confirm in preprocessing.split_categories.
order_tab = np.concatenate((idx_0_te, idx_1_te, idx_2_te, idx_3_te))
# argsort gives the permutation that sorts those indices in ascending order ...
order_idx = np.argsort(order_tab, axis=0)
y_unordered = np.concatenate((y_0_te, y_1_te, y_2_te, y_3_te))
# ... so applying it to the predictions puts them back in ascending index order.
y_pred = y_unordered[order_idx]

Create submission

In [42]:
# Write the test ids and the reordered predictions to submission.csv.
create_csv_submission(ids_test, y_pred, "submission.csv")

In [43]:
def keep_features_number(feat_0, feat_1, feat_2, feat_3, filename='degree_9_features'):
    """Write the selected features of the four categories to one text file.

    For each category k (0 to 3) the file contains a blank line, a " Cat k"
    header line, and then one "<feature>," line per selected feature.

    Parameters
    ----------
    feat_0, feat_1, feat_2, feat_3 : iterable
        Selected features of categories 0 to 3; each item is written with "%s".
    filename : str, optional
        Path of the output file, overwritten if it exists. Defaults to
        'degree_9_features', the path that was previously hard-coded.

    NOTE(review): this definition reuses the name of the report-writing function
    defined earlier in this notebook (arguments: features, lambda, accuracy, file
    name). Once this cell has run, the earlier per-category report cells call
    this version instead. Also, the default file name mentions degree 9 while the
    dataprocessing calls in this notebook use degree = 5 — confirm which is meant.
    """
    with open(filename, 'w') as f:
        for category, features in enumerate((feat_0, feat_1, feat_2, feat_3)):
            # End the header with a newline so that it is not glued to the first
            # feature (e.g. "Cat 0" followed by 12 used to read "Cat 012").
            f.write("\n Cat %d\n" % category)
            for item in features:
                # The comma follows its item on the same line, as in the
                # per-category report files.
                f.write("%s,\n" % item)