In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import sys
import matplotlib.pyplot as plt
from implementations import *
%load_ext autoreload
%autoreload 2

# Locations of the two input datasets and of the generated submission file
DATA_TRAIN_PATH = "../data/train.csv"
DATA_TEST_PATH = "../data/test.csv"
OUTPUT_PATH = "predictions_out.csv"

# Debugging aid: arrays with more than 50 elements are summarised when printed
np.set_printoptions(threshold=50)

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# NOTE(review): star import placed outside the top import cell; the names used
# below (load_csv_data here, predict_labels / create_csv_submission later) are
# presumably provided by this module — confirm and consider importing them
# explicitly alongside the other imports.
from proj1_helpers import *
# Read the training CSV. Per the section heading, the three returned values are
# the class labels (y), the feature matrix (tX) and the event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Clean the data

In [3]:
# Inspect unusable data: for every feature column, count how many entries are
# below -998 (presumably the dataset's "missing value" sentinel — confirm).
sentinel_columns = np.nonzero(tX < -998)[1]  # column index of each flagged entry
unique, counts = np.unique(sentinel_columns, return_counts=True)
print(dict(zip(unique, counts)))
# Column removal is currently disabled; the affected features are kept as-is.
#tX = np.delete(tX, (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28), 1)

{0: 38114, 4: 177457, 5: 177457, 6: 177457, 12: 177457, 23: 99913, 24: 99913, 25: 99913, 26: 177457, 27: 177457, 28: 177457}


In [4]:
# Rescale the features with standardize() and keep the statistics it returns so
# the same transform can be applied to other data later.
# NOTE(review): the previous comment claimed the result lies "between -1 and 1";
# the returned names (mean_x, std_x) suggest mean/std scaling rather than
# min-max scaling — confirm against implementations.standardize.
# NOTE(review): the column removal in the cleaning cell is commented out, so the
# entries below -998 counted there are still part of tX when it is rescaled.
# Caution: tX is rebound in place, so re-running this cell rescales already
# rescaled data and overwrites mean_x / std_x.
tX, mean_x, std_x = standardize(tX)

## Split the data into training, validation, and test sets

In [4]:
# Fix the global NumPy seed so the shuffle below is reproducible
np.random.seed(330)

# Fractions of the annotated samples assigned to training and to validation;
# everything left after those two slices forms the held-out test set.
training_perc, validation_perc = 0.4, 0.1

# Row positions of all samples, shuffled in place
indices = np.arange(len(y))
np.random.shuffle(indices)

# Cut points inside the shuffled index vector: end of the training slice and
# end of the validation slice (fractions are truncated to whole sample counts)
splits = np.cumsum((np.array([training_perc, validation_perc]) * len(y)).astype(int))
training_indices, validation_indices, test_indices = np.split(indices, splits)

# Index features, labels and ids with the same vector so rows stay aligned
tX_train, y_train, ids_train = (
    tX[training_indices], y[training_indices], ids[training_indices]
)
tX_validation, y_validation, ids_validation = (
    tX[validation_indices], y[validation_indices], ids[validation_indices]
)
tX_test, y_test, ids_test = (
    tX[test_indices], y[test_indices], ids[test_indices]
)

## Regression 

In [18]:


# Gradient-descent settings: step size and number of iterations
gamma = 0.1
max_iters = 500

# Start from the all-zero weight vector, one weight per feature column
initial_w = np.zeros(tX.shape[1])

# Fit the model; the call's result is unpacked as (loss, weights).
# NOTE(review): the fit uses the full y / tX rather than y_train / tX_train
# from the split above, so the validation and test slices are not held out
# from training — confirm this is intended.
loss, weights = least_squares_GD(y, tX, initial_w, max_iters, gamma)

print("Loss: ", loss)

Gradient Descent(0/499): loss=1.0, w0=0.02270010324104557, w1=-0.033357651291556725
Gradient Descent(1/499): loss=0.9377066833805242, w0=0.03630256598230802, w1=-0.05605393886293694
Gradient Descent(2/499): loss=0.9265100498717618, w0=0.047848655792248014, w1=-0.07555774961931183
Gradient Descent(3/499): loss=0.908813501283479, w0=0.05660252841214014, w1=-0.09136396872291716
Gradient Descent(4/499): loss=0.8966102970700459, w0=0.06340858402736381, w1=-0.10451256642292625
Gradient Descent(5/499): loss=0.8863206563219602, w0=0.06860961069700092, w1=-0.11548963962597862
Gradient Descent(6/499): loss=0.8780775459913392, w0=0.072549760234742, w1=-0.12474907179768632
Gradient Descent(7/499): loss=0.8713190231616081, w0=0.07548467647511706, w1=-0.13262961948643687
Gradient Descent(8/499): loss=0.8657497808325769, w0=0.07762075015108406, w1=-0.13940104505374565
Gradient Descent(9/499): loss=0.8611130884137834, w0=0.07912098088162242, w1=-0.145274799551429
Gradient Descent(10/499): loss=0.85721

Gradient Descent(88/499): loss=0.8080182510853164, w0=0.044057649337739556, w1=-0.2326246399604814
Gradient Descent(89/499): loss=0.8078331390665411, w0=0.04383722168510552, w1=-0.23291625917384914
Gradient Descent(90/499): loss=0.8076514033710676, w0=0.04362073662754168, w1=-0.23320222083705275
Gradient Descent(91/499): loss=0.8074728109130509, w0=0.043408097778381276, w1=-0.23348266722537872
Gradient Descent(92/499): loss=0.8072973136667079, w0=0.043199211950526735, w1=-0.23375773641590114
Gradient Descent(93/499): loss=0.8071248163403261, w0=0.04299398903259496, w1=-0.23402756243177728
Gradient Descent(94/499): loss=0.8069553608888469, w0=0.04279234186968466, w1=-0.23429227538012548
Gradient Descent(95/499): loss=0.806788798528474, w0=0.04259418614866084, w1=-0.2345520015838944
Gradient Descent(96/499): loss=0.806625119515932, w0=0.042399440287843614, w1=-0.2348068637080976
Gradient Descent(97/499): loss=0.8064641857797727, w0=0.042208025330982536, w1=-0.23505698088075694
Gradient D

Gradient Descent(176/499): loss=0.7992568670212331, w0=0.03357742772112768, w1=-0.24604109053959572
Gradient Descent(177/499): loss=0.799209394650747, w0=0.033519236425291435, w1=-0.2461162270598994
Gradient Descent(178/499): loss=0.7991626138647503, w0=0.03346183582152877, w1=-0.24619045577460336
Gradient Descent(179/499): loss=0.7991164986132842, w0=0.033405214359161564, w1=-0.24626379124481015
Gradient Descent(180/499): loss=0.7990710060224409, w0=0.033349360680752414, w1=-0.24633624771705348
Gradient Descent(181/499): loss=0.7990261636406033, w0=0.03329426361795774, w1=-0.24640783913155084
Gradient Descent(182/499): loss=0.7989819593451554, w0=0.03323991218750365, w1=-0.24647857913021928
Gradient Descent(183/499): loss=0.7989383875362047, w0=0.03318629558727997, w1=-0.24654848106446142
Gradient Descent(184/499): loss=0.798895435600153, w0=0.033133403192547976, w1=-0.24661755800272833
Gradient Descent(185/499): loss=0.7988530756149599, w0=0.03308122455225748, w1=-0.24668582273786605

Gradient Descent(261/499): loss=0.7968325847120393, w0=0.03060592862396739, w1=-0.25018189761987175
Gradient Descent(262/499): loss=0.7968169966115398, w0=0.03058706899808208, w1=-0.2502120016752684
Gradient Descent(263/499): loss=0.7968015936752618, w0=0.030568451466232494, w1=-0.25024181066648593
Gradient Descent(264/499): loss=0.7967863841181035, w0=0.030550072801287353, w1=-0.25027132785717515
Gradient Descent(265/499): loss=0.796771370559227, w0=0.030531929821124855, w1=-0.2503005564661763
Gradient Descent(266/499): loss=0.7967565430692988, w0=0.030514019387971315, w1=-0.25032949966834683
Gradient Descent(267/499): loss=0.7967418870617392, w0=0.030496338407750526, w1=-0.2503581605953694
Gradient Descent(268/499): loss=0.7967273980534373, w0=0.03047888382944352, w1=-0.2503865423365412
Gradient Descent(269/499): loss=0.7967130750047582, w0=0.030461652644458544, w1=-0.2504146479395439
Gradient Descent(270/499): loss=0.7966989206997447, w0=0.030444641886011103, w1=-0.2504424804111963


Gradient Descent(348/499): loss=0.7959788554829714, w0=0.02960891329936151, w1=-0.25196637295837776
Gradient Descent(349/499): loss=0.7959731112452435, w0=0.029602654160385797, w1=-0.251979583128104
Gradient Descent(350/499): loss=0.7959674325778264, w0=0.029596472636909253, w1=-0.2519926725293285
Gradient Descent(351/499): loss=0.7959618131719982, w0=0.029590367747091666, w1=-0.2520056423304756
Gradient Descent(352/499): loss=0.7959562558833663, w0=0.02958433852209739, w1=-0.25201849368779794
Gradient Descent(353/499): loss=0.7959507617232308, w0=0.029578384005915608, w1=-0.2520312277455231
Gradient Descent(354/499): loss=0.7959453238937738, w0=0.029572503255183114, w1=-0.25204384563599863
Gradient Descent(355/499): loss=0.7959399447829423, w0=0.029566695339009665, w1=-0.25205634847983416
Gradient Descent(356/499): loss=0.7959346240137697, w0=0.0295609593388058, w1=-0.2520687373860418
Gradient Descent(357/499): loss=0.7959293593245985, w0=0.02955529434811313, w1=-0.252081013452174
Gra

Gradient Descent(437/499): loss=0.7956443035298031, w0=0.02926984860439451, w1=-0.252777149865331
Gradient Descent(438/499): loss=0.7956419928816449, w0=0.029267778329611742, w1=-0.25278308990888604
Gradient Descent(439/499): loss=0.7956397042092147, w0=0.02926573378034675, w1=-0.2527889780598121
Gradient Descent(440/499): loss=0.7956374370483424, w0=0.02926371464707649, w1=-0.2527948147992613
Gradient Descent(441/499): loss=0.795635191176541, w0=0.02926172062414922, w1=-0.2528006006037656
Gradient Descent(442/499): loss=0.7956329670256972, w0=0.029259751409733477, w1=-0.2528063359452832
Gradient Descent(443/499): loss=0.7956307640305436, w0=0.029257806705767723, w1=-0.25281202129124447
Gradient Descent(444/499): loss=0.7956285820826394, w0=0.029255886217910705, w1=-0.2528176571045972
Gradient Descent(445/499): loss=0.7956264223532198, w0=0.029253989655492505, w1=-0.2528232438438515
Gradient Descent(446/499): loss=0.7956242836670212, w0=0.02925211673146623, w1=-0.2528287819631242
Gradi

## Generate predictions and save output in CSV format for submission:

In [20]:
# Load the submission dataset from DATA_TEST_PATH.
# Caution: this rebinds `y` (until now the training labels) and replaces the
# `tX_test` / `ids_test` held-out slices created by the split cell. The names
# are kept because the following cells read them.
y, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Must mirror the training-side column removal if that is ever re-enabled.
#tX_test = np.delete(tX_test, (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28), 1)

# `weights` were fit on features that went through standardize(), so the test
# features need the same transform using the *training* statistics; previously
# raw features were passed on to prediction.
# Assumes standardize() subtracts mean_x and then divides by std_x — TODO
# confirm against implementations.standardize. Broadcasting handles both
# scalar and per-column statistics.
tX_test = (tX_test - mean_x) / std_x

# Shape sanity check: the feature count must match the weight vector length
print(y.shape)
print(tX_test.shape)
print(weights.shape)

(568238,)
(568238, 30)
(30,)


In [21]:
# Compute label predictions for the loaded test features from the fitted weights.
# NOTE(review): `weights` were fit on the output of standardize(); confirm that
# tX_test has been put through the same transform before this call.
y_pred = predict_labels(weights, tX_test)
# Write the ids and their predictions to OUTPUT_PATH for submission.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [32]:
# Sanity check: the predictions and the labels loaded with the test set should
# have the same shape (one entry per test row).
for checked in (y_pred, y):
    print(checked.shape)

(568238,)
(568238,)
