In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os
import datetime
import seaborn as sns
from functions import *
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# GitHub rejects files larger than 100 MB and test.csv is 104 MB, so the
# repository ships a zipped copy that must be unpacked before it can be read.
with zipfile.ZipFile("../data/test.csv.zip", "r") as test_archive:
    test_archive.extractall(path="../data/")

In [3]:
from proj1_helpers import *
# Path of the training CSV, relative to this notebook.
DATA_TRAIN_PATH = '../data/train.csv' # TODO: download train data and supply path here 
# load_csv_data returns three values, kept here as y, tx and ids
# (presumably class labels, feature matrix and event ids, as the section heading says — confirm in proj1_helpers).
y, tx, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
# Sanity check on the loaded data: print the shapes of y, tx and ids, one per line.
print(y.shape, tx.shape, ids.shape, sep="\n")

In [None]:
# Disabled exploratory plot: scatter of the first feature column against y.
# NOTE(review): the snippet refers to `tX`, while the training matrix loaded in
# this notebook is named `tx` — rename before re-enabling.
# fig = plt.figure()

# ax2 = fig.add_subplot(1, 1, 1)
# ax2.scatter(tX[:,0].T, y, marker=".", color='b', s=5)
# ax2.set_xlabel("x")
# ax2.set_ylabel("y")
# ax2.grid()
# fig

# Do your crazy machine learning thing here :) ...

## Grading Criteria:
1. Competitive Part **(counts one third)**. The final rank of your team in the (private) leaderboard will be translated linearly to a scale from 4 to 6.
2. Code **(counts one third)**. In Python. No external libraries allowed! For this first project, we want you to implement and use the methods we have seen in class. The code will be graded by two TAs independently, according to the criteria described:
* Rules for the code part:
  * Reproducibility: In your submission, you must provide a script run.py which produces exactly the same .csv predictions which you used in your best submission to the competition on Kaggle.
  * Documentation: Your ML system must be clearly described in your PDF report and also well-documented in the code itself. A clear ReadMe file must be provided. The documentation must also include all data preparation, feature generation as well as cross-validation steps that you have used.
  * In addition to your customized system, don’t forget that your code submission must still also include the 6 basic method implementations as described above in step 2.
  * No use of external ML libraries is allowed in Project 1. (It will be allowed in Project 2).
  * No external datasets allowed.
3. Written Report **(counts one third)**. You will write a maximum 2-page PDF report on your findings, using LaTeX. The report will be graded by two TAs independently, and we will provide you feedback. The main criteria will be if you were able to correctly use, implement and describe the 6 baseline methods mentioned in Step 2 above. This counts half for the written report. In addition, we will grade you on the scientific contribution you made additionally, to improve your predictions. For this part, the criteria are
  * scientific novelty
  * creativity
  * reproducibility
  * solid comparison baselines supporting your claims
  * writeup quality
  

As usual, your code and report will be automatically checked for plagiarism.

# Todo's

* verify correctness of implemented methods
* (!) implement local estimation on local validation test set and local **cross validation**!
* fix and check reg_logistic_regression
* Exploratory data analysis with comments
* Dataset cleaning
* Comment code and this notebook
* Improve predictions to be number one on Kaggle!
  * construct better features (optional)
  * implement additional modifications of basic methods implemented (optional)
  * clean and preprocess data
* LaTeX PDF report

In [None]:
# Baseline check: least squares fitted with gradient descent.

# Hyper-parameters: iteration budget and step size.
max_iters = 1000
gamma = 1e-7

# Time the training call.
t_begin = datetime.datetime.now()
grad_loss, gradient_w = least_squares_GD(y, tx, gamma, max_iters)
t_finish = datetime.datetime.now()

# Report the wall-clock duration and the loss returned by least_squares_GD.
elapsed_seconds = (t_finish - t_begin).total_seconds()
print(
    "Gradient Descent: execution time={t:.3f} seconds. MSE Loss={l}".format(
        t=elapsed_seconds, l=grad_loss
    )
)

In [None]:
# Ten log-spaced step sizes, from 1e-1 down to 1e-10.
gammas = np.logspace(-1, -10, num=10)

In [None]:
# Least squares with stochastic gradient descent: sweep over the step size.
# NOTE(review): no random seed is set in this cell; if least_squares_SGD samples
# randomly without seeding itself, the losses will differ between runs — confirm.

# Number of SGD iterations used for every step size.
max_iters = 100
# Candidate step sizes: ten log-spaced values from 1e-1 down to 1e-10.
gammas = np.logspace(-1, -10, 10)
for gamma in gammas:
    # Time a single training run for this step size.
    start_time = datetime.datetime.now()
    stoch_grad_loss, stoch_gradient_w = least_squares_SGD(y, tx, gamma, max_iters)
    end_time = datetime.datetime.now()

    # Print the step size together with its loss so that every output line can
    # be attributed to the gamma that produced it.
    execution_time = (end_time - start_time).total_seconds()
    print("Stochastic Gradient Descent: gamma={g:.1e}, execution time={t:.3f} seconds. MSE Loss={l}".format(
        g=float(gamma), t=execution_time, l=stoch_grad_loss))

In [None]:
# Least Squares - produced our best Kaggle result: 57th position, Mateusz Paluchowski, 0.74463
start_time = datetime.datetime.now()
# The name `leas_squares_gradient_w` (sic) is kept unchanged in case other cells read it.
least_squares_loss, leas_squares_gradient_w = least_squares(y, tx)
end_time = datetime.datetime.now()

# Print result, labelled with the method that was actually run in this cell.
execution_time = (end_time - start_time).total_seconds()
print("Least Squares: execution time={t:.3f} seconds. MSE Loss={l}".format(t=execution_time, l=least_squares_loss))

In [None]:
# One hundred log-spaced regularisation strengths, from 1e-4 down to 1e-20.
lambs = np.logspace(-4, -20, num=100)
lambs

In [None]:
# Ridge regression: sweep over the regularisation strength lambda.

# Candidate penalties: one hundred log-spaced values from 1e-4 down to 1e-20.
lambs = np.logspace(-4, -20, 100)
for lamb in lambs:
    # Time a single fit for this penalty.
    start_time = datetime.datetime.now()
    ridge_regression_loss, ridge_regression_gradient_w = ridge_regression(y, tx, lamb)
    end_time = datetime.datetime.now()

    # Report this run's own loss. Printing least_squares_loss here would show the
    # same number for every lambda and hide any effect of the penalty; it would
    # also make this cell depend on the plain least-squares cell having run.
    execution_time = (end_time - start_time).total_seconds()
    print("Ridge Regression: lambda={lam:.1e}, execution time={t:.3f} seconds. MSE Loss={l}".format(
        lam=float(lamb), t=execution_time, l=ridge_regression_loss))

In [None]:
# Zero weight vector as a single column, with one row per column of tx.
w = np.zeros((tx.shape[1], 1))

# Shapes of tx, of y wrapped in an outer list and transposed, and of w (one per line).
print(tx.shape, np.array([y]).T.shape, w.shape, sep="\n")

In [None]:
# Ten log-spaced step sizes, from 1e-16 down to 1e-20.
gammas = np.logspace(-16, -20, num=10)
gammas

In [7]:
# Three log-spaced step sizes, from 1e-20 down to 1e-23.
gammas = np.logspace(-20, -23, num=3)
gammas

# Recorded results for 1000 iterations (values as printed under the "MSE Loss" label):
#   gamma = 1e-20: -65117.90563844488
#   gamma = 1e-21: 149446.2867804076
#   gamma = 5e-22: 161366.53989681418
#   gamma = 1e-23: 173048.39001428057
# NOTE(review): the stored cell output shows the exponents (-20, -21.5, -23) rather
# than logspace values, so it presumably predates this code — re-run to refresh.
# 

array([-20. , -21.5, -23. ])

In [10]:
# Logistic regression trained with gradient descent.

# Hyper-parameters: iteration budget and the step size chosen from the sweeps.
max_iters = 1000
gamma = 5e-20

# Sweep grid (an earlier grid was np.logspace(-16, -20, 10)).
# NOTE(review): this grid is assigned but not iterated — the loop below runs only
# the single gamma chosen above.
gammas = np.logspace(-21, -23, num=10)
for gamma in np.nditer(np.array([gamma])):

    t_begin = datetime.datetime.now()
    logistic_regression_loss, logistic_regression_w = logistic_regression(
        np.array([y]).T, tx, gamma, max_iters
    )
    t_finish = datetime.datetime.now()

    # Report the duration and the returned loss.
    # NOTE(review): the label says "MSE Loss", but the value is whatever
    # logistic_regression returns — confirm which loss that is.
    elapsed_seconds = (t_finish - t_begin).total_seconds()
    print(
        "Logistic Regression: execution time={t:.3f} seconds. MSE Loss={l}".format(
            t=elapsed_seconds, l=logistic_regression_loss
        )
    )

Logistic Regression: execution time=868.312 seconds. MSE Loss=-1018728.2017495089


In [None]:
# TODO: Regularized Logistic Regression using gradient descent

# Define the parameters of the algorithm: iteration budget, step size and penalty.
max_iters = 1
gamma = 3.41379310345e-14
lambd = 0.1

start_time = datetime.datetime.now()
# Store the result under its own names. Reusing logistic_regression_loss /
# logistic_regression_w here would overwrite the weights of the unregularised
# model, which the submission cell reads through logistic_regression_w.
reg_logistic_regression_loss, reg_logistic_regression_w = reg_logistic_regression(np.array([y]).T, tx, lambd, gamma, max_iters)
end_time = datetime.datetime.now()

# Print result
execution_time = (end_time - start_time).total_seconds()
print("Penalized Logistic Regression: execution time={t:.3f} seconds. MSE Loss={l}".format(t=execution_time, l=reg_logistic_regression_loss))

## Generate predictions and save output in csv format for submission:

In [None]:
# Path of the test CSV, relative to this notebook; the file is extracted from
# ../data/test.csv.zip near the top of the notebook.
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here 
# Same loader as for the training set; its three return values are kept as
# y_test, tX_test and ids_test.
y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Predict labels for the test set using whatever weights are currently stored in
# logistic_regression_w, then write them out as a submission CSV.
weights = logistic_regression_w

# Destination file for the submission.
# NOTE(review): the name says "100iter" while the logistic-regression cell sets
# max_iters = 1000 — confirm the name still matches the run.
OUTPUT_PATH = '../data/logistic_regression_100iter_submission.csv'

y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

http://inclass.kaggle.com/c/epfml-project-1

In [None]:
# Delete the extracted test.csv (the zip archive stays) so that GitHub accepts the push.
# The existence check lets the cell be re-run after the file is already gone.
if os.path.exists('../data/test.csv'):
    os.remove('../data/test.csv')