In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os
import datetime
import cProfile
from implementations import *
from costs import *
from method_comparison_helpers import *
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
# GitHub rejects files above 100mb and test.csv is 104mb, so the repository
# ships test.csv as a zip archive; unpack it into the data folder before use.
with zipfile.ZipFile("../data/test.csv.zip", mode="r") as archive:
    archive.extractall(path="../data/")

In [2]:
from proj1_helpers import *

# Read the training CSV; load_csv_data returns three arrays that the rest of
# the notebook uses as class labels (y), feature matrix (tx) and event ids.
DATA_TRAIN_PATH = '../data/train.csv'
y, tx, ids = load_csv_data(DATA_TRAIN_PATH)

# Sanity check: print the shape of each loaded array, one per line.
for loaded in (y, tx, ids):
    print(loaded.shape)

(250000,)
(250000, 30)
(250000,)


In [3]:
# Split the raw features/labels into a train part and a test part.
# NOTE(review): the recorded output of this cell shows 50000 rows in the first
# pair and 200000 in the second, so split_ratio appears to be the *training*
# fraction -- confirm against split_data that this is the intended direction.
split_ratio = 0.2
tx_train, tx_test, y_train, y_test = split_data(tx, y, split_ratio)
for features, labels in ((tx_train, y_train), (tx_test, y_test)):
    print(features.shape, labels.shape)

(50000, 30) (50000,)
(200000, 30) (200000,)


In [None]:
# Prepend a constant column of ones to the feature matrix (one row per sample).
ones = np.ones((len(tx), 1))
tx_with_ones = np.concatenate((ones, tx), axis=1)
tx_with_ones.shape

In [None]:
# Same split as for the raw data, applied to the ones-augmented matrix.
(
    tx_with_ones_train,
    tx_with_ones_test,
    y_with_ones_train,
    y_with_ones_test,
) = split_data(tx_with_ones, y, split_ratio)
for features, labels in (
    (tx_with_ones_train, y_with_ones_train),
    (tx_with_ones_test, y_with_ones_test),
):
    print(features.shape, labels.shape)

# Do your crazy machine learning thing here :) ...

## Grading Criteria:
1. Competitive Part **(counts one third)**. The final rank of your team in the (private) leaderboard will be translated linearly to a scale from 4 to 6.
2. Code **(counts one third)**. In Python. No external libraries allowed! For this first project, we want you to implement and use the methods we have seen in class. The code will be graded by two TAs independently, according to the criteria described:
* Rules for the code part:
  * Reproducibility: In your submission, you must provide a script run.py which produces exactly the same .csv predictions which you used in your best submission to the competition on Kaggle.
  * Documentation: Your ML system must be clearly described in your PDF report and also well-documented in the code itself. A clear ReadMe file must be provided. The documentation must also include all data preparation, feature generation as well as cross-validation steps that you have used.
  * In addition to your customized system, don’t forget that your code submission must still also include the 6 basic method implementations as described above in step 2.
  * No use of external ML libraries is allowed in Project 1. (It will be allowed in Project 2).
  * No external datasets allowed.
3. Written Report **(counts one third)**. You will write a maximum 2 page PDF report on your findings, using LaTeX. The code will be graded by two TAs independently, and we will provide you feedback. The main criteria will be if you were able to correctly use, implement and describe the 6 baseline methods mentioned in Step 2 above. This counts half for the written report. In addition, we will grade you on the scientific contribution you made additionally, to improve your predictions. For this part, the criteria are
  * scientific novelty
  * creativity
  * reproducibility
  * solid comparison baselines supporting your claims
  * writeup quality
  

As usual, your code and report will be automatically checked for plagiarism.

# Todo's

* Exploratory data analysis with comments
* Dataset cleaning
* Comment code and this notebook
* Improve predictions to be number one on Kaggle!
  * construct better features (optional)
  * implement additional modifications of basic methods implemented (optional)
  * clean and preprocess data
* LaTeX PDF report

In [None]:
# Baseline check: least squares fitted with gradient descent, swept over a
# grid of step sizes.

# Hyper-parameters: iteration budget and the step sizes (gamma) to try.
max_iters = 100
gammas = np.logspace(-10, -7, 10)
train_losses = []
test_losses = []

for gamma in np.nditer(gammas):
    # Time one complete fit + evaluation for this step size.
    start_time = datetime.datetime.now()
    initial_w = np.zeros(tx_with_ones_train.shape[1])
    gradient_w, train_rmse = least_squares_GD(
        y_with_ones_train, tx_with_ones_train, initial_w, max_iters, gamma
    )

    # Held-out error: the value returned by compute_loss is converted to an
    # RMSE as sqrt(2 * loss).
    test_mse = compute_loss(y_with_ones_test, tx_with_ones_test, gradient_w)
    test_rmse = np.sqrt(2 * test_mse)

    # Collect one point per gamma for the plot below.
    train_losses = np.append(train_losses, train_rmse)
    test_losses = np.append(test_losses, test_rmse)

    # Duration of this iteration; not printed, kept for ad-hoc inspection.
    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()

# Train vs. test RMSE as a function of the step size (log-scaled x axis).
for series, colour, name in ((train_losses, 'b', 'Train'), (test_losses, 'r', 'Test')):
    plt.semilogx(gammas, series, marker=".", color=colour, label=name)
plt.xlabel("gamma")
plt.ylabel("rmse")
plt.grid(True)
plt.legend()

In [None]:
# Stochastic Gradient Descent: fit least squares with SGD for each step size
# and compare train/test RMSE.

# Define the parameters of the algorithm.
max_iters = 10
gammas = np.logspace(-3, -10, 2)  # only the two end points: 1e-3 and 1e-10
train_losses = []
test_losses = []
for gamma in np.nditer(gammas):
    # Start stochastic gradient descent (timed per step size).
    start_time = datetime.datetime.now()
    initial_w = np.zeros(tx_with_ones_train.shape[1])
    stoch_gradient_w, train_rmse = least_squares_SGD(y_with_ones_train, tx_with_ones_train, initial_w, max_iters, gamma)

    # Held-out error: the value returned by compute_loss is converted to an
    # RMSE as sqrt(2 * loss).
    test_mse = compute_loss(y_with_ones_test, tx_with_ones_test, stoch_gradient_w)
    test_rmse = np.sqrt(2*test_mse)

    train_losses = np.append(train_losses, train_rmse)
    test_losses = np.append(test_losses, test_rmse)

    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()
    print("Stochastic Gradient Descent: execution time={t:.3f} seconds. Train RMSE={l}, Test RMSE={tl}".format(t=exection_time, l=train_rmse, tl=test_rmse))

plt.semilogx(gammas, train_losses, marker=".", color='b', label='Train')
plt.semilogx(gammas, test_losses, marker=".", color='r', label='Test')
plt.xlabel("gamma")
plt.ylabel("rmse")
plt.grid(True)
# The curves carry labels, so draw the legend; without it the blue/red lines
# cannot be told apart in the figure.
plt.legend()

In [None]:
# Least Squares (closed form) - produced our best Kaggle result
# (leaderboard entry: 57th position, Mateusz Paluchowski, 0.74463).
start_time = datetime.datetime.now()

# Fit on the ones-augmented training split, then evaluate on the held-out split.
least_squares_w, least_squares_loss = least_squares(y_with_ones_train, tx_with_ones_train)
test_mse = compute_loss(y_with_ones_test, tx_with_ones_test, least_squares_w)
# The value returned by compute_loss is converted to an RMSE as sqrt(2 * loss).
test_rmse = np.sqrt(2*test_mse)

end_time = datetime.datetime.now()
exection_time = (end_time - start_time).total_seconds()
# Message typo fixed: "Lest Squares" -> "Least Squares".
print("Least Squares: execution time={t:.3f} seconds. RMSE Train Loss={l}, Test Loss={tl}".format(t=exection_time, l=least_squares_loss, tl=test_rmse))

In [None]:
# Ridge Regression: sweep the regularisation strength (lambda) and compare
# train/test RMSE.

# Define the parameters of the algorithm.
train_losses = []
test_losses = []
lambs = np.logspace(-5, 8, 100)
start_time = datetime.datetime.now()  # times the whole sweep, not one fit
for lamb in np.nditer(lambs):
    ridge_regression_gradient_w,  ridge_regression_loss = ridge_regression(y_with_ones_train, tx_with_ones_train, lamb)

    train_losses = np.append(train_losses, ridge_regression_loss)

    # Held-out error: the value returned by compute_loss is converted to an
    # RMSE as sqrt(2 * loss).
    test_mse = compute_loss(y_with_ones_test, tx_with_ones_test, ridge_regression_gradient_w)
    test_rmse = np.sqrt(2*test_mse)
    test_losses = np.append(test_losses, test_rmse)

end_time = datetime.datetime.now()
exection_time = (end_time - start_time).total_seconds()

print("Ridge Regression: execution time={t:.3f} seconds.".format(t=exection_time))
plt.semilogx(lambs, train_losses, marker=".", color='b', label='Train')
plt.semilogx(lambs, test_losses, marker=".", color='r', label='Test')
# The x axis is the regularisation strength, not a step size, so label it
# "lambda" (the original label said "gamma").
plt.xlabel("lambda")
plt.ylabel("rmse")
plt.grid(True)
plt.legend()

# Recorded from an earlier run: Train RMSE = 0.824233562872 Test RMSE = 1.39476527452

In [None]:
# Logistic Regression using gradient descent, swept over a grid of step sizes.

# Define the parameters of the algorithm.
max_iters = 1000
train_losses = []
test_losses = []
# One row of fitted weights per gamma is stacked into this matrix.
weights = np.empty((0,tx_with_ones_train.shape[1]), float)
gammas = np.logspace(-18, -23, 10)# np.logspace(-16, -20, 10)
for gamma in np.nditer(gammas):
    start_time = datetime.datetime.now()
    # The logistic helpers are called with column vectors: weights are (D, 1)
    # and the labels are reshaped to (N, 1) via np.array([y]).T.
    initial_w = np.zeros((tx_with_ones_train.shape[1],1))
    logistic_regression_w, logistic_regression_loss = logistic_regression(np.array([y_with_ones_train]).T, tx_with_ones_train, initial_w, max_iters, gamma)
    weights = np.vstack((weights, logistic_regression_w.T))

    train_losses = np.append(train_losses, logistic_regression_loss)

    test_rmse = compute_RMSE(np.array([y_with_ones_test]).T, tx_with_ones_test, logistic_regression_w)
    test_losses = np.append(test_losses, test_rmse)

    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()
    print("Logistic Regression: execution time={t:.3f} seconds. Train RMSE={l}, Test RMSE={tl}".format(t=exection_time, l=logistic_regression_loss, tl=test_rmse))

plt.semilogx(gammas, train_losses, marker=".", color='b', label='Train')
plt.semilogx(gammas, test_losses, marker=".", color='r', label='Test')
plt.xlabel("gamma")
plt.ylabel("rmse")
plt.grid(True)
# The curves carry labels, so draw the legend; without it the blue/red lines
# cannot be told apart in the figure.
plt.legend()

In [None]:
# Regularized (penalized) logistic regression using gradient descent.
# Slow, hence the tiny iteration budget.

# Hyper-parameters: iteration budget, step size and penalty strength.
max_iters = 2
gamma = 3.41379310345e-14
lambd = 0.1

start_time = datetime.datetime.now()
# The logistic helpers are called with column vectors: (D, 1) weights and
# (N, 1) labels.
initial_w = np.zeros((tx_with_ones_train.shape[1], 1))
y_train_column = np.array([y_with_ones_train]).T
y_test_column = np.array([y_with_ones_test]).T
logistic_regression_w, logistic_regression_loss = reg_logistic_regression(
    y_train_column, tx_with_ones_train, lambd, initial_w, max_iters, gamma
)
test_rmse = compute_RMSE(y_test_column, tx_with_ones_test, logistic_regression_w)
end_time = datetime.datetime.now()
exection_time = (end_time - start_time).total_seconds()
print(
    "Penalized Logistic Regression: execution time={t:.3f} seconds. Train RMSE={l}, Test RMSE={tl}".format(
        t=exection_time, l=logistic_regression_loss, tl=test_rmse
    )
)

In [None]:
# Logistic regression fitted with Newton's method.

# Hyper-parameters: iteration budget and step sizes to try.
max_iters = 2

# With num=1, logspace yields a single value (the start point, 1e-18).
gammas = np.logspace(-18, -23, 1)
for gamma in np.nditer(gammas):
    start_time = datetime.datetime.now()

    # Column-vector inputs: (D, 1) weights and (N, 1) labels.
    initial_w = np.zeros((tx_with_ones_train.shape[1], 1))
    y_train_column = np.array([y_with_ones_train]).T
    logistic_regression_newton_w, logistic_regression_newton_loss = learning_by_newton_method(
        y_train_column, tx_with_ones_train, initial_w, max_iters, gamma
    )
    y_test_column = np.array([y_with_ones_test]).T
    test_rmse = compute_RMSE(y_test_column, tx_with_ones_test, logistic_regression_newton_w)

    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()
    print(
        "Logistic Regression Newtons Method: execution time={t:.3f} seconds. Train RMSE={l}, Test RMSE={tl}".format(
            t=exection_time, l=logistic_regression_newton_loss, tl=test_rmse
        )
    )

In [None]:
# Iteratively reweighted least squares (IRLS).
# Slow for a large dataset and can hit matrix singularity problems when many
# iterations are run, so only the first 10000 rows of each split are used.

# Hyper-parameters.
max_iters = 1
start_time = datetime.datetime.now()

# Column-vector inputs: (D, 1) weights and (N, 1) labels.
initial_w = np.zeros((tx_with_ones_train.shape[1], 1))
head = slice(0, 10000)
y_train_column = np.array([y_with_ones_train[head]]).T
IRLS_w, IRLS_loss = learning_by_IRLS(y_train_column, tx_with_ones_train[head], initial_w, max_iters)
y_test_column = np.array([y_with_ones_test[head]]).T
test_rmse = compute_RMSE(y_test_column, tx_with_ones_test[head], IRLS_w)

end_time = datetime.datetime.now()
exection_time = (end_time - start_time).total_seconds()
print(
    "IRLS: execution time={t:.3f} seconds. Train RMSE={l}, Test RMSE={tl}".format(
        t=exection_time, l=IRLS_loss, tl=test_rmse
    )
)

## Exploratory data analysis

In [4]:
# NUMPY ONLY VERSION

# Per-column summary of the raw feature matrix, printed in this order:
# mean, standard deviation, minimum, maximum.
for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx, axis=0))

[ -4.90230794e+01   4.92398193e+01   8.11819816e+01   5.78959617e+01
  -7.08420675e+02  -6.01237051e+02  -7.09356603e+02   2.37309984e+00
   1.89173324e+01   1.58432217e+02   1.43760943e+00  -1.28304708e-01
  -7.08985189e+02   3.87074191e+01  -1.09730480e-02  -8.17107200e-03
   4.66602072e+01  -1.95074680e-02   4.35429640e-02   4.17172345e+01
  -1.01191920e-02   2.09797178e+02   9.79176000e-01  -3.48329567e+02
  -3.99254314e+02  -3.99259788e+02  -6.92381204e+02  -7.09121609e+02
  -7.09118631e+02   7.30645914e+01]
[ 406.34483401   35.34481492   40.82860887   63.65555431  454.47965615
  657.97098617  453.01897051    0.78290955   22.2734492   115.70588372
    0.84474126    1.19358245  453.59581401   22.41203584    1.21407622
    1.81675941   22.06487828    1.26497962    1.81660763   32.8946274
    1.81221908  126.49925272    0.97742435  532.96172343  489.33730734
  489.33290465  479.87453609  453.38371728  453.3881105    98.01546598]
[ -9.99000000e+02   0.00000000e+00   6.32900000e+00   0

In [5]:
# NUMPY ONLY VERSION

# Find the "sparse" columns: those containing the -999 placeholder. As in the
# original loop, a column qualifies when its minimum equals -999.
columns = tx.min(axis=0)
# np.flatnonzero returns the matching column positions as an *integer* array,
# so the result can be used directly for indexing (e.g. tx[:, sparse_columns]).
# The previous ndenumerate/np.append loop produced float indices
# (0., 4., ...), which NumPy rejects as array indices.
sparse_columns = np.flatnonzero(columns == -999)
print('Sparse columns:')
print(sparse_columns)

Sparse columns:
[  0.   4.   5.   6.  12.  23.  24.  25.  26.  27.  28.]


In [6]:
# NUMPY ONLY VERSION

# Build a copy of the features in which every -999 placeholder becomes NaN,
# so the NaN-aware statistics below skip those entries.
tx_nan = np.where(tx == -999, np.nan, tx)
for nan_aware_stat in (np.nanmean, np.nanstd, np.nanmin, np.nanmax):
    print(nan_aware_stat(tx_nan, axis=0))

[  1.21858528e+02   4.92398193e+01   8.11819816e+01   5.78959617e+01
   2.40373503e+00   3.71783360e+02  -8.21688171e-01   2.37309984e+00
   1.89173324e+01   1.58432217e+02   1.43760943e+00  -1.28304708e-01
   4.58289801e-01   3.87074191e+01  -1.09730480e-02  -8.17107200e-03
   4.66602072e+01  -1.95074680e-02   4.35429640e-02   4.17172345e+01
  -1.01191920e-02   2.09797178e+02   9.79176000e-01   8.48221045e+01
  -3.27458741e-03  -1.23928255e-02   5.76794744e+01  -1.18452642e-02
  -1.58228913e-03   7.30645914e+01]
[  57.29802145   35.34481492   40.82860887   63.65555431    1.74221431
  397.69658434    3.58433731    0.78290955   22.2734492   115.70588372
    0.84474126    1.19358245    0.39867861   22.41203584    1.21407622
    1.81675941   22.06487828    1.26497962    1.81660763   32.8946274
    1.81221908  126.49925272    0.97742435   60.66207397    1.78454002
    1.81337943   31.98556122    2.0317286     1.8169372    98.01546598]
[  9.044   0.      6.329   0.      0.     13.602 -18.06

In [7]:
# NUMPY ONLY VERSION

# Impute each NaN with the mean of its column.
# Column means computed while ignoring NaNs.
mean = np.nanmean(tx_nan, axis=0)
# (row, column) positions of every NaN entry.
inds = np.nonzero(np.isnan(tx_nan))
tx_mean_filled = tx_nan.copy()
# inds[1] holds the column of each NaN, so this picks the matching column mean.
tx_mean_filled[inds] = mean[inds[1]]
for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx_mean_filled, axis=0))

[  1.21858528e+02   4.92398193e+01   8.11819816e+01   5.78959617e+01
   2.40373503e+00   3.71783360e+02  -8.21688171e-01   2.37309984e+00
   1.89173324e+01   1.58432217e+02   1.43760943e+00  -1.28304708e-01
   4.58289801e-01   3.87074191e+01  -1.09730480e-02  -8.17107200e-03
   4.66602072e+01  -1.95074680e-02   4.35429640e-02   4.17172345e+01
  -1.01191920e-02   2.09797178e+02   9.79176000e-01   8.48221045e+01
  -3.27458741e-03  -1.23928255e-02   5.76794744e+01  -1.18452642e-02
  -1.58228913e-03   7.30645914e+01]
[  52.74979213   35.34481492   40.82860887   63.65555431    0.9384893
  214.22966692    1.93079704    0.78290955   22.2734492   115.70588372
    0.84474126    1.19358245    0.21475866   22.41203584    1.21407622
    1.81675941   22.06487828    1.26497962    1.81660763   32.8946274
    1.81221908  126.49925272    0.97742435   47.00226517    1.38269956
    1.40504495   17.22985913    1.09444375    0.97874075   98.01546598]
[  9.044   0.      6.329   0.      0.     13.602 -18.066

In [8]:
# NUMPY ONLY VERSION

# Standardise the mean-filled features column by column: subtract the column
# mean and divide by the sample standard deviation (ddof=1).
mean_filled_center = tx_mean_filled.mean(axis=0)
mean_filled_scale = tx_mean_filled.std(axis=0, ddof=1)
tx_mean_filled_normalized = (tx_mean_filled - mean_filled_center) / mean_filled_scale
# np.std below uses ddof=0, which is why the printed std is 0.999998, not 1.
for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx_mean_filled_normalized, axis=0))

[ -9.73054681e-13   4.42750414e-15  -3.50538043e-15   7.10211001e-15
  -7.23440168e-12  -6.30188342e-12   6.80633033e-13   2.44147274e-14
   6.40282893e-15   2.86169444e-15  -6.95043934e-15   5.43928191e-15
   5.55859610e-13  -5.97410332e-15   1.30739863e-16   6.37561115e-17
   2.58283965e-14  -1.17813315e-16  -1.43046242e-16   8.24421384e-15
   1.23416388e-16  -8.96501851e-15  -1.91224281e-15   2.89132939e-12
  -2.77659665e-15   2.53467926e-14  -8.40723363e-12   2.09942845e-14
  -5.89750587e-15  -3.38248540e-16]
[ 0.999998  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998
  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998
  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998
  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998
  0.999998  0.999998]
[-2.1386682  -1.39312431 -1.8333427  -0.90951758 -2.56127609 -1.67194698
 -8.93117038 -2.76544782 -0.84932039 -0.97080623 -1.64619242 -1.07717127
 -2.13397163 -0.83470247 -2.0

In [9]:
# PURE NUMPY VERSION

# Alternative imputation: replace every NaN with 0.
where_are_NaNs = np.isnan(tx_nan)
tx_zero_filled = np.where(where_are_NaNs, 0.0, tx_nan)
for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx_zero_filled, axis=0))

[  1.03280465e+02   4.92398193e+01   8.11819816e+01   5.78959617e+01
   6.97496600e-01   1.07881121e+02  -2.38430900e-01   2.37309984e+00
   1.89173324e+01   1.58432217e+02   1.43760943e+00  -1.28304708e-01
   1.32982868e-01   3.87074191e+01  -1.09730480e-02  -8.17107200e-03
   4.66602072e+01  -1.95074680e-02   4.35429640e-02   4.17172345e+01
  -1.01191920e-02   2.09797178e+02   9.79176000e-01   5.09227808e+01
  -1.96589200e-03  -7.44000800e-03   1.67369685e+01  -3.43716400e-03
  -4.59136000e-04   7.30645914e+01]
[  68.56596553   35.34481492   40.82860887   63.65555431    1.43904749
  272.69840421    1.96648005    0.78290955   22.2734492   115.70588372
    0.84474126    1.19358245    0.29896744   22.41203584    1.21407622
    1.81675941   22.06487828    1.26497962    1.81660763   32.8946274
    1.81221908  126.49925272    0.97742435   62.73325086    1.38270049
    1.40505806   31.33881741    1.09445695    0.97874101   98.01546598]
[  0.      0.      6.329   0.      0.      0.    -18.06

In [10]:
# NUMPY ONLY VERSION

# Standardise the zero-filled features column by column: subtract the column
# mean and divide by the sample standard deviation (ddof=1).
zero_filled_center = tx_zero_filled.mean(axis=0)
zero_filled_scale = tx_zero_filled.std(axis=0, ddof=1)
tx_zero_filled_normalized = (tx_zero_filled - zero_filled_center) / zero_filled_scale
# np.std below uses ddof=0, which is why the printed std is 0.999998, not 1.
for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx_zero_filled_normalized, axis=0))

[  2.23364047e-14   4.42750414e-15  -3.50538043e-15   7.10211001e-15
  -2.93632718e-15   4.56355265e-15  -1.32019273e-15   2.44147274e-14
   6.40282893e-15   2.86169444e-15  -6.95043934e-15   5.43928191e-15
  -7.47514495e-16  -5.97410332e-15   1.30739863e-16   6.37561115e-17
   2.58283965e-14  -1.17813315e-16  -1.43046242e-16   8.24421384e-15
   1.23416388e-16  -8.96501851e-15  -1.91224281e-15  -3.99443056e-15
  -1.46157516e-15   1.15313340e-15   8.03767719e-16   2.29318138e-15
   9.35779405e-18  -3.38248540e-16]
[ 0.999998  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998
  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998
  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998
  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998  0.999998
  0.999998  0.999998]
[-1.50629043 -1.39312431 -1.8333427  -0.90951758 -0.48469228 -0.3956052
 -9.06570778 -2.76544782 -0.84932039 -0.97080623 -1.64619242 -1.07717127
 -0.4448063  -0.83470247 -2.04

In [11]:
# PURE NUMPY VERSION
# Apply the same preprocessing steps to the prediction dataset.
# NOTE(review): the fill values and the normalisation statistics below are
# computed from the prediction set itself rather than reused from the
# training set -- confirm this is intended before relying on the models'
# predictions being comparable.
DATA_PRED_PATH = '../data/test.csv'
_, tx_pred, ids_pred = load_csv_data(DATA_PRED_PATH)

# Turn the -999 placeholders into NaN.
tx_pred_nan = np.where(tx_pred == -999, np.nan, tx_pred)

# Variant 1: fill each NaN with the NaN-ignoring mean of its column.
mean = np.nanmean(tx_pred_nan, axis=0)
# (row, column) positions of every NaN entry; inds[1] is the column of each.
inds = np.nonzero(np.isnan(tx_pred_nan))
tx_pred_mean_filled = tx_pred_nan.copy()
tx_pred_mean_filled[inds] = mean[inds[1]]

# Standardise the mean-filled matrix (column mean, sample std with ddof=1).
pred_mean_filled_center = tx_pred_mean_filled.mean(axis=0)
pred_mean_filled_scale = tx_pred_mean_filled.std(axis=0, ddof=1)
tx_pred_mean_filled_normalized = (tx_pred_mean_filled - pred_mean_filled_center) / pred_mean_filled_scale

# Variant 2: fill each NaN with 0.
where_are_NaNs = np.isnan(tx_pred_nan)
tx_pred_zero_filled = np.where(where_are_NaNs, 0.0, tx_pred_nan)

# Standardise the zero-filled matrix the same way.
pred_zero_filled_center = tx_pred_zero_filled.mean(axis=0)
pred_zero_filled_scale = tx_pred_zero_filled.std(axis=0, ddof=1)
tx_pred_zero_filled_normalized = (tx_pred_zero_filled - pred_zero_filled_center) / pred_zero_filled_scale

In [None]:
# Optional cache of the preprocessed matrices as CSV files. Everything in this
# cell is commented out: uncomment the first group to save the matrices, or
# the second group to load them back instead of recomputing them.
# np.savetxt("../data/tx_zero_filled_normalized.csv", tx_zero_filled_normalized, delimiter=",")
# np.savetxt("../data/tx_pred_zero_filled_normalized.csv", tx_pred_zero_filled_normalized, delimiter=",")
# np.savetxt("../data/tx_mean_filled_normalized.csv", tx_mean_filled_normalized, delimiter=",")
# np.savetxt("../data/tx_pred_mean_filled_normalized.csv", tx_pred_mean_filled_normalized, delimiter=",")


# tx_zero_filled_normalized = np.loadtxt("../data/tx_zero_filled_normalized.csv", delimiter=",")
# tx_pred_zero_filled_normalized = np.loadtxt("../data/tx_pred_zero_filled_normalized.csv", delimiter=",")
# tx_mean_filled_normalized = np.loadtxt("../data/tx_mean_filled_normalized.csv", delimiter=",")
# tx_pred_mean_filled_normalized = np.loadtxt("../data/tx_pred_mean_filled_normalized.csv", delimiter=",")

In [12]:
# Split both preprocessed feature matrices (with the labels) into train/test
# parts, using the same ratio as for the raw data.
split_ratio = 0.2
(
    tx_zero_filled_normalized_train,
    tx_zero_filled_normalized_test,
    y_zero_filled_normalized_train,
    y_zero_filled_normalized_test,
) = split_data(tx_zero_filled_normalized, y, split_ratio)
(
    tx_mean_filled_normalized_train,
    tx_mean_filled_normalized_test,
    y_mean_filled_normalized_train,
    y_mean_filled_normalized_test,
) = split_data(tx_mean_filled_normalized, y, split_ratio)

In [13]:
# Parallel lists describing the three dataset variants under comparison;
# position k in every list refers to the same variant.
datasets_names = ['Original/Raw', 'Zero filled', 'Mean filled']
train_datasets = [
    tx_train,
    tx_zero_filled_normalized_train,
    tx_mean_filled_normalized_train,
]
test_datasets = [
    tx_test,
    tx_zero_filled_normalized_test,
    tx_mean_filled_normalized_test,
]
pred_datasets = [
    tx_pred,
    tx_pred_zero_filled_normalized,
    tx_pred_mean_filled_normalized,
]

In [None]:
# Run the logistic-regression gamma sweep once per dataset variant.
max_iters = 100
gammas = np.logspace(-14, -18, 10)
for i, (train_set, test_set, name) in enumerate(zip(train_datasets, test_datasets, datasets_names)):
    logistic_regression_dataset_gammas_test(
        y_train, y_test, train_set, test_set, max_iters, gammas, name, i
    )

In [None]:
# Try Logistic regression on the mean-filled, normalised dataset.
# The original cell referenced `new_meanfilled_tx_train` / `new_meanfilled_tx_test`,
# which are never defined in this notebook (NameError on Restart & Run All).
# They are mapped here to the mean-filled normalised split and its matching
# labels, which come from the same split_data call.
# NOTE(review): confirm this is the dataset the earlier experiments used.
max_iters = 1000
gammas = np.logspace(-14, -18, 10)# np.logspace(-16, -20, 10)
train_losses = []
test_losses = []
# One row of fitted weights per gamma is stacked into this matrix.
weights = np.empty((0,tx_mean_filled_normalized_train.shape[1]), float)
for gamma in np.nditer(gammas):

    start_time = datetime.datetime.now()
    # Column-vector inputs: (D, 1) weights and (N, 1) labels.
    initial_w = np.zeros((tx_mean_filled_normalized_train.shape[1],1))
    logistic_regression_w, logistic_regression_loss = logistic_regression(np.array([y_mean_filled_normalized_train]).T, tx_mean_filled_normalized_train, initial_w, max_iters, gamma)

    train_losses = np.append(train_losses, logistic_regression_loss)
    # compute_loss is called with 1-D labels/weights, hence the [:,0] slice;
    # its result is converted to an RMSE as sqrt(2 * loss).
    test_mse = compute_loss(y_mean_filled_normalized_test, tx_mean_filled_normalized_test, logistic_regression_w[:,0])
    test_rmse = np.sqrt(2*test_mse)
    test_losses = np.append(test_losses, test_rmse)

    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()

    weights = np.vstack((weights, logistic_regression_w.T))
    print("Logistic Regression: execution time={t:.3f} seconds. RMSE Loss={l}".format(t=exection_time, l=logistic_regression_loss))


plt.semilogx(gammas, test_losses, marker=".", color='r', label='test error')
plt.semilogx(gammas, train_losses, marker=".", color='b', label='train error')
plt.xlabel("gamma")
plt.ylabel("rmse")
plt.grid(True)
plt.legend()

In [None]:
# Try Least Squares on a preprocessed dataset.
# The original cell referenced `new_tx_train` / `new_tx_test`, which are never
# defined in this notebook (NameError on Restart & Run All). The mean-filled
# normalised split and its matching labels are used instead.
# NOTE(review): the intended dataset cannot be determined from this notebook;
# switch to the zero-filled variant here if that was the one meant.
start_time = datetime.datetime.now()
least_squares_w, least_squares_loss = least_squares(y_mean_filled_normalized_train, tx_mean_filled_normalized_train)
end_time = datetime.datetime.now()

# Held-out error: the value returned by compute_loss is converted to an RMSE
# as sqrt(2 * loss).
test_mse = compute_loss(y_mean_filled_normalized_test, tx_mean_filled_normalized_test, least_squares_w)
test_rmse = np.sqrt(2*test_mse)

exection_time = (end_time - start_time).total_seconds()
# Message typo fixed: "Lest Squares" -> "Least Squares".
print("Least Squares: execution time={t:.3f} seconds. RMSE Train Loss={l}, Test Loss={tl}".format(t=exection_time, l=least_squares_loss, tl=test_rmse))

## Cross validation


In [None]:
from plots import cross_validation_visualization

# Data fed to cross validation (currently the full raw dataset).
subset_y, subset_tx = y, tx

# Parameters: shuffling seed, number of folds and the lambda grid.
seed = 1
k_fold = 10
# With num=1, logspace yields a single value (the start point, 1e-16).
lambdas = np.logspace(-16, 2, 1)

rmse_tr = []
rmse_te = []
start_time = datetime.datetime.now()  # times the whole sweep

for lambd in np.nditer(lambdas):
    # One train loss and one test loss are collected per lambda.
    loss_tr, loss_te = cross_validation_mat(subset_y, subset_tx, k_fold, seed, lambd)
    rmse_tr = np.append(rmse_tr, loss_tr)
    rmse_te = np.append(rmse_te, loss_te)

end_time = datetime.datetime.now()
exection_time = (end_time - start_time).total_seconds()

print("Cross Validation: execution time={t:.3f} seconds.".format(t=exection_time))
# Plotting is disabled; re-enable to visualise the losses against lambda:
#cross_validation_visualization(lambdas, rmse_tr, rmse_te)

## Generate predictions and save output in csv format for submission:

In [None]:
# Write the Kaggle submission file from the most recently fitted least-squares
# weights.
# NOTE(review): the file name mentions logistic regression / cross validation
# although the weights come from least squares -- rename if that is misleading.
OUTPUT_PATH = '../data/logistic_regression_cross_validation_submission.csv' # TODO: fill in desired name of output file for submission
weights_pred = least_squares_w
# The original cell used `new_tx_pred`, which is never defined in this
# notebook (NameError on Restart & Run All). The mean-filled, normalised
# prediction matrix is used instead; it must be the same preprocessing variant
# that `least_squares_w` was trained on.
# NOTE(review): confirm the variant matches the training cell.
y_pred = predict_labels(weights_pred, tx_pred_mean_filled_normalized)
create_csv_submission(ids_pred, y_pred, OUTPUT_PATH)

http://inclass.kaggle.com/c/epfml-project-1

In [None]:
# Delete the extracted test.csv (it is above GitHub's 100mb limit) so that the
# repository can be pushed; the zipped copy stays in place.
# Guarded so that re-running the cell after the file is already gone does not
# raise FileNotFoundError.
if os.path.exists('../data/test.csv'):
    os.remove('../data/test.csv')