In [24]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os
import datetime
import seaborn as sns
import pandas as pd
import cProfile
from matplotlib.mlab import PCA
from functions import *
from costs import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# GitHub rejects files larger than 100 MB and test.csv is about 104 MB,
# so the repository ships a zipped copy that has to be unpacked first.
zip_ref = zipfile.ZipFile("../data/test.csv.zip", "r")
try:
    zip_ref.extractall("../data/")
finally:
    # Always release the archive handle, even if extraction fails.
    zip_ref.close()

In [3]:
from proj1_helpers import *

# Location of the training CSV (it must be downloaded into ../data/ beforehand).
DATA_TRAIN_PATH = '../data/train.csv'
y, tx, ids = load_csv_data(DATA_TRAIN_PATH)

# Sanity check: print the shape of every array returned by the loader.
for loaded in (y, tx, ids):
    print(loaded.shape)

(250000,)
(250000, 30)
(250000,)


In [4]:
# Location of the test CSV (extracted from the zip archive in ../data/).
DATA_TEST_PATH = '../data/test.csv'
y_test, tx_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Sanity check: shapes of the test labels and the test feature matrix.
for loaded in (y_test, tx_test):
    print(loaded.shape)

(568238,)
(568238, 30)


In [5]:
# Stack the train and test sets on top of each other (train rows first).
# NOTE(review): y_test is whatever load_csv_data returned for test.csv --
# confirm it holds real labels before using all_y to fit a model.
all_y = np.concatenate((np.ravel(y), np.ravel(y_test)))
all_tx = np.concatenate((tx, tx_test), axis=0)

for stacked in (all_y, all_tx):
    print(stacked.shape)

(818238,)
(818238, 30)


In [6]:
# fig = plt.figure()

# ax2 = fig.add_subplot(1, 1, 1)
# ax2.scatter(tX[:,0].T, y, marker=".", color='b', s=5)
# ax2.set_xlabel("x")
# ax2.set_ylabel("y")
# ax2.grid()
# fig

# Do your thing crazy machine learning thing here :) ...

## Grading Criteria:
1. Competitive Part **(counts one third)**. The final rank of your team in the (private) leaderboard will be translated linearly to a scale from 4 to 6.
2. Code **(counts one third)**. In Python. No external libraries allowed! For this first project, we want you to implement and use the methods we have seen in class. The code will be graded by two TAs independently, according to the criteria described:
* Rules for the code part:
  * Reproducibility: In your submission, you must provide a script run.py which produces exactly the same .csv predictions which you used in your best submission to the competition on Kaggle.
  * Documentation: Your ML system must be clearly described in your PDF report and also well-documented in the code itself. A clear ReadMe file must be provided. The documentation must also include all data preparation, feature generation as well as cross-validation steps that you have used.
  * In addition to your customized system, don’t forget that your code submission must still also include the 6 basic method implementations as described above in step 2.
  * No use of external ML libraries is allowed in Project 1. (It will be allowed in Project 2).
  * No external datasets allowed.
3. Written Report **(counts one third)**. You will write a maximum 2 page PDF report on your findings, using LaTeX. The code will be graded by two TAs independently, and we will provide you feedback. The main criteria will be if you were able to correctly use, implement and describe the 6 baseline methods mentioned in Step 2 above. This counts half for the written report. In addition, we will grade you on the scientific contribution you made additionally, to improve your predictions. For this part, the criteria are
  * scientific novelty
  * creativity
  * reproducibility
  * solid comparison baselines supporting your claims
  * writeup quality
  

As usual, your code and report will be automatically checked for plagiarism.

# Todo's

* verify correctness of implemented methods
* (!) implement local estimation on local validation test set and local **cross validation**! **CV DONE**
* fix and check reg_logistic_regression
* Exploratory data analysis with comments
* Dataset cleaning
* Comment code and this notebook
* Improve predictions to be number one on Kaggle!
  * construct better features (optional)
  * implement additional modifications of basic methods implemented (optional)
  * clean and preprocess data
* LaTeX PDF report

In [None]:
# Baseline: least squares fitted with gradient descent, swept over several step sizes.

# Sweep settings.
max_iters = 100
gammas = np.logspace(-7, -9, 10)
losses = []
for gamma in np.nditer(gammas):
    # Time one complete gradient-descent fit for this step size.
    start_time = datetime.datetime.now()
    grad_loss, gradient_w = least_squares_GD(y, tx, gamma, max_iters)
    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()

    # Convert the returned loss with compute_rmse_loss, keep it for the plot and report it.
    grad_loss = compute_rmse_loss(grad_loss)
    losses = np.append(losses, grad_loss)
    print("Gradient Descent: execution time={t:.3f} seconds. RMSE Loss={l}".format(t=exection_time, l=grad_loss))

# Loss as a function of the step size (log-scaled x axis).
plt.semilogx(gammas, losses, marker=".", color='b')
plt.xlabel("gamma")
plt.ylabel("rmse")
plt.grid(True)

In [None]:
# Baseline: least squares fitted with stochastic gradient descent, swept over several step sizes.

# Sweep settings.
max_iters = 100
gammas = np.logspace(-3, -10, 10)
losses = []
for gamma in np.nditer(gammas):
    # Time one complete SGD fit for this step size.
    start_time = datetime.datetime.now()
    stoch_grad_loss, stoch_gradient_w = least_squares_SGD(y, tx, gamma, max_iters)
    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()

    # Convert the returned loss with compute_rmse_loss, keep it for the plot and report it.
    stoch_grad_loss = compute_rmse_loss(stoch_grad_loss)
    losses = np.append(losses, stoch_grad_loss)
    print("Stochastic Gradient Descent: execution time={t:.3f} seconds. RMSE Loss={l}".format(t=exection_time, l=stoch_grad_loss))

# Loss as a function of the step size (log-scaled x axis).
plt.semilogx(gammas, losses, marker=".", color='b')
plt.xlabel("gamma")
plt.ylabel("rmse")
plt.grid(True)

In [None]:
# Least squares (closed form). This fit produced our best Kaggle result so far:
# 57th position (Mateusz Paluchowski), score 0.74463.
# NOTE(review): the model is fitted on all_y / all_tx, i.e. train and test rows
# stacked together -- confirm the test labels are meaningful for training.
start_time = datetime.datetime.now()
least_squares_loss, least_squares_w = least_squares(all_y, all_tx)
end_time = datetime.datetime.now()

# Convert the returned loss with compute_rmse_loss and report it with the elapsed time.
least_squares_loss = compute_rmse_loss(least_squares_loss)
exection_time = (end_time - start_time).total_seconds()
print("Least Squares: execution time={t:.3f} seconds. RMSE Loss={l}".format(t=exection_time, l=least_squares_loss))

In [None]:
# Ridge regression: sweep the regularisation strength lambda.
# To be checked: changing lambda barely affects the loss, only large values do.

# Define the parameters of the algorithm.
losses = []
lambs = np.logspace(-4, -12, 80)
start_time = datetime.datetime.now()
for lamb in np.nditer(lambs):
    ridge_regression_loss, ridge_regression_gradient_w = ridge_regression(y, tx, lamb)

    # Convert the returned loss with compute_rmse_loss and keep it for the plot.
    ridge_regression_loss = compute_rmse_loss(ridge_regression_loss)
    losses = np.append(losses, ridge_regression_loss)

# The timing covers the whole sweep, not a single fit.
end_time = datetime.datetime.now()
exection_time = (end_time - start_time).total_seconds()
print("Ridge Regression: execution time={t:.3f} seconds.".format(t=exection_time))
plt.semilogx(lambs, losses, marker=".", color='b')
# The swept quantity is the regularisation strength lambda, not a step size gamma.
plt.xlabel("lambda")
plt.ylabel("rmse")
plt.grid(True)

In [None]:
# Scratch cell: build a small log-spaced grid of candidate step sizes and display it.
gammas = np.logspace(-20, -23, 3)
gammas

# Losses noted by hand from earlier runs with 1000 iterations, keyed by step size.
# NOTE(review): the notes do not say which method produced them (presumably the
# logistic regression cell) and 5e-22 is not a point of the grid above -- confirm.
# For 1000 iters
# 1e-20: MSE Loss=-65117.90563844488
# 1e-21: MSE Loss=149446.2867804076
# 5e-22: MSE Loss=161366.53989681418
# 1e-23: MSE Loss=173048.39001428057
# 

In [None]:
# Logistic regression fitted with gradient descent, swept over several step sizes.

# Sweep settings (a previously tried grid is kept in the trailing comment).
max_iters = 100
losses = []
gammas = np.logspace(-18, -23, 10)# np.logspace(-16, -20, 10)
for gamma in np.nditer(gammas):

    # Time one fit; the labels are passed as a column vector.
    start_time = datetime.datetime.now()
    logistic_regression_loss, logistic_regression_w = logistic_regression_GD(np.array([y]).T, tx, gamma, max_iters)
    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()

    # Convert the returned loss with compute_rmse_loss, keep it for the plot and report it.
    logistic_regression_loss = compute_rmse_loss(logistic_regression_loss)
    losses = np.append(losses, logistic_regression_loss)
    print("Logistic Regression: execution time={t:.3f} seconds. RMSE Loss={l}".format(t=exection_time, l=logistic_regression_loss))


# Loss as a function of the step size (log-scaled x axis).
plt.semilogx(gammas, losses, marker=".", color='b', label='log reg rmse error')
plt.xlabel("gamma")
plt.ylabel("rmse")
plt.grid(True)

In [None]:
# TODO: regularised (penalised) logistic regression with gradient descent.
# Single run with fixed hyper-parameters; still work in progress.

# Hyper-parameters.
max_iters = 1
gamma = 3.41379310345e-14
lambd = 0.1

# Time the fit; the labels are passed as a column vector.
start_time = datetime.datetime.now()
logistic_regression_loss, logistic_regression_w = reg_logistic_regression(np.array([y]).T, tx, lambd, gamma, max_iters)
end_time = datetime.datetime.now()
exection_time = (end_time - start_time).total_seconds()

# NOTE(review): the message says "RMSE Loss", but unlike the other method cells the
# loss is printed as returned, without going through compute_rmse_loss -- confirm.
print("Penalized Logistic Regression: execution time={t:.3f} seconds. RMSE Loss={l}".format(t=exection_time, l=logistic_regression_loss))

In [None]:
# TODO: logistic regression fitted with Newton's method.

# Define the parameters of the algorithm.
max_iters = 2

# Only the first (largest) step size of the grid is tried for now;
# widen the slice to sweep more values.
gammas = np.logspace(-18, -23, 10)
for gamma in np.nditer(gammas[:1]):

    # Time one fit; the labels are passed as a column vector.
    start_time = datetime.datetime.now()
    logistic_regression_newton_loss, logistic_regression_newton_w = logistic_regression_newton(np.array([y]).T, tx, gamma, max_iters)
    end_time = datetime.datetime.now()

    # Convert the returned loss with compute_rmse_loss and report it with the elapsed time.
    exection_time = (end_time - start_time).total_seconds()
    logistic_regression_newton_loss = compute_rmse_loss(logistic_regression_newton_loss)
    print("Logistic Regression (Newton): execution time={t:.3f} seconds. RMSE Loss={l}".format(t=exection_time, l=logistic_regression_newton_loss))

## Exploratory data analysis

In [7]:
# Wrap the training feature matrix in a pandas DataFrame, which is more
# convenient for data analysis, and preview the first rows.
original_df = pd.DataFrame(tx)
# original_df.columns = original_df.columns.astype(str)
original_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,197.76,...,-0.277,258.733,2.0,67.435,2.15,0.444,46.062,1.24,-2.475,113.497
1,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,125.157,...,-1.916,164.546,1.0,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226
2,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,197.814,...,-2.186,260.414,1.0,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251
3,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,75.968,...,0.06,86.062,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
4,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,57.983,...,-0.871,53.131,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0


In [8]:
# Basic statistics: count the -999 placeholders in every feature column and
# remember which columns contain at least one of them.
sparse_column_labels = []
for column in original_df.columns:
    # Direct count of the placeholder; no need to tabulate every distinct value.
    missing_count = int((original_df[column] == -999).sum())
    if missing_count > 0:
        sparse_column_labels.append(column)
        print("Column {c_no} contains {v_no} values equal to -999".format(c_no = column, v_no = missing_count))

# Stored as integer labels so the array can be used directly to select columns.
sparse_columns = np.array(sparse_column_labels, dtype=int)

print('Sparse columns:')
print(sparse_columns)
first10_df = original_df.iloc[:,:10]
first10_df.describe()
# As we can see there are features (columns) which are affected greatly by missing values represented as -999,
# for example the 4th, 5th and 6th. We should do something about it.

Column 0 contains 38114 values equal to -999
Column 4 contains 177457 values equal to -999
Column 5 contains 177457 values equal to -999
Column 6 contains 177457 values equal to -999
Column 12 contains 177457 values equal to -999
Column 23 contains 99913 values equal to -999
Column 24 contains 99913 values equal to -999
Column 25 contains 99913 values equal to -999
Column 26 contains 177457 values equal to -999
Column 27 contains 177457 values equal to -999
Column 28 contains 177457 values equal to -999
Sparse columns:
[  0.   4.   5.   6.  12.  23.  24.  25.  26.  27.  28.]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,-49.023079,49.239819,81.181982,57.895962,-708.420675,-601.237051,-709.356603,2.3731,18.917332,158.432217
std,406.345647,35.344886,40.828691,63.655682,454.480565,657.972302,453.019877,0.782911,22.273494,115.706115
min,-999.0,0.0,6.329,0.0,-999.0,-999.0,-999.0,0.208,0.0,46.104
25%,78.10075,19.241,59.38875,14.06875,-999.0,-999.0,-999.0,1.81,2.841,77.55
50%,105.012,46.524,73.752,38.4675,-999.0,-999.0,-999.0,2.4915,12.3155,120.6645
75%,130.60625,73.598,92.259,79.169,0.49,83.446,-4.593,2.961,27.591,200.47825
max,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,1852.462


In [9]:
# Summary statistics for the feature columns at positions 10 to 19.
middle10_df = original_df.iloc[:,10:20]
middle10_df.describe()

Unnamed: 0,10,11,12,13,14,15,16,17,18,19
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,1.437609,-0.128305,-708.985189,38.707419,-0.010973,-0.008171,46.660207,-0.019507,0.043543,41.717235
std,0.844743,1.193585,453.596721,22.412081,1.214079,1.816763,22.064922,1.264982,1.816611,32.894693
min,0.047,-1.414,-999.0,20.0,-2.499,-3.142,26.0,-2.505,-3.142,0.109
25%,0.883,-1.371,-999.0,24.59175,-0.925,-1.575,32.375,-1.014,-1.522,21.398
50%,1.28,-0.356,-999.0,31.804,-0.023,-0.033,40.516,-0.045,0.086,34.802
75%,1.777,1.225,0.0,45.017,0.898,1.565,53.39,0.959,1.618,51.895
max,19.773,1.414,1.0,764.408,2.497,3.142,560.271,2.503,3.142,2842.617


In [10]:
# Summary statistics for the remaining feature columns (position 20 onwards).
last10_df = original_df.iloc[:,20:]
last10_df.describe()

Unnamed: 0,20,21,22,23,24,25,26,27,28,29
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,-0.010119,209.797178,0.979176,-348.329567,-399.254314,-399.259788,-692.381204,-709.121609,-709.118631,73.064591
std,1.812223,126.499506,0.977426,532.962789,489.338286,489.333883,479.875496,453.384624,453.389017,98.015662
min,-3.142,13.678,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
25%,-1.575,123.0175,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
50%,-0.024,179.739,1.0,38.96,-1.872,-2.093,-999.0,-999.0,-999.0,40.5125
75%,1.561,263.37925,2.0,75.349,0.433,0.503,33.703,-2.457,-2.275,109.93375
max,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433


In [11]:
# Count real null entries per column.
original_df.isnull().sum()

# At least there are no null values! (-999 plays the role of null in this dataset.)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
dtype: int64

In [12]:
# Replace the -999 placeholder with NaN so that pandas treats those entries as missing.

replaced999_df = original_df.replace(-999, np.nan)
replaced999_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,138.470,51.655,97.827,27.980,0.910,124.711,2.666,3.064,41.928,197.760,...,-0.277,258.733,2.0,67.435,2.150,0.444,46.062,1.240,-2.475,113.497
1,160.937,68.768,103.235,48.146,,,,3.473,2.078,125.157,...,-1.916,164.546,1.0,46.226,0.725,1.158,,,,46.226
2,,162.172,125.953,35.635,,,,3.148,9.336,197.814,...,-2.186,260.414,1.0,44.251,2.053,-2.028,,,,44.251
3,143.905,81.417,80.943,0.414,,,,3.310,0.414,75.968,...,0.060,86.062,0.0,,,,,,,0.000
4,175.864,16.915,134.805,16.405,,,,3.891,16.405,57.983,...,-0.871,53.131,0.0,,,,,,,0.000
5,89.744,13.550,59.149,116.344,2.636,284.584,-0.540,1.362,61.619,278.876,...,2.237,282.849,3.0,90.547,-2.412,-0.653,56.165,0.224,3.106,193.660
6,148.754,28.862,107.782,106.130,0.733,158.359,0.113,2.941,2.545,305.967,...,-1.443,294.074,2.0,123.010,0.864,1.450,56.867,0.131,-2.767,179.877
7,154.916,10.418,94.714,29.169,,,,2.897,1.526,138.178,...,-1.761,187.299,1.0,30.638,-0.715,-1.724,,,,30.638
8,105.594,50.559,100.989,4.288,,,,2.904,4.288,65.333,...,0.024,129.804,0.0,,,,,,,0.000
9,128.053,88.941,69.272,193.392,,,,1.609,28.859,255.123,...,0.845,294.741,1.0,167.735,-2.767,-2.514,,,,167.735


In [13]:
# Summary statistics once the -999 placeholders have been turned into NaN.
replaced999_df.describe()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,211886.0,250000.0,250000.0,250000.0,72543.0,72543.0,72543.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,150087.0,150087.0,150087.0,72543.0,72543.0,72543.0,250000.0
mean,121.858528,49.239819,81.181982,57.895962,2.403735,371.78336,-0.821688,2.3731,18.917332,158.432217,...,-0.010119,209.797178,0.979176,84.822105,-0.003275,-0.012393,57.679474,-0.011845,-0.001582,73.064591
std,57.298157,35.344886,40.828691,63.655682,1.742226,397.699325,3.584362,0.782911,22.273494,115.706115,...,1.812223,126.499506,0.977426,60.662276,1.784546,1.813385,31.985782,2.031743,1.81695,98.015662
min,9.044,0.0,6.329,0.0,0.0,13.602,-18.066,0.208,0.0,46.104,...,-3.142,13.678,0.0,30.0,-4.499,-3.142,30.0,-4.5,-3.142,0.0
25%,,19.241,59.38875,14.06875,,,,1.81,2.841,77.55,...,-1.575,123.0175,0.0,,,,,,,0.0
50%,,46.524,73.752,38.4675,,,,2.4915,12.3155,120.6645,...,-0.024,179.739,1.0,,,,,,,40.5125
75%,,73.598,92.259,79.169,,,,2.961,27.591,200.47825,...,1.561,263.37925,2.0,,,,,,,109.93375
max,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,1852.462,...,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433


In [14]:
#_ = pd.scatter_matrix(replaced999_df.loc[:,0:10], figsize=(20,20), diagonal='hist')

In [15]:
#_ = pd.scatter_matrix(replaced999_df.loc[:,10:20], figsize=(20,20), diagonal='hist')

In [16]:
#_ = pd.scatter_matrix(replaced999_df.loc[:,20:30], figsize=(20,20), diagonal='hist')

In [17]:
# Sum all sparse columns row-wise and store the result as a new 'combined' feature.
# Work on a copy: a plain assignment would only alias replaced999_df, so adding
# the column would silently modify the frame displayed in the earlier cells.
combined_df = replaced999_df.copy()
# sum() skips NaN by default, so a row whose sparse features are all missing gets 0.
combined_df['combined'] = replaced999_df[sparse_columns].sum(axis=1)
combined_df.describe()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,combined
count,211886.0,250000.0,250000.0,250000.0,72543.0,72543.0,72543.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,150087.0,150087.0,150087.0,72543.0,72543.0,72543.0,250000.0,250000.0
mean,121.858528,49.239819,81.181982,57.895962,2.403735,371.78336,-0.821688,2.3731,18.917332,158.432217,...,209.797178,0.979176,84.822105,-0.003275,-0.012393,57.679474,-0.011845,-0.001582,73.064591,279.400081
std,57.298157,35.344886,40.828691,63.655682,1.742226,397.699325,3.584362,0.782911,22.273494,115.706115,...,126.499506,0.977426,60.662276,1.784546,1.813385,31.985782,2.031743,1.81695,98.015662,343.361054
min,9.044,0.0,6.329,0.0,0.0,13.602,-18.066,0.208,0.0,46.104,...,13.678,0.0,30.0,-4.499,-3.142,30.0,-4.5,-3.142,0.0,0.0
25%,,19.241,59.38875,14.06875,,,,1.81,2.841,77.55,...,123.0175,0.0,,,,,,,0.0,99.7495
50%,,46.524,73.752,38.4675,,,,2.4915,12.3155,120.6645,...,179.739,1.0,,,,,,,40.5125,158.9285
75%,,73.598,92.259,79.169,,,,2.961,27.591,200.47825,...,263.37925,2.0,,,,,,,109.93375,328.585
max,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,1852.462,...,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433,5278.662


In [18]:
# Check whether the new 'combined' column contains any NaN.
# value_counts() drops NaN by default, so testing `np.nan in value_counts`
# could never report one; ask pandas directly instead.
combined_df['combined'].isnull().any()

# No NaN's! Impressive!

False

In [19]:
# Keep only the columns without any missing value. how='any' expresses this
# without hard-coding the number of rows of the training set.
droped_nans_df = combined_df.dropna(axis=1, how='any')
print(droped_nans_df.columns.shape)
droped_nans_df

(20,)


Unnamed: 0,1,2,3,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,29,combined
0,51.655,97.827,27.980,3.064,41.928,197.760,1.582,1.396,32.638,1.017,0.381,51.626,2.273,-2.414,16.824,-0.277,258.733,2.0,113.497,381.813
1,68.768,103.235,48.146,3.473,2.078,125.157,0.879,1.414,42.014,2.039,-3.011,36.918,0.501,0.103,44.704,-1.916,164.546,1.0,46.226,209.046
2,162.172,125.953,35.635,3.148,9.336,197.814,3.776,1.414,32.154,-0.705,-2.093,121.409,-0.953,1.052,54.283,-2.186,260.414,1.0,44.251,44.276
3,81.417,80.943,0.414,3.310,0.414,75.968,2.354,-1.285,22.647,-1.655,0.010,53.321,-0.522,-3.100,31.082,0.060,86.062,0.0,0.000,143.905
4,16.915,134.805,16.405,3.891,16.405,57.983,1.056,-1.385,28.209,-2.197,-2.231,29.774,0.798,1.569,2.723,-0.871,53.131,0.0,0.000,175.864
5,13.550,59.149,116.344,1.362,61.619,278.876,0.588,0.479,53.651,0.371,1.329,31.565,-0.884,1.857,40.735,2.237,282.849,3.0,193.660,524.376
6,28.862,107.782,106.130,2.941,2.545,305.967,3.371,1.393,28.850,1.113,2.409,97.240,0.675,-0.966,38.421,-1.443,294.074,2.0,179.877,488.305
7,10.418,94.714,29.169,2.897,1.526,138.178,0.365,-1.305,78.800,0.654,1.547,28.740,0.506,-1.347,22.275,-1.761,187.299,1.0,30.638,183.115
8,50.559,100.989,4.288,2.904,4.288,65.333,0.675,-1.366,39.008,2.433,-2.532,26.325,0.210,1.884,37.791,0.024,129.804,0.0,0.000,105.594
9,88.941,69.272,193.392,1.609,28.859,255.123,0.599,0.538,54.646,-1.533,0.416,32.742,-0.317,-0.636,132.678,0.845,294.741,1.0,167.735,290.507


In [20]:
#_ = pd.scatter_matrix(droped_nans_df, figsize=(20,20), diagonal='hist')

In [21]:
# Centre every remaining column on its mean. Scaling by the column range was
# tried as well and is kept here for reference:
#   / (droped_nans_df.max() - droped_nans_df.min())
normalized_df = droped_nans_df.sub(droped_nans_df.mean(), axis='columns')
normalized_df.describe()

Unnamed: 0,1,2,3,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,29,combined
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,1.515898e-13,-1.446938e-13,4.52839e-13,1.913874e-14,1.428226e-13,3.278874e-13,-5.897574e-15,5.456513e-15,-1.331884e-13,1.213172e-16,1.115001e-16,5.696898e-13,-1.150724e-16,-2.270486e-16,2.720095e-13,2.017737e-16,-1.139248e-12,2.523876e-15,-2.269052e-13,2.669678e-12
std,35.34489,40.82869,63.65568,0.7829111,22.27349,115.7061,0.8447429,1.193585,22.41208,1.214079,1.816763,22.06492,1.264982,1.816611,32.89469,1.812223,126.4995,0.9774263,98.01566,343.3611
min,-49.23982,-74.85298,-57.89596,-2.1651,-18.91733,-112.3282,-1.390609,-1.285695,-18.70742,-2.488027,-3.133829,-20.66021,-2.485493,-3.185543,-41.60823,-3.131881,-196.1192,-0.979176,-73.06459,-279.4001
25%,-29.99882,-21.79323,-43.82721,-0.5630998,-16.07633,-80.88222,-0.5546094,-1.242695,-14.11567,-0.914027,-1.566829,-14.28521,-0.9944925,-1.565543,-20.31923,-1.564881,-86.77968,-0.979176,-73.06459,-179.6506
50%,-2.715819,-7.429982,-19.42846,0.1184002,-6.601832,-37.76772,-0.1576094,-0.2276953,-6.903419,-0.01202695,-0.02482893,-6.144207,-0.02549253,0.04245704,-6.915235,-0.01388081,-30.05818,0.020824,-32.55209,-120.4716
75%,24.35818,11.07702,21.27304,0.5879002,8.673668,42.04603,0.3393906,1.353305,6.309581,0.908973,1.573171,6.729793,0.9785075,1.574457,10.17777,1.571119,53.58207,1.020824,36.86916,49.18492
max,640.8352,1268.169,2777.103,3.3109,2816.082,1694.03,18.33539,1.542305,725.7006,2.507973,3.150171,513.6108,2.522507,3.098457,2800.9,3.152119,1794.179,2.020824,1560.368,4999.262


In [22]:
# Apply exactly the same cleaning steps to the test dataset.
original_test_df = pd.DataFrame(tx_test)
replaced999_test_df = original_test_df.replace(-999, np.nan)
# Work on a copy so that adding the helper column does not mutate replaced999_test_df.
combined_test_df = replaced999_test_df.copy()
combined_test_df['combined'] = replaced999_test_df[sparse_columns].sum(axis=1)
# Keep only fully populated columns, same rule as for the training set
# (no hard-coded row-count threshold).
droped_nans_test_df = combined_test_df.dropna(axis=1, how='any')
# NOTE(review): the test set is centred with its own column means rather than
# the training means -- confirm this is intended.
normalized_test_df = (droped_nans_test_df - droped_nans_test_df.mean())# / (droped_nans_test_df.max() - droped_nans_test_df.min())

# Verify that the same columns are left in train and test.
print(droped_nans_df.columns == droped_nans_test_df.columns)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True]


In [23]:
# Convert the cleaned frames back to numpy arrays.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and is available in old and new pandas versions alike.

new_tx = normalized_df.values
new_tx_test = normalized_test_df.values
# Train rows first, then test rows.
new_all_tx = np.concatenate((new_tx, new_tx_test))
print(new_tx.shape)
print(new_tx_test.shape)
print(new_all_tx.shape)

(250000, 20)
(568238, 20)
(818238, 20)


In [37]:
# Dimensionality-reduction experiment with PCA.
for pca_input in (new_tx, y):
    print(pca_input.shape)

# Append the labels as an extra last column of the feature matrix.
# NOTE(review): with y stacked into the PCA input the projection depends on
# the labels -- confirm this is intended.
tx_y = np.column_stack((new_tx, y))
print(tx_y.shape)

# Per the original note, the data projected into PCA space; it is read from results.Y.
results = PCA(tx_y)
print(results.Y.shape)



(250000, 20)
(250000,)
(250000, 21)
(250000, 21)


In [None]:
# Re-run the method that yielded the best result so far (least squares),
# this time on the cleaned training features.

# No improvement :(
start_time = datetime.datetime.now()
new_least_squares_loss, new_least_squares_w = least_squares(y, new_tx)
end_time = datetime.datetime.now()

# Convert the returned loss with compute_rmse_loss and report it with the elapsed time.
new_least_squares_loss = compute_rmse_loss(new_least_squares_loss)
exection_time = (end_time - start_time).total_seconds()
print("Least Squares: execution time={t:.3f} seconds. RMSE Loss={l}".format(t=exection_time, l=new_least_squares_loss))

## Cross validation


In [None]:
from plots import cross_validation_visualization

# Data used for cross validation.
# NOTE(review): all_y / new_all_tx stack the train rows and the rows loaded from
# test.csv -- confirm the test labels are meaningful before trusting these errors.
subset_y, subset_tx = all_y, new_all_tx

# Hyper-parameters of the sweep.
seed, k_fold = 1, 10
lambdas = np.logspace(-8, 2, 10)

# Per-lambda results: train/test losses and one weight row per lambda.
rmse_tr, rmse_te = [], []
n_features = subset_tx.shape[1]
weights = np.empty((0, n_features), float)

start_time = datetime.datetime.now()
for lambd in np.nditer(lambdas):
    w, loss_tr, loss_te = cross_validation(subset_y, subset_tx, k_fold, seed, lambd)
    weights = np.vstack((weights, w))
    rmse_tr = np.append(rmse_tr, loss_tr)
    rmse_te = np.append(rmse_te, loss_te)
end_time = datetime.datetime.now()

# The timing covers the whole sweep.
exection_time = (end_time - start_time).total_seconds()
print("Cross Validation: execution time={t:.3f} seconds.".format(t=exection_time))
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [None]:
# Select the lambda with the lowest validation error instead of a hand-picked
# position, so the choice stays valid when the lambda grid or the data change.
# NOTE(review): assumes rmse_te holds one value per lambda, aligned with the rows of weights.
best_index = int(np.nanargmin(rmse_te))
print(rmse_te[best_index])
best_weights = weights[best_index]
print(best_weights.shape)

In [None]:
# Evaluate the selected weights locally. The evaluation uses y and new_tx,
# i.e. the cleaned training data.
test_mse = compute_loss(y, new_tx, best_weights)
test_rmse = np.sqrt(test_mse * 2)

for metric in (test_mse, test_rmse):
    print(metric)

## Generate predictions and save output in CSV format for submission:

In [None]:
OUTPUT_PATH = '../data/least_squares_cleaned_data_overfitted_submission.csv' # name of the CSV file written for submission
# Use the least-squares weights fitted on the cleaned training features.
weights = new_least_squares_w
# Predict on the cleaned TEST features: the predictions are paired with ids_test,
# so they must have one row per test event (new_tx holds the training rows).
y_pred = predict_labels(weights, new_tx_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

http://inclass.kaggle.com/c/epfml-project-1

In [None]:
# Delete the extracted test.csv so that GitHub accepts the push
# (the file is above the size limit; the zipped copy stays in the repository).
# Guarded so that re-running the cell does not fail once the file is gone.
extracted_test_csv = '../data/test.csv'
if os.path.exists(extracted_test_csv):
    os.remove(extracted_test_csv)