# Retrieve data

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the Mercari training file (tab-separated values).
# NOTE(review): this is an absolute path on one machine; consider making it configurable.
train_path = "C:\\Users\\Rishi\\Documents\\Kaggle\\Mercari\\data\\train.tsv"
# The file is a TSV, so fields are split on tab characters.
train = pd.read_csv(train_path, delimiter='\t')

# Preprocessing
## Parse out individual category levels

In [3]:
from sklearn import preprocessing

# Keep only the categorical predictors used below. Take an explicit copy so
# that the columns added afterwards are written to an independent frame and
# not to a slice of `train` (which triggers SettingWithCopyWarning, currently
# hidden by the global warnings filter).
train_x = train[["brand_name", "shipping", "category_name"]].copy()
# Regression target: the `price` column.
train_y = train.price

def piece(string, delim, n):
    """Return the n-th (0-based) `delim`-separated field of `string`.

    Parameters
    ----------
    string : object
        Value to split. Non-string values are converted with ``str`` first.
        Missing values (``None`` or a float ``NaN``) are reported as missing.
    delim : str
        Separator between fields.
    n : int
        Zero-based index of the field to return.

    Returns
    -------
    str or float
        The requested field, or ``np.nan`` when the input is missing or has
        fewer than ``n + 1`` fields.
    """
    # A missing value must stay missing: str(NaN) is the literal text "nan",
    # which contains no delimiter and would be returned as a real field for
    # n == 0, escaping any later fillna().
    if string is None or (isinstance(string, float) and np.isnan(string)):
        return np.nan
    string = str(string)
    # Fewer than n delimiters means there is no field at index n.
    if string.count(delim) < n:
        # np.nan instead of pd.np.NaN: the `pd.np` alias was removed in
        # pandas 2.0 and the `NaN` spelling was removed in NumPy 2.0.
        return np.nan
    return string.split(delim)[n]

# Split category_name on "/" into five level columns, category1..category5.
# Levels that a row does not have are left as missing values by piece().
for depth in range(5):
    level_column = "category" + str(depth + 1)
    train_x[level_column] = train_x.category_name.map(
        lambda value, depth=depth: piece(value, "/", depth)
    )

# The combined column is redundant once the individual levels exist.
train_x = train_x.drop(columns=['category_name'])
train_x.head()

Unnamed: 0,brand_name,shipping,category1,category2,category3,category4,category5
0,,1,Men,Tops,T-shirts,,
1,Razer,0,Electronics,Computers & Tablets,Components & Parts,,
2,Target,1,Women,Tops & Blouses,Blouse,,
3,,1,Home,Home Décor,Home Décor Accents,,
4,,0,Women,Jewelry,Necklaces,,


## Fill in missing data

For now we will represent any missing data with the string 'zMissing'.

In [4]:
# Replace every missing entry with the sentinel string so that "missing"
# becomes an ordinary category for the encoders.
train_x = train_x.fillna(value='zMissing')
train_x.head()

Unnamed: 0,brand_name,shipping,category1,category2,category3,category4,category5
0,zMissing,1,Men,Tops,T-shirts,zMissing,zMissing
1,Razer,0,Electronics,Computers & Tablets,Components & Parts,zMissing,zMissing
2,Target,1,Women,Tops & Blouses,Blouse,zMissing,zMissing
3,zMissing,1,Home,Home Décor,Home Décor Accents,zMissing,zMissing
4,zMissing,0,Women,Jewelry,Necklaces,zMissing,zMissing


# One hot encode
http://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features

First we map the actual categories to integers to save space. Then we one hot encode those integers.



**OneHotEncoder**

* Encode categorical integer features using a one-hot aka one-of-K scheme.
* The input to this transformer should be a matrix of integers, denoting the values taken on by categorical (discrete) features.
* The output will be a sparse matrix where each column corresponds to one possible value of one feature.
* It is assumed that input features take on values in the range [0, n_values).
* This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels.



In [5]:
# pd.get_dummies(train_x) proved too slow here, so the encoding is done with
# scikit-learn in two steps.

# Step 1: replace the values of each column with integer codes. The same
# LabelEncoder object is refit on every column in turn.
label_encoder = preprocessing.LabelEncoder()
data_label_encoded = train_x.apply(lambda column: label_encoder.fit_transform(column))

# Step 2: one-hot encode the integer codes.
encoder = preprocessing.OneHotEncoder()
train_x_encoded = encoder.fit_transform(data_label_encoded)

# Supervised Learning

Let's start by experimenting with a simple SGD regressor because of how fast it is.
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor

Further down the line we should incorporate matrix factorization as the collinearity and correlations are likely quite high. We'll also want to perform cross validation to pick optimal parameters for SGD.

In [6]:
# This competition uses root mean squared log error.
# Lets use this to evaluate our models as well.
from sklearn.metrics import make_scorer

def rmsle(y, y0):
    """Root mean squared logarithmic error between `y` and `y0`.

    Both arguments must have the same length. The computation is vectorized:
    sqrt(mean((log1p(y) - log1p(y0)) ** 2)).
    """
    assert len(y) == len(y0)
    log_difference = np.log1p(y) - np.log1p(y0)
    mean_squared_log_error = np.mean(np.power(log_difference, 2))
    return np.sqrt(mean_squared_log_error)

# Wrap rmsle as a scikit-learn scorer. RMSLE is an error (lower is better),
# so greater_is_better=False makes the scorer report it with the sign flipped.
rmsle_score = make_scorer(score_func=rmsle, greater_is_better=False)

## Stochastic Gradient Descent - SGD
Stochastic gradient descent is a simple yet very efficient approach to fit linear models. It is particularly useful when the number of samples (and the number of features) is very large. The partial_fit method allows online/out-of-core learning. The classes SGDClassifier and SGDRegressor provide functionality to fit linear models for classification and regression using different (convex) loss functions and different penalties.

We'll begin by employing a huber loss and l2 penalty.

In [7]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score

# Seed NumPy's global random generator; the estimator below is created
# without its own random_state, so this is what is meant to make runs
# repeatable. `pd.np` was only an alias for the numpy module and was removed
# in pandas 2.0, so numpy is called directly.
np.random.seed(369)

# Linear model fitted by SGD with a Huber loss and an L2 penalty.
sgd = SGDRegressor(loss="huber", penalty="l2")
# 5-fold cross-validation on the one-hot encoded features, scored with the
# RMSLE scorer.
scores = cross_val_score(sgd, train_x_encoded, train_y, cv=5, scoring=rmsle_score)

In [12]:
# Average score across the cross-validation folds (negative because the
# scorer reports RMSLE with its sign flipped).
np.mean(scores)

-0.70328071008492865

## Lets add 