In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from ml_metrics import quadratic_weighted_kappa
import time
import nltk

In [2]:
data = pd.read_csv("../raw_train_test.csv")
data.loc[:,'text'] = data.apply(
    lambda x: "%s %s" % (x['query'], x['product_title']),axis = 1
)

#Tokenization via TF-IDF

We need to tokenize all of the data (train AND test) in order to be aware of all the terms in both sets. Then, we need to divide this tokenized data into train and test. In order to do this, we need to know the row indices of train and test.

In [3]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(t.lower()) for t in tokens]

tokenization_pipeline = Pipeline(steps = [
    ('tfidf',TfidfVectorizer(tokenizer = tokenize, stop_words = 'english'))
])

train_test_split_idx = data[data.dataset == 'train'].shape[0]
X = tokenization_pipeline.fit_transform(data['text'])
X_train = X[:train_test_split_idx,:] 
y_train = data.loc[:train_test_split_idx-1,'median_relevance']
X_test =  X[train_test_split_idx:,:]

In [4]:
X_train.shape

(10158, 23720)

#Hyperparameter Tuning

In [141]:
X_tuning_sample, y_tuning_sample = X_train[:1000,:], y_train[:1000]
prediction_pipeline = Pipeline(steps = [
    ('PCA',TruncatedSVD()),
    ('scaler',StandardScaler()),
    ('classifier',LogisticRegression())
])

grid_search_parameters = {
    'PCA__n_components': [100,500,-1],
    'classifier__penalty': ['l1','l2'],
    'classifier__class_weight': [None,'auto']
}

In [142]:
grid_search_start = time.time()
if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block
    
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa, 
        greater_is_better = True
    )
    
    grid_search = GridSearchCV(
        estimator  = prediction_pipeline,
        param_grid = grid_search_parameters,
        cv = 5,
        n_jobs = -1,
        verbose = 1,
        scoring = kappa_scorer
    )
    grid_search.fit(X_tuning_sample,y_tuning_sample)
    best_params = grid_search.best_params_
    print("Best score: %0.4f" % grid_search.best_score_)
    print("Best parameters set:",best_params)
    print("Model trained in %0.1f seconds" % (time.time() - grid_search_start))

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.3098
Best parameters set: {'classifier__class_weight': 'auto', 'classifier__penalty': 'l2', 'PCA__n_components': 500}
Model trained in 69.0 seconds


#Model Training

In [163]:
training_start = time.time()
if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa, 
        greater_is_better = True
    )
    
    scores = cross_val_score(
        estimator = grid_search.best_estimator_,
        X = X_train,
        y = y_train,
        scoring = kappa_scorer,
        cv = 10,
        n_jobs = -1
    )
print("Average cross validation score: %0.4f" % scores.mean())
print("Training time: %0.1f seconds" % (time.time() - training_start))

Average cross validation score: 0.4601
Training time: 94.9 seconds


In [185]:
prediction_start = time.time()
results = pd.DataFrame({
'id': np.array(data.loc[train_test_split_idx:,'id']),
'prediction': grid_search.best_estimator_.predict(X_test)
})
print("Prediction time: %0.1f seconds" % (time.time() - prediction_start))

Prediction time: 0.3 seconds


In [188]:
results.to_csv('../submissions/01_logistic_regression_0.46.csv',index=False, float_format = "%d")