In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from ml_metrics import quadratic_weighted_kappa
import time
import nltk

In [6]:
# Read the combined CSV; later cells separate its rows via the `dataset` column.
data = pd.read_csv("../raw_train_test.csv")


def _join_query_and_title(row):
    """Return the row's query and product title as one space-separated string."""
    return "%s %s" % (row['query'], row['product_title'])


# Single free-text field per row; this is the column the TF-IDF step vectorises.
data.loc[:, 'text'] = data.apply(_join_query_and_title, axis=1)

# Tokenization via TF-IDF

We need to tokenize all of the data (train AND test) so that the TF-IDF vocabulary covers every term that appears in either set. We then need to divide the tokenized data back into train and test, and to do that we need to know which rows belong to train and which to test.

In [7]:
def tokenize(text):
    """Tokenize ``text`` and return the Porter stem of each lower-cased token.

    Parameters
    ----------
    text : str
        Raw text to split with ``nltk.word_tokenize``.

    Returns
    -------
    list
        One stem per token, in the order the tokens were produced.
    """
    porter = nltk.PorterStemmer()
    stems = []
    for word in nltk.word_tokenize(text):
        # Lower-case before stemming, as the stem is computed on that form.
        stems.append(porter.stem(word.lower()))
    return stems

# Single-step pipeline: TF-IDF weights over the stems produced by `tokenize`.
# NOTE(review): scikit-learn applies `stop_words` to the tokenizer's output
# (stems here), so stop words whose stem differs from the listed form may not
# be removed — confirm this is acceptable.
tokenization_pipeline = Pipeline(steps = [
    ('tfidf',TfidfVectorizer(tokenizer = tokenize, stop_words = 'english'))
])

# Locate train / non-train rows by POSITION from the `dataset` column itself.
# This avoids assuming that every train row precedes every test row, and
# avoids mixing positional slicing of X with label-based `.loc` on `data`
# (which only agree when `data` has a default 0..n-1 index).
is_train_row = (data['dataset'] == 'train').values
train_rows = np.flatnonzero(is_train_row)
test_rows = np.flatnonzero(~is_train_row)

# Number of training rows; kept under its original name for later cells.
train_test_split_idx = len(train_rows)

# Fit the vocabulary on every row (train and test), then split the matrix.
X = tokenization_pipeline.fit_transform(data['text'])
X_train = X[train_rows, :]
y_train = data['median_relevance'].iloc[train_rows]

# Everything that is not a train row is treated as test, as before.
test_set_ids = np.array(data['id'].iloc[test_rows])
X_test = X[test_rows, :]

# Hyperparameter Tuning

In [8]:
# Fixed seed for the SVD step, whose default solver is randomized; without it
# repeated runs of the search and cross-validation give different scores.
RANDOM_SEED = 0

# Dimensionality reduction -> standardisation -> SVM classifier.
prediction_pipeline = Pipeline(steps = [
    # The step is named 'PCA' but is a TruncatedSVD; the name is kept because
    # the grid keys below refer to it.
    ('PCA',TruncatedSVD(random_state = RANDOM_SEED)),
    ('scaler',StandardScaler()),
    ('classifier',SVC())
])

# Grid explored by the search; keys are '<step name>__<parameter>'.
grid_search_parameters = {
    # NOTE(review): -1 is not a documented value for TruncatedSVD.n_components
    # (it expects a positive dimensionality) — confirm what was intended here.
    'PCA__n_components': [100,500,1000,-1],
    # Written out in full: the former literals 10e-2 / 10e2 evaluate to
    # 0.1 / 1000.0 (not 0.01 / 100.0), which is easy to misread.
    'classifier__C': [0.1,1.0,1000.0],
    'classifier__gamma': [0.1,1.0,1000.0],
    'classifier__class_weight': [None,'auto']
}

In [9]:
# Wall-clock start of the hyperparameter search.
grid_search_start = time.time()
if __name__ == "__main__":
    # The parallel workers must be forked from inside a __main__-protected
    # block.

    # Wrap quadratic weighted kappa as a scorer; higher values are better.
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa,
        greater_is_better = True
    )

    # Stratified 3-fold split over the training labels.
    tuning_folds = StratifiedKFold(y = y_train, n_folds = 3)

    # Exhaustive search over `grid_search_parameters`, using every core.
    grid_search = GridSearchCV(
        prediction_pipeline,
        grid_search_parameters,
        scoring = kappa_scorer,
        cv = tuning_folds,
        n_jobs = -1,
        verbose = 1
    )
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best score: %0.4f" % grid_search.best_score_)
    print("Best parameters set:",best_params)
    grid_search_seconds = time.time() - grid_search_start
    print("Model trained in %0.1f seconds" % grid_search_seconds)

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed: 85.9min
[Parallel(n_jobs=-1)]: Done 210 out of 216 | elapsed: 86.5min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 86.7min finished


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best score: 0.4642
Best parameters set: {'PCA__n_components': 100, 'classifier__C': 1000.0, 'classifier__gamma': 0.1, 'classifier__class_weight': 'auto'}
Model trained in 5232.3 seconds


# Model Training

In [10]:
# Wall-clock start of the cross-validation run.
training_start = time.time()
if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # Same metric as the grid search: quadratic weighted kappa, higher is better.
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa, 
        greater_is_better = True
    )
    
    # Score the best pipeline found by the grid search with stratified
    # 10-fold cross-validation on the training data.
    scores = cross_val_score(
        estimator = grid_search.best_estimator_,
        X = X_train,
        y = y_train,
        scoring = kappa_scorer,
        cv = StratifiedKFold(y = y_train,n_folds=10),
        n_jobs = -1
    )

    # Report inside the guard: `scores` only exists when the guarded code ran,
    # so printing it outside would raise NameError otherwise.
    print("Average cross validation score: %0.4f" % scores.mean())
    print("All Cross Validation Scores: %s" % scores)
    print("Training time: %0.1f seconds" % (time.time() - training_start))

Average cross validation score: 0.4750
All Cross Validation Scores: [ 0.51681851  0.45404373  0.48279532  0.51216152  0.4175586   0.47406038
  0.47793642  0.48722882  0.43634664  0.49062778]
Training time: 126.0 seconds


In [11]:
# Wall-clock start of the prediction step.
prediction_start = time.time()

# Predict a label for every test row with the best pipeline from the search.
test_predictions = grid_search.best_estimator_.predict(X_test)

# Pair each prediction with the id of the test row it belongs to.
results = pd.DataFrame({
    'id': test_set_ids,
    'prediction': test_predictions
})

print("Prediction time: %0.1f seconds" % (time.time() - prediction_start))

Prediction time: 22.6 seconds


In [12]:
# Write the submission file without the index column. `float_format="%d"`
# renders any float-valued column as whole numbers
# (presumably the predictions come back as floats — TODO confirm).
results.to_csv(
    '../submissions/03_svm.csv',
    float_format = "%d",
    index = False
)