In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from ml_metrics import quadratic_weighted_kappa
import time
import nltk

In [2]:
data = pd.read_csv("../raw_train_test.csv")
data.loc[:,'text'] = data.apply(
    lambda x: "%s %s" % (x['query'], x['product_title']),axis = 1
)

#Tokenization via TF-IDF

We need to tokenize all of the data (train AND test) in order to be aware of all the terms in both sets. Then, we need to divide this tokenized data into train and test. In order to do this, we need to know the row indices of train and test.

In [3]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(t.lower()) for t in tokens]

tokenization_pipeline = Pipeline(steps = [
    ('tfidf',TfidfVectorizer(tokenizer = tokenize, stop_words = 'english'))
])

train_test_split_idx = data[data.dataset == 'train'].shape[0]
X = tokenization_pipeline.fit_transform(data['text'])
X_train = X[:train_test_split_idx,:] 
y_train = data.loc[:train_test_split_idx-1,'median_relevance']

test_set_ids = np.array(data.loc[train_test_split_idx:,'id'])
X_test =  X[train_test_split_idx:,:]

#Model Training

In [7]:
training_start = time.time()
if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa, 
        greater_is_better = True
    )
    
    prediction_pipeline = Pipeline(steps = [
        ('PCA',TruncatedSVD(n_components=200)),
        ('scaler',StandardScaler()),
        ('classifier',SVC(C=10,kernel='rbf'))
    ])
    
    model = prediction_pipeline.fit(X_train,y_train)
    
    scores = cross_val_score(
        estimator = prediction_pipeline,
        X = X_train,
        y = y_train,
        scoring = kappa_scorer,
        cv = StratifiedKFold(y = y_train,n_folds=10),
        n_jobs = -1
    )
print("Average cross validation score: %0.4f" % scores.mean())
print("All Cross Validation Scores: %s" % scores)
print("Training time: %0.1f seconds" % (time.time() - training_start))

Average cross validation score: 0.5975
All Cross Validation Scores: [ 0.61439355  0.54867142  0.61158444  0.62091436  0.56721934  0.6179734
  0.6106239   0.56852321  0.60003877  0.61456973]
Training time: 223.2 seconds


In [10]:
predicted_X = model.predict(X_train)

In [12]:
data_train = data[data.dataset == 'train']
data_train['predicted'] = predicted_X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [16]:
data_train = data_train[['id','query','product_title','median_relevance','predicted']]
data_train.to_csv('../04_svm_actual_vs_predicted.csv',index = False,float_format = "%d")

In [8]:
prediction_start = time.time()
results = pd.DataFrame({
'id': test_set_ids,
'prediction': model.predict(X_test)
})
print("Prediction time: %0.1f seconds" % (time.time() - prediction_start))

Prediction time: 35.6 seconds


In [9]:
results.to_csv('../submissions/04_svm.csv',index=False, float_format = "%d")