In [4]:
import pandas as pd
import gensim
import re
import numpy as np

In [5]:
headers = "country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description, price,product_type".split(",")

# Training data
data_train = pd.read_csv("./data/training/data_train.csv", names=headers)
clarity_train = pd.read_csv("./data/training/clarity_train.labels", names=['clarity_label'])
conciseness_train = pd.read_csv("./data/training/conciseness_train.labels", names=['conciseness_label'])

data_train['clarity_label'] = clarity_train['clarity_label']
data_train['conciseness_label'] = conciseness_train['conciseness_label']

# Validation
data_validate = pd.read_csv("./data/validation/data_valid.csv", names=headers)

In [6]:
def repair_feature_dataframe(data_train, w2v_model, contain_label=False):
    # Title cleaning
    def clean_title(title):
        title = title.lower()
        title = title.replace("/", " / ")
        title = re.sub(r"[^A-z0-9\s-]", "", title)

        return title

    data_train['title_clean'] = data_train.title.apply(clean_title)
    
    selected_label = ['title_clean']
    if contain_label:
        selected_label += ['clarity_label', 'conciseness_label']
    data_train = data_train[selected_label]
    
    # Word2vec title
    def vector_title(title, w2v_model):
        title = title.split(" ")
        vec = np.zeros(300)
        for w in title:
            try:
                vec += w2v_model[w]
            except:
                pass

        return vec

    data_train['vector_title'] = data_train.title_clean.apply(lambda t: vector_title(t, w2v_model))
    return data_train

In [7]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [8]:
data_train = repair_feature_dataframe(data_train, w2v_model, contain_label=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
data_validate = repair_feature_dataframe(data_validate, w2v_model, contain_label=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
data_train

Unnamed: 0,title_clean,clarity_label,conciseness_label,vector_title
0,adana gallery suri square hijab light pink,1,1,"[-0.209838867188, -0.318603515625, 0.740234375..."
1,cuba heartbreaker eau de parfum spray 100ml 33oz,1,1,"[0.033447265625, -0.02001953125, 0.71508789062..."
2,andoer 150cm cellphone smartphone mini dual-he...,1,0,"[0.1669921875, -1.40698242188, -1.36212158203,..."
3,anmyna complaint silky set shampoo 520ml con...,1,1,"[-0.15185546875, 0.803344726562, -0.0489807128..."
4,argital argiltubo green clay for face and body...,1,1,"[0.155151367188, 0.319580078125, 1.06811523438..."
5,asus tp300lj-dw004h transformer book flip 4gb ...,1,1,"[-0.063232421875, 0.209411621094, 0.5410003662..."
6,ng-40c ring-shaped 40w 3166lm 5400k macro phot...,1,1,"[-0.204246520996, 0.407958984375, -0.41015625,..."
7,buytra exfoliating peel foot mask 1pair,1,1,"[0.027099609375, 1.09033203125, 0.050720214843..."
8,cliptec occ121 slim flat usb 30 extension cabl...,1,1,"[-0.136474609375, 0.530029296875, -0.563232421..."
9,mcdonalds coke can glass limited edition 12oz ...,1,1,"[-0.496246337891, 0.174072265625, -0.504882812..."


# Checkpoint training data

In [15]:
data_train.to_csv("./checkpoint/data_train.csv", index=False)
data_validate.to_csv("./checkpoint/data_validate.csv", index=False)

# SVM Model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

## 1. SVC

In [13]:
X = data_train.vector_title.tolist()
y_clarity_label = data_train.clarity_label.tolist()
y_conciseness_label = data_train.conciseness_label.tolist()

data_validate_input = data_validate.vector_title.tolist()

In [16]:
clf_SVC_clarity = svm.SVC()
clf_SVC_conciseness = svm.SVC()



# clarity_label ===================
# X_train, X_test, y_train, y_test = train_test_split(X, y_clarity_label, test_size=0.1, random_state=42)
# clf_SVC_clarity.fit(X_train, y_train)
print "Fitting ..."
clf_SVC_clarity.fit(X, y_clarity_label)
# y_pred = clf_SVC_clarity.predict(X_test)
# print 'Accuracy clf_SVC_clarity = ', accuracy_score(y_test, y_pred)
# Validation
print "Predict ..."
validate_result_clarity = clf_SVC_clarity.predict(data_validate_input)
np.savetxt("submit/clarity_valid.predict", validate_result_clarity, "%.4f")

Fitting ...
Predict ...


In [17]:
# conciseness_label ===================
# X_train, X_test, y_train, y_test = train_test_split(X, y_conciseness_label, test_size=0.1, random_state=43)
# clf_SVC.fit(X_train, y_train)
# clf_SVC_clarity.fit(X_train, y_train)

print "Fitting ..."
clf_SVC_conciseness.fit(X, y_conciseness_label)
# y_pred = clf_SVC_conciseness.predict(X_test)
# print 'Accuracy clf_SVC_conciseness = ', accuracy_score(y_test, y_pred)

# Validation
print "Predict ..."
validate_result_conciseness = clf_SVC_conciseness.predict(data_validate_input)

np.savetxt("submit/conciseness_valid.predict", validate_result_conciseness, "%.4f")

Fitting ...
Predict ...


## 2. Support Vector Regression

In [None]:
clf_SVR_clarity = svm.SVR()
clf_SVR_conciseness = svm.SVR()


clf_SVR_clarity.fit(X, y_clarity_label)

SVR_validate_result_conciseness = 

clf_SVR_conciseness.fit(X, y_conciseness_label)

In [None]:
y_pred = clf_SVR.predict(X_test)
accuracy_score(y_test, y_pred.round(), normalize=False)