In [1]:
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
from sklearn.utils import shuffle

## Helpers

In [2]:
def create_np_array_from_vector(vector):
    """Return the first row of a sparse matrix as a dense 1-D NumPy array.

    Parameters
    ----------
    vector : object exposing ``toarray()``
        Called with the result of ``vectorizer.transform([...])`` in this
        notebook. Only row 0 is used; any further rows are ignored.

    Returns
    -------
    numpy.ndarray
        A new 1-D array holding the values of row 0, one entry per column.
    """
    # Slice the whole row at once instead of copying element by element in a
    # Python loop; np.array() makes the result an independent copy.
    return np.array(vector.toarray()[0])

## Read dataset

In [3]:
# Relative path of the CSV dataset (resolved against the current working directory).
dataset_filename = 'dataset.csv'
# Load the CSV into a DataFrame; later cells read its first two columns
# (assigned to product_names and product_categories).
data = pd.read_csv(dataset_filename)

## Create vectorizer

In [4]:
# Extract the product-name column (position 0) and the category column (position 1).
# `.iloc` selects strictly by position; the `.ix` indexer used previously was
# deprecated and later removed from pandas.
product_names = data.iloc[:, 0]
product_categories = data.iloc[:, 1]

# Create the TF-IDF transform with the built-in English stop-word list.
# NOTE(review): the vocabulary printed by the debug cell is Russian, so the
# English stop-word list probably filters little -- confirm it is intended.
vectorizer = TfidfVectorizer(stop_words='english')

# Tokenize the product names and build the vocabulary.
# Kept as the last expression so the fitted estimator's repr is displayed.
vectorizer.fit(product_names)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
# Debug aid: print a heading line followed by the fitted token -> index mapping.
print('vocabulary is ', vectorizer.vocabulary_, sep='\n')

vocabulary is 
{'бур': 4, 'ультразвуковой': 27, 'набор': 18, 'ударная': 26, 'корд': 15, 'карманный': 10, 'зев': 9, 'бензопила': 3, 'устройство': 29, 'зажим': 8, 'шина': 32, 'уровень': 28, 'сверло': 22, 'конец': 14, 'автомобильная': 1, 'яркий': 33, 'цепь': 31, 'дальномер': 5, 'аккумуляторная': 2, 'лента': 17, 'пильная': 21, 'острый': 19, 'отвертка': 20, 'сменная': 23, 'комплект': 13, 'колесный': 12, 'дрель': 7, 'струбцина': 24, '70ый': 0, 'диск': 6, 'кассета': 11, 'фонарь': 30, 'лазерный': 16, 'тиски': 25}


## Save vectorizer

In [6]:
# File that receives the serialized, fitted vectorizer.
vectorizator_filename = 'vectorizator.sav'
# Serialize the vectorizer to disk with joblib. As the cell's last expression,
# the call's return value is displayed as the cell output.
joblib.dump(vectorizer, vectorizator_filename)

['vectorizator.sav']

In [7]:
# Transform every product name into its TF-IDF feature vector.
# A single batched transform replaces the per-name transform plus the
# element-by-element copy: each row depends only on its own text and the
# already-fitted vocabulary / idf weights, so the values are unchanged.
# `name_vectors` remains a list of 1-D arrays, one per product name.
name_vectors = list(vectorizer.transform(product_names).toarray())

## Shuffle Dataset

In [8]:
# Shuffle features and labels with one shared permutation so each vector stays
# paired with its category. The fixed seed makes the train/test split -- and
# therefore the reported accuracy -- reproducible across kernel restarts.
SHUFFLE_SEED = 42
name_vectors, product_categories = shuffle(
    name_vectors, product_categories, random_state=SHUFFLE_SEED
)

## Split dataset into train and test sets

In [9]:
# 80 % of the rows (rounded down) go to training; the remainder is the test set.
n_samples = len(data)
train_size = int(n_samples * 0.80)
test_size = n_samples - train_size

split_report = 'train size = {0}, test size = {1}'.format(train_size, test_size)
print(split_report)

train size = 608, test size = 153


In [10]:
# The first `train_size` shuffled samples form the training set; everything
# after that position is the test set.
train_X = name_vectors[:train_size]
train_Y = product_categories[:train_size]

test_X = name_vectors[train_size:]
test_Y = product_categories[train_size:]

# Report the shapes of both splits. The recorded cell output already shows this
# report, but the statements producing it were missing from the cell.
print('Train dataset shape')
print(np.shape(train_X))
print(np.shape(train_Y))
print('Test dataset shape')
print(np.shape(test_X))
print(np.shape(test_Y))

Train dataset shape
(608, 34)
(608,)
Test dataset shape
(153, 34)
(153,)


## Create and train SVM model

In [11]:
# Create the SVM classifier with hyperparameters gamma=0.001 and C=100
clf = svm.SVC(gamma=0.001, C=100.)

# Train the model on the training split; as the cell's last expression,
# the fitted estimator's repr is displayed as the cell output
clf.fit(train_X, train_Y)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## Check model

In [12]:
# Predict categories for the held-out test vectors
predicted_Y = clf.predict(test_X)

# Element-wise comparison with the expected labels; the mean of the resulting
# booleans is the fraction of correct predictions
matches = predicted_Y == test_Y
result = np.mean(matches)

accuracy_report = 'Model accuracy = {0}'.format(result)
print(accuracy_report)

Model accuracy = 0.9934640522875817


## Save SVM model

In [14]:
# File that receives the serialized, trained SVM classifier.
svm_filename = 'svm.sav'
# Serialize the classifier to disk with joblib. As the cell's last expression,
# the call's return value is displayed as the cell output.
joblib.dump(clf, svm_filename)

['svm.sav']

## Manual check

In [13]:
# Manual sanity check: a feature vector with only feature 3 set to 1.0 (index 3
# is the token 'бензопила' in the vocabulary printed by the debug cell).
# We expect to get Y = 114862.
manual_test = [0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]

# predict() expects a 2-D input of shape (n_samples, n_features). Wrap the single
# sample in a list so it is passed as one row; a bare 1-D vector is rejected by
# newer scikit-learn releases.
predicted_value = clf.predict([manual_test])
print('Predicted value = {0}'.format(predicted_value))

Predicted value = [114862]


