In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('new_transformed_data.csv', usecols = ['lemmatized_text', 'category_id'])

In [3]:
data.head()

Unnamed: 0,category_id,lemmatized_text
0,19,картина гобелен размер х сантиметр
1,22,стул прессовать кожа продать недорого стул све...
2,37,домашний минь баня минь баня мб минь сауна пре...
3,43,эксклюзивный коллекция книга трансаэро подарок...
4,1,ноутбук aser продаваться ноутбук acer e c ta к...


In [4]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import textblob, string


Using TensorFlow backend.


In [5]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(data['lemmatized_text'], data['category_id'], test_size=0.2)

# 2. Feature Engineering


The next step is the feature engineering step. In this step, raw text data will be transformed into feature vectors and new features will be created using the existing dataset. We will implement the following ideas in order to obtain relevant features from our dataset.

+ 2.1 Count Vectors as features
+ 2.2 TF-IDF Vectors as features

+ + Word level


Lets look at the implementation of these ideas in detail.

## 2.1 Count Vectors as features


In [7]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(data['lemmatized_text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [8]:
xtrain_count

<367137x314380 sparse matrix of type '<class 'numpy.int64'>'
	with 9506395 stored elements in Compressed Sparse Row format>

## 2.2 TF-IDF Vectors as features


In [6]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=9000)
tfidf_vect.fit(data['lemmatized_text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [7]:
xtrain_tfidf

<391613x9000 sparse matrix of type '<class 'numpy.float64'>'
	with 8950148 stored elements in Compressed Sparse Row format>

# 3. Model Building


The final step in the text classification framework is to train a classifier using the features created in the previous step. There are many different choices of machine learning models which can be used to train a final model. We will implement following different classifiers for this purpose:

+ Naive Bayes Classifier
+ Bagging Models
+ SGD Classifier

Lets implement these models and understand their details. The following function is a utility function which can be used to train a model. It accepts the classifier, feature_vector of training data, labels of training data and feature vectors of valid data as inputs. Using these inputs, the model is trained and accuracy score is computed.



In [8]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    %time
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

## 3.1 Naive Bayes


In [9]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 17.9 µs
NB, WordLevel TF-IDF:  0.8465844092171924



NB, WordLevel TF-IDF:  0.8264830854714823


NB, Count Vectors:  0.8550825298251348

## 3.2 Bagging Model


In [33]:
# RF on Word Level TF IDF Vectors with maxfeatures 9000
%time
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=25), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)

#было 0.8033583918941003

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10.5 µs
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.58 µs
RF, WordLevel TF-IDF:  0.8302214414119954


## 3.3 SGD Model


In [31]:
sgd_cls = linear_model.SGDClassifier(alpha=0.000001, random_state=0, class_weight='balanced', penalty='l2', loss='log', n_jobs=-1)

In [32]:
accuracy = train_model(sgd_cls, xtrain_tfidf, train_y, xvalid_tfidf)
print("sgd, WordLevel TF-IDF: ", accuracy)

CPU times: user 13 µs, sys: 1e+03 ns, total: 14 µs
Wall time: 26.2 µs
sgd, WordLevel TF-IDF:  0.8707509396960288


# 4 final

In [34]:
data = pd.read_csv('test.csv')
data.head()

Unnamed: 0,item_id,title,description,price
0,489517,Стоик журнальный сталь,продам журнальный столик изготавливаю столы из...,10000.0
1,489518,iPhone 5 64Gb,"Телефон в хорошем состоянии. Комплект, гаранти...",12500.0
2,489519,Утеплитель,ТЕПЛОПЕЛЕН-ЛИДЕР ТЕПЛА!!! Толщина утеплителя :...,250.0
3,489520,Пальто демисезонное,Продам пальто женское (букле) в отличном состо...,1700.0
4,489521,Samsung syncmaster T200N,"Условно рабочий, проблема в панели настройки м...",1000.0


In [35]:
data.shape

(243166, 4)

In [36]:
data.insert(1 , 'text', data.apply(lambda x: x['title'] + ' ' + x['description'], axis=1))
data.drop(['title', 'description'], axis=1, inplace=True)

data.head()

Unnamed: 0,item_id,text,price
0,489517,Стоик журнальный сталь продам журнальный столи...,10000.0
1,489518,iPhone 5 64Gb Телефон в хорошем состоянии. Ком...,12500.0
2,489519,Утеплитель ТЕПЛОПЕЛЕН-ЛИДЕР ТЕПЛА!!! Толщина у...,250.0
3,489520,Пальто демисезонное Продам пальто женское (бук...,1700.0
4,489521,"Samsung syncmaster T200N Условно рабочий, проб...",1000.0


In [37]:
import re

#Let's use 'GOOD OLD'regular expressions for cleanup and standard func '.lower()' to make words similar
def del_symb(text):
    #return re.sub(r"[^a-zA-Zа-яёА-Я0-9]", " ", text.lower())
    return re.sub(r"[^a-zA-Zа-яёА-Я]", " ", text.lower())

In [38]:
data['text'] = data['text'].apply(del_symb)

data.head()

Unnamed: 0,item_id,text,price
0,489517,стоик журнальный сталь продам журнальный столи...,10000.0
1,489518,iphone gb телефон в хорошем состоянии ком...,12500.0
2,489519,утеплитель теплопелен лидер тепла толщина у...,250.0
3,489520,пальто демисезонное продам пальто женское бук...,1700.0
4,489521,samsung syncmaster t n условно рабочий проб...,1000.0


In [39]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import Stemmer as stm


In [40]:
def process(input_text):
    language = 'russian'
    #stemmer = stm.Stemmer(language)
    
    # List of stop words 
    stop_words = stopwords.words(language)
    # Remove the stop words 
    input_text = [x for x in input_text.split() if not x in stop_words]  
    # Stemming on the words 
    #input_text = [stemmer.stemWord(x) for x in input_text]
    
    return ' '.join(input_text)

In [41]:
data['text'] = data['text'].apply(process)

In [42]:
data.head()

Unnamed: 0,item_id,text,price
0,489517,стоик журнальный сталь продам журнальный столи...,10000.0
1,489518,iphone gb телефон хорошем состоянии комплект г...,12500.0
2,489519,утеплитель теплопелен лидер тепла толщина утеп...,250.0
3,489520,пальто демисезонное продам пальто женское букл...,1700.0
4,489521,samsung syncmaster t n условно рабочий проблем...,1000.0


In [44]:
import pymorphy2
from pymorphy2 import MorphAnalyzer


In [45]:
def to_lemmatize1(df):
    lemmatizer = MorphAnalyzer()
    lemm_func = lambda text: ' '.join([lemmatizer.normal_forms(word)[0] for word in text.split()])
    df['lemmatized_text'] = df['text'].apply(lemm_func)
    
    return df

In [None]:
%%time

data = to_lemmatize1(data)

In [None]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=9000)
tfidf_vect.fit(data['lemmatized_text'])
xtest_tfidf =  tfidf_vect.transform(data['lemmatized_text'])

In [None]:
test_pred = sgd_cls.predict(xtest_tfidf)

In [None]:
test_pred = pd.Series(test_pred)