# Создаем классификатор

In [1]:
import warnings
warnings.filterwarnings('ignore')
from lxml import etree
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import collections
%matplotlib inline

In [2]:
# Загружаем обучающий файл
train_data = pd.read_csv('train.csv', sep = ',')

In [3]:
# Достаем нужные значения 
title = train_data['title'].values
price = train_data['price'].values
categories = train_data['category_id'].values

In [4]:
# Определяем обучающие и тестовые даннные
from sklearn.model_selection import train_test_split
titles_train, titles_test, prices_train, prices_test, id_train, id_test = train_test_split(title, price, categories, test_size=0.33, random_state=42)

In [5]:
#Классификатор на основе текстовой информации
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Создаем модель и преобразовываем исходный набор данных
# Конвертируем набор текстов в матрицу токенов, находящихся в тексте с помощью CountVectorizer()
# Так как наши прогнозы будут дискретны, применим LogisticRegression()
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('algo', LogisticRegression(penalty='l1', C=17))
])

In [6]:
# Обучаем модель
pipeline.fit(titles_train, id_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [7]:
# Классификатор на основе числовой информации
from sklearn.ensemble import RandomForestClassifier

# Создаем модель 
randomforest = RandomForestClassifier(n_estimators = 50, n_jobs=-1)

# Обучаем модель
randomforest.fit(prices_train.reshape(-1,1), id_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Оценка качества модели (Accuracy)

In [8]:
from sklearn.metrics import accuracy_score

# Применяем нашу модель на тестовых данных
predict_price = randomforest.predict_proba(prices_test.reshape(-1,1)) 
predict_title = pipeline.predict_proba(titles_test)
predict_proba = predict_price + 3*predict_title

prediction = [np.argmax(predict_proba[i]) for i in range(0, predict_proba.shape[0])]
print('Test Accuracy:', accuracy_score(prediction, id_test))

Test Accuracy: 0.8351440191654131


# Подсчет Accuracy для каждого уровня иерархии

In [9]:
category_data = pd.read_csv('category.csv', sep = ',')

In [10]:
category_list = [(category_data['name'][i].split('|')[0]) for i in range(0,len(category_data))]

category_predict =  [category_list[prediction[i]] for i in range(len(prediction))]
catategiry_id_test = [category_list[id_test[i]] for i in range(len(id_test))]
print('Test Accuracy:', accuracy_score(category_predict, catategiry_id_test))

Test Accuracy: 0.9318563089246693


In [11]:
category_list2 = [(category_data['name'][i].split('|')[1]) for i in range(0,len(category_data))]

category_predict =  [category_list2[prediction[i]] for i in range(len(prediction))]
catategiry_id_test = [category_list2[id_test[i]] for i in range(len(id_test))]
print('Test Accuracy:', accuracy_score(category_predict, catategiry_id_test))

Test Accuracy: 0.8988739700757084


# Применение классификатора

In [12]:
# Загружаем тестовый файл
test_data = pd.read_csv('test.csv', sep = ',')

In [13]:
# Достаем нужные значения 
title = test_data['title'].values
price = test_data['price'].values
item_id = test_data['item_id'].values

In [14]:
# Применяем нашу модель
price_prediction = randomforest.predict_proba(price.reshape(-1,1))
title_prediction = pipeline.predict_proba(title)
proba_prediction = price_prediction + 3*title_prediction

In [15]:
result = [np.argmax(proba_prediction[i]) for i in range(0, proba_prediction.shape[0])]
dict_result= {'item_id': item_id, 'category_id': result}
df_result = pd.DataFrame.from_dict(dict_result)

In [27]:
df_result.to_csv('result.csv', index = False)