In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re
import csv
import reverse_geocoder as rg

from sklearn.metrics import mean_absolute_error, mean_squared_log_error 

import nltk
import pymorphy2
from pymorphy2.tagset import OpencorporaTag
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestRegressor
# from sklearn.tree import ExtraTreeRegressor
# from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Single MorphAnalyzer instance reused by get_parse_result.
ma = pymorphy2.MorphAnalyzer()
# Memo filled by get_parse_result: lower-cased word -> normal form (or None).
parse_results = {}
# Tag built from 'LATN'; get_parse_result compares parse tags against it.
latin = OpencorporaTag('LATN')

def get_parse_result(word):
    """Return the normal form of ``word``, or ``None`` if no parse qualifies.

    The word is lower-cased and looked up in the module-level
    ``parse_results`` memo first. On a miss, the parses returned by
    ``ma.parse`` are scanned in order; the first one whose part of speech is
    ADJF, NOUN or VERB, or whose tag equals ``latin``, supplies the normal
    form. The outcome (``None`` included) is stored in the memo before it is
    returned, so each distinct word is parsed at most once.
    """
    key = word.lower()
    if key in parse_results:
        return parse_results[key]

    normal_form = None
    for candidate in ma.parse(key):
        tag = candidate.tag
        if tag.POS in ['ADJF', 'NOUN', 'VERB'] or tag == latin:
            normal_form = candidate.normal_form
            break

    parse_results[key] = normal_form
    return normal_form


# extract the words
def getMeaningfullWords(text):
    """Tokenize ``text`` into the normal forms accepted by ``get_parse_result``.

    Everything from an em dash to the end of its line is removed first. The
    remaining text is split into runs of at least three Cyrillic or Latin
    letters, each run is passed to ``get_parse_result``, and the results that
    are not ``None`` are returned in their original order.
    """
    before_dash = re.sub('—.*', '', text)
    lemmas = (
        get_parse_result(token)
        for token in re.findall('[А-ЯЁа-яёA-Za-z]{3,}', before_dash)
    )
    return [lemma for lemma in lemmas if lemma is not None]

# load the stop words:
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')    
# TF-IDF vectorizer shared by df_to_cv; texts are tokenized with getMeaningfullWords.
# NOTE(review): getMeaningfullWords emits normal forms while stop_words is the raw
# NLTK list — confirm the stop words still match the tokens after normalisation.
count_vect = TfidfVectorizer(
    tokenizer=getMeaningfullWords,
    stop_words=stop_words,
    smooth_idf=True,
    sublinear_tf=True,
    # float min_df / max_df are document-frequency proportions (see scikit-learn docs)
    min_df=0.024,
    max_df=0.8,
    norm='l2')

def df_to_cv(df, test):
    """Replace the ``name`` text column with its TF-IDF feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column holding the text to vectorize. Rows are
        matched to their TF-IDF values by index label, so the index should be
        unique.
    test : bool
        ``False`` fits the module-level ``count_vect`` on ``df['name']`` and
        transforms it; ``True`` only transforms with the already fitted
        vectorizer.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``name``, followed by one column per vectorizer feature.
    """
    # fit_transform for the training set, transform for the test set:
    if not test:
        tfidf = count_vect.fit_transform(df['name'])
    else:
        tfidf = count_vect.transform(df['name'])

    # get_feature_names() was removed from newer scikit-learn releases; prefer
    # the replacement and fall back to the old name on older versions.
    feature_names = (getattr(count_vect, 'get_feature_names_out', None)
                     or count_vect.get_feature_names)

    # The TF-IDF frame must carry df's own index: with a default 0..n-1 index
    # the join below would pair texts with the wrong rows (or produce NaN)
    # whenever df's index is not exactly 0..n-1, e.g. after rows were filtered.
    tfidf_frame = pd.DataFrame(tfidf.toarray(),
                               columns=feature_names(),
                               index=df.index)

    return df.join(tfidf_frame, how='left').drop('name', axis=1)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def preproc(data_frame, test):
    """Build the feature frame consumed by ``df_to_cv`` from raw listing records.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Raw listings (anything ``pd.DataFrame`` accepts). Columns read here:
        ``id``, ``payment_available``, ``subcategory``, ``category``,
        ``location`` (mapping with ``latitude`` / ``longitude``),
        ``description``, ``name``, ``subway``, ``images``, ``fields``, the
        flag columns dropped at the end, and ``price`` when ``test`` is false.
        The caller's object is not modified.
    test : bool
        ``False`` for training data: ``price`` is divided by 100, truncated to
        int, and rows with a negative result are dropped. ``True`` skips the
        price steps.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns plus ``subcity``, ``lat``, ``lon``,
        ``city`` and ``free``. ``name`` holds the combined text (name twice,
        description, subway), ``description`` holds the description length,
        and missing values are filled with 0.
    """
    df = pd.DataFrame(data_frame)

    # convert the price to roubles for the training set:
    if not test:
        # assign() returns a new frame; a plain column assignment here could
        # write through to the caller's frame, which shares data with `df`.
        df = df.assign(price=df['price'].map(lambda x: int(x / 100)))
        df = df[df['price'] >= 0]

    # drop the identifiers:
    df = df.drop('id', axis=1)

    # replace True / False with 1 / 0:
    df['payment_available'] = df['payment_available'].map({False: 0, True: 1})

    # subcategories that occur only once or twice are merged into the closest ones:
    df['subcategory'] = df['subcategory'].replace({
        10002: 10001,
        815: 905,
        1806: 1305
    })

    # the category that occurs only once is merged as well:
    df['category'] = df['category'].replace({18: 13})

    # turn the coordinates into city / district features:
    df = _add_geo_features(df)

    # lower-case and strip the description, collapsing double spaces:
    df['description'] = df['description'].map(
        lambda x: x.lower().strip().replace("  ", " "))

    # unify alternative spellings of the same words in the name:
    df['name'] = df['name'].map(
        lambda x: x.lower().strip().replace('айфон', 'iphone').replace('самсунг', 'samsung'))

    # flag everything that is offered for free in a separate column:
    df['free'] = _free_flag(df['name'], df['description'])

    # text for TF-IDF: the name twice (to give it more weight), then the
    # description and the subway station. Vectorized concatenation produces
    # the same string as the former row-wise ' '.join calls: "n n  desc subway".
    df['subway'] = df['subway'].fillna('').astype(str)
    df['name'] = (df['name'] + ' ' + df['name'] + '  '
                  + df['description'] + ' ' + df['subway'])

    # replace the description with its length:
    df['description'] = df['description'].map(len)

    # number of images and number of field dictionaries
    # (both columns are currently removed again by the drop below):
    df['images'] = df['images'].map(len)
    df['fields'] = df['fields'].map(len)

    # drop the uninformative columns:
    df = df.drop(['can_buy', 'can_promote', 'contacts_visible', 'mortgage_available', 'delivery_available',
                  'fields', 'category', 'location', 'images', 'subway'], axis=1)

    return df.fillna(0)


def _add_geo_features(df):
    """Append reverse-geocoded ``subcity``, ``lat``, ``lon``, ``city`` columns.

    ``city`` and ``subcity`` are then grouped (rare values merged) and
    replaced by their occurrence counts within ``df``.
    """
    points = [(loc['latitude'], loc['longitude']) for loc in df['location']]

    # Index the geocoder output like `df`. A default 0..n-1 index would attach
    # results to the wrong rows (or leave NaN) whenever rows were filtered out
    # before this point, because join() matches on index labels.
    geo = pd.DataFrame(rg.search(points), index=df.index).rename(
        columns={'admin1': 'city', 'name': 'subcity'})
    # lat / lon are kept exactly as rg.search returns them
    # (presumably strings — TODO confirm before using them numerically).
    df = df.join(geo[['subcity', 'lat', 'lon', 'city']])

    # rarely occurring cities and districts are merged into separate groups:
    city_counts = df['city'].map(df['city'].value_counts())
    df['city'] = df['city'].mask(city_counts < 110, 'other_city')
    subcity_counts = df['subcity'].map(df['subcity'].value_counts())
    df['subcity'] = df['subcity'].mask(subcity_counts <= 10, 'other_subcity')

    # encode cities and districts by their frequencies.
    # NOTE(review): the counts (and the 110 / 10 thresholds above) are absolute
    # and computed on the frame being processed, so the same place gets
    # different codes in frames of different size — confirm this is intended.
    df['subcity'] = df['subcity'].map(df['subcity'].value_counts())
    df['city'] = df['city'].map(df['city'].value_counts())
    return df


def _contains_any(series, fragments):
    """Return a boolean Series: True where the text contains any fragment."""
    pattern = '|'.join(re.escape(fragment) for fragment in fragments)
    return series.str.contains(pattern, na=False)


def _free_flag(name, description):
    """Return a 0/1 array: 1 where a listing is offered for free.

    A listing is flagged when its name or its description mentions
    "даром" / "бесплатно" without a delivery word in that same field, and
    neither field contains one of the exclusion phrases listed below.
    """
    description_blockers = (
        "почти за", "почти даром", "почти бесплатно", "даром за", "прода",
        "бесплатно привезу", "недорого", "практически", "нуждающим",
        "можно сказать", "цена", "считай", "при заказе", "при покупке",
        " символ", "чехол", "чехлы", "монтаж", "установк", "дешевле только",
    )
    name_blockers = ("прода", "почти", "считай")
    free_words = ("даром", "бесплатно")
    delivery_words = ("достав", "привез")

    no_blockers = (~_contains_any(description, description_blockers)
                   & ~_contains_any(name, name_blockers))
    free_in_name = (_contains_any(name, free_words)
                    & ~_contains_any(name, delivery_words))
    free_in_description = (_contains_any(description, free_words)
                           & ~_contains_any(description, delivery_words))

    return np.where(no_blockers & (free_in_name | free_in_description), 1, 0)

# Работа с train_sample, тестирование

In [4]:
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
# The with-block closes the file handle, which the bare open() call never did.
with open('data2/train_sample.pckl', 'rb') as train_file:
    train_sample = pickle.load(train_file).reset_index(drop=True)

In [6]:
# preview the first rows of the loaded training sample
train_sample.head()

Unnamed: 0,can_buy,can_promote,category,contacts_visible,date_created,delivery_available,description,fields,id,images,location,mortgage_available,name,payment_available,price,subcategory,subway
0,False,False,9,True,1492780671,False,,"[{'order': 1, 'field': {'type': 'category', 'i...",3edeb34cf93f490ff760af85,"[{'height': 3024, 'id': '58fa06746c86cb4f22313...","{'latitude': 55.806888, 'longitude': 37.546077}",False,Сумка DG,True,199900,914,
1,False,False,22,True,1476824319,False,8-12 лет,"[{'order': 1, 'field': {'type': 'category', 'i...",c98febd50dad3cc0ffc86085,"[{'id': '58068ccc04559f59bdbda92d', 'num': 1, ...","{'latitude': 55.692979, 'longitude': 37.872337}",False,Комплект,False,35000,2202,
2,False,False,22,True,1473004313,False,"На девочку 1,5 г,состояние хорошее","[{'order': 1, 'field': {'type': 'category', 'i...",ade01e13912a46a99134cc75,"[{'id': '57cc42ecd53f3dcf17dc01c8', 'num': 1, ...","{'latitude': 55.639011, 'longitude': 37.349378}",False,Пальтишко демисезонное,False,30000,2204,
3,False,False,22,True,1476307221,False,"Размер-135mm, euro-22.5.Прочная, мягкая, не ск...","[{'order': 1, 'field': {'type': 'category', 'i...",ab3e6941c11304c1519aef75,"[{'id': '580546528ae74be97723532e', 'num': 1, ...","{'latitude': 55.847334, 'longitude': 37.495834}",False,Attipas,True,80000,2209,
4,False,False,22,True,1503487787,False,,"[{'order': 1, 'field': {'type': 'category', 'i...",252452a91c944a22c276d995,"[{'height': 720, 'id': '599d66f3f235022f7411a5...","{'latitude': 60.044826, 'longitude': 30.35546}",False,Жилет теплый,True,50000,2204,


In [5]:
# clean the training listings (test=False keeps and converts the price),
# then fit the TF-IDF vectorizer on them and append its columns
train_sample = (
    train_sample
    .pipe(preproc, test=False)
    .pipe(df_to_cv, test=False)
)

Loading formatted geocoded file...


In [6]:
# (rows, columns) of the processed training frame
train_sample.shape

(100000, 48)

In [7]:
# preview the processed training frame
train_sample.head()

Unnamed: 0,date_created,description,payment_available,price,subcategory,subcity,lat,lon,city,free,...,состояние,телефон,торг,тёплый,удобный,фото,хороший,цвета,цена,чехол
0,1492780671,0,1,1999,914,253,55.8,37.51667,13377,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1476824319,8,0,350,2202,260,55.7,37.85,13377,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1473004313,34,0,300,2204,176,55.64528,37.33583,13377,0,...,0.390589,0.0,0.0,0.0,0.0,0.0,0.525283,0.0,0.0,0.0
3,1476307221,493,1,800,2209,64,55.85381,37.49604,13377,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1503487787,0,1,500,2204,260,60.06964,30.3487,2105,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# train on the part of the data that has a price:
part_df = train_sample.loc[train_sample['price'] > 0]
y_part = part_df['price'].values
X_part = part_df.drop('price', axis=1).values

# the metrics when training on all of the data are worth a look as well:
y = train_sample['price'].values
X = train_sample.drop('price', axis=1).values

# clear the variables that are no longer used, to save memory:
train_sample = part_df = None

In [27]:
# log-transform of the price, used for CatBoost (currently disabled)
# y_part = np.log1p(y_part)
# y = np.log1p(y)

# RandomForestRegressor

In [9]:
# Base forest. The n_estimators given here is replaced by the value drawn from
# param_grid when the search fits its candidates.
rfr = RandomForestRegressor(n_jobs=8, random_state=282, n_estimators=50)
# Search space: range(100, 101) contains the single value 100; the commented
# keys are candidates that are currently switched off.
param_grid = {
# 'min_samples_split': [2, 4],
# 'min_samples_leaf': [1, 2],
    'n_estimators': range(100,101)
}
# NOTE(review): StratifiedKFold stratifies on the target, which here is the
# price (a regression target) — confirm this is intended rather than KFold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=282)
# One sampled configuration (n_iter=1), scored with negative mean squared log error.
rs = RandomizedSearchCV(rfr, param_distributions=param_grid, n_iter=1, n_jobs=-1,
                        cv=cv, scoring='neg_mean_squared_log_error', random_state=282)

In [14]:
# run the cross-validated search on X_part / y_part
rs.fit(X_part, y_part)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=282, shuffle=True),
          error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=8,
           oob_score=False, random_state=282, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=1, n_jobs=-1,
          param_distributions={'n_estimators': range(100, 101)},
          pre_dispatch='2*n_jobs', random_state=282, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_log_error',
          verbose=0)

In [15]:
# parameters of the best candidate found by the search
rs.best_params_

{'n_estimators': 100}

In [16]:
# the search is scored with 'neg_mean_squared_log_error', so the square root of
# the absolute best score is a root mean squared log error
abs(rs.best_score_)**0.5

1.321485001270554

In [13]:
# keep the estimator exposed by the fitted search; it is used for the predictions below
best_model = rs.best_estimator_

# Читаем test_hack, предсказываем

In [17]:
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
# The with-block closes the file handle, which the bare open() call never did.
with open('data2/test_hack.pckl', 'rb') as test_file:
    test_hack = pickle.load(test_file).reset_index(drop=True)

In [18]:
# presumably the submission template (judging by the file name); rows are indexed by `id`
model_for_file = pd.read_csv('data/submit_Sample.csv', delimiter=',', encoding='utf8', index_col='id')

In [19]:
# preview the first rows read from the CSV
model_for_file.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
285ea2e9935ccdeb8378c6a5,0
adfb73820bbb831257df6e95,0
783025601c36202f633fc6a5,0
2f0cd2d2e15dc90afd847f95,0
5c23a37902855a20172845a5,0


In [20]:
# preprocess test_hack: test=True skips the price steps and reuses the
# TF-IDF vectorizer fitted earlier instead of refitting it
test_hack = (
    test_hack
    .pipe(preproc, test=True)
    .pipe(df_to_cv, test=True)
)

In [21]:
# preview the processed test frame
test_hack.head()

Unnamed: 0,date_created,description,payment_available,subcategory,subcity,lat,lon,city,free,iphone,...,состояние,телефон,торг,тёплый,удобный,фото,хороший,цвета,цена,чехол
0,1517061944,28,1,603,9506,54.74306,55.96779,16248,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1508310389,122,1,203,576,59.73833,30.08944,42337,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1517089590,634,0,116,1090,55.98028,37.135,91690,0,0.0,...,0.416424,0.0,0.0,0.0,0.0,0.461439,0.330761,0.0,0.0,0.0
3,1509378271,0,1,1009,1657,55.80961,37.78739,87647,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1515487857,145,0,1104,2281,59.84167,30.25583,42337,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# (rows, columns) of the processed test frame
test_hack.shape

(749525, 47)

In [23]:
# store the model's predictions in the `price` column of the submission frame.
# NOTE(review): this relies on the rows of test_hack being in the same order as
# the ids in model_for_file; preproc drops `id`, so the order cannot be checked
# at this point — confirm against the raw files.
model_for_file['price'] = best_model.predict(test_hack)

In [24]:
# truncate the predicted prices to whole numbers
model_for_file['price'] = model_for_file['price'].map(int)

In [26]:
# write the predictions to CSV; the `id` index is written as the first column
model_for_file.to_csv('predict_1.33.csv')