In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt
import pandas as pd


In [2]:
from bs4 import BeautifulSoup

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [4]:
from scipy.stats import randint as sp_randint
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

In [5]:
# import logging

# set up logging to file - see previous section for more details
# logging.basicConfig(level=logging.DEBUG,
#                     format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
#                     datefmt='%m-%d %H:%M',
#                     filename='./myapp.log',
#                     filemode='w')
# define a Handler which writes INFO messages or higher to the sys.stderr
# console = logging.StreamHandler()
# console.setLevel(logging.INFO)
# set a format which is simpler for console use
# formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
# tell the handler to use this format
# console.setFormatter(formatter)
# add the handler to the root logger
# logging.getLogger('').addHandler(console)

In [6]:
# Suppress all warnings for the rest of the notebook.
import warnings

warnings.filterwarnings('ignore')

In [7]:
# logging.info("read cvs train.cvs")
df_train_raw = pd.read_csv('data/train.csv', sep='\t')

In [8]:
df_train_raw.head()

Unnamed: 0,id,name,description,target
0,0,Заведующий отделом/секцией в магазин YORK (Уру...,<p><strong>В НОВЫЙ МАГАЗИН YORK (хозтовары) пр...,1
1,1,Наладчик станков и манипуляторов с ПУ,Обязанности:работа на токарных станках с ЧПУ T...,0
2,2,Разработчик С++ (Криптограф),<strong>Требования:</strong> <ul> <li>Опыт про...,0
3,3,Фрезеровщик,<p>Условия:</p> <ul> <li>На работу вахтовым ме...,0
4,4,Мерчендайзер/продавец-консультант,<p><strong>Компания Палладиум Стандарт - призн...,1


In [9]:
# logging.info("train shape is".format(df_train_raw.shape))
df_train_raw.shape

(200000, 4)

In [10]:
# logging.info("read cvs test.cvs")
df_test = pd.read_csv('data/test.csv', sep='\t')

In [11]:
df_test.head()

Unnamed: 0,id,name,description
0,200000,Дизайнер-консультант мебели,<p><strong>Обязанности:</strong></p> <ul> <li>...
1,200001,Продавец-консультант (ТЦ на Пушкина),<p><strong>Обязанности</strong>:</p> <p>∙ конс...
2,200002,Менеджер по продажам,<p>Торговый Дом «Форт» это ведущая компания Пе...
3,200003,Продавец-консультант в магазин одежды (ТЦ Волн...,<p><strong>Требуются продавцы консультанты в м...
4,200004,Специалист по охране труда,<strong>Обязанности:</strong> <ul> <li> <p>осу...


In [12]:
# logging.info("shape is {}".format(df_test.shape))
df_test.shape

(170179, 3)

In [13]:
# logging.info("read cvs other.cvs")
df_other = pd.read_csv('data/other.csv', sep='\t')

In [14]:
df_other.head()

Unnamed: 0,name,description
0,Специалист научно-производственного отдела,"<p>Образование - Среднее специальное, высшее</..."
1,Оператор по отгрузке товара в 1С (ТЗ),<p><strong>Обязанности:</strong></p> <ul> <li>...
2,Менеджер по персоналу,<strong>Обязанности:</strong> <ul> <li>Подбор ...
3,Ведущий бухгалтер по учёту заработной платы,<strong>Обязанности:</strong> <ul> <li> <p>Нач...
4,Инженер-расчетчик в строительный отдел,<p><strong>Требования:</strong></p> <ul> <li>у...


In [15]:
# logging.info("shape other.cvs is {}".format(df_other.shape))
df_other.shape

(594534, 2)

In [16]:
# Stop words: one word per line. The list is handed to the tokenizer further down.
# The file is opened in a `with` block so the handle is closed deterministically,
# and decoded explicitly as UTF-8 so the Cyrillic words do not depend on the
# platform's default encoding.
with open("./data/stopwords-ru.txt", encoding="utf-8") as stop_words_file:
    stop_words = [line.rstrip('\n') for line in stop_words_file]

In [17]:
import re

def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag removed.

    Tags are matched non-greedily and only within a single line (``.`` does
    not match a newline). Text between tags, including character entities
    such as ``&quot;``, is returned unchanged.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
    

In [18]:
def prepare_data(dataframe, drop_columns=None):
    """Build a cleaned ``text`` column from ``name`` and ``description``.

    The input frame is not modified; a copy is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the string columns ``name`` and ``description``.
    drop_columns : list of str, optional
        Columns to remove from the result. When omitted, nothing is dropped
        (previously the default ``None`` was passed straight to ``drop`` and
        raised).

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with ``description`` stripped of HTML tags,
        ``&quot;`` entities and zero-width spaces, plus a ``text`` column
        holding ``name + ' ' + description``.
    """
    df = dataframe.copy()

    # Missing descriptions are treated as empty text: cleanhtml() only accepts
    # strings and would fail on a float NaN.
    description = df['description'].fillna('').apply(cleanhtml)
    # Replace the &quot; entity with a literal double quote.
    description = description.str.replace('&quot;', '"')
    # Remove zero-width spaces.
    description = description.str.replace('\u200b', '')
    df['description'] = description

    # Join name and description into one text field; a missing name
    # contributes an empty string instead of turning the whole text into NaN.
    df['text'] = df['name'].fillna('') + ' ' + df['description']

    if drop_columns is not None:
        df = df.drop(drop_columns, axis=1)

    return df
    

In [19]:
import re
def f_tokenizer(text, morph, stop_words=None):
    """Split ``text`` into normalised Cyrillic word forms.

    Only runs of Cyrillic letters (optionally hyphen-joined, e.g.
    "продавец-консультант") are considered; everything else is ignored.

    Parameters
    ----------
    text : str
        Text to tokenise.
    morph : object
        Analyzer exposing ``parse(word)`` that returns a sequence of parses,
        each with ``tag.POS`` and ``normal_form`` attributes.
    stop_words : iterable of str, optional
        Words to skip. Matching is done on the raw word as it appears in the
        text (before normalisation). ``None`` means no stop words.

    Returns
    -------
    list of str
        ``normal_form`` of the first parse of every kept word, in text order.
        Words whose first parse is a numeral, preposition, conjunction,
        particle or interjection are dropped.
    """
    if stop_words is None:
        # The declared default used to raise TypeError on the `in` check.
        stop_words = ()
    elif not isinstance(stop_words, (set, frozenset)):
        # One hash-set build per call instead of a linear scan per word.
        stop_words = set(stop_words)

    skipped_pos = ('NUMR', 'PREP', 'CONJ', 'PRCL', 'INTJ')
    lemmas = []
    for match in re.finditer("[А-ЯЁа-яё]+(?:-[А-ЯЁа-яё]+)*", text):
        word = match.group(0)
        if word in stop_words:
            continue
        parses = morph.parse(word)
        if len(parses) == 0:
            continue
        first_parse = parses[0]
        if first_parse.tag.POS not in skipped_pos:
            lemmas.append(first_parse.normal_form)
    return lemmas

In [20]:
N = df_train_raw.shape[0]

In [21]:
%%time 
df_train = prepare_data(df_train_raw[:N], drop_columns=["name", "description", "id"])

CPU times: user 10.7 s, sys: 814 ms, total: 11.5 s
Wall time: 11.5 s


In [22]:
df_train.head()

Unnamed: 0,target,text
0,1,Заведующий отделом/секцией в магазин YORK (Уру...
1,0,Наладчик станков и манипуляторов с ПУ Обязанно...
2,0,Разработчик С++ (Криптограф) Требования: Опыт...
3,0,Фрезеровщик Условия: На работу вахтовым метод...
4,1,Мерчендайзер/продавец-консультант Компания Пал...


In [23]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
target    200000 non-null int64
text      200000 non-null object
dtypes: int64(1), object(1)
memory usage: 3.1+ MB


In [24]:
# Clean the whole test set. It was previously sliced with N, which is the
# *training* row count; that only worked because the test set is shorter, and
# the ids taken from df_test further down cover every test row anyway.
df_tmp_test = prepare_data(df_test, drop_columns=["name", "description", "id"])

In [25]:
df_tmp_test.head()

Unnamed: 0,text
0,Дизайнер-консультант мебели Обязанности: Рабо...
1,Продавец-консультант (ТЦ на Пушкина) Обязаннос...
2,Менеджер по продажам Торговый Дом «Форт» это в...
3,Продавец-консультант в магазин одежды (ТЦ Волн...
4,Специалист по охране труда Обязанности: осущ...


In [26]:
df_tmp_other = prepare_data(df_other[:], drop_columns=["name", "description"])

In [27]:
df_tmp_other.shape

(594534, 1)

In [28]:
df_id = df_test[:].drop(['name', 'description'], axis=1)

In [29]:
df_id.head()

Unnamed: 0,id
0,200000
1,200001
2,200002
3,200003
4,200004


In [30]:
# Import word2vec.
# logging.info(">>> word2vec point")
from gensim.models import word2vec

In [31]:
# TODO: maybe add more corpora besides `other`.
# logging.info("concat data")
# data = pd.concat([df_tmp_other['text'], df_tmp_test['text'], df_train['text']], axis=0, ignore_index=True)
# data.head()

In [32]:
# logging.info("data shape is {}".format( data.shape))
# data.shape

In [33]:
from multiprocessing import Pool
from functools import partial
from tqdm import tqdm
import pymorphy2 # Морфологический анализатор.
morph = pymorphy2.MorphAnalyzer()

In [34]:
def wrapMyFunc(idx, arg, morph, stop_words=None):
    """Tokenise ``arg`` with ``f_tokenizer`` and tag the result with ``idx``.

    Returns the pair ``(idx, tokens)`` so that a caller collecting results
    out of order can put the tokens back at the right position.
    """
    tokens = f_tokenizer(arg, morph, stop_words)
    return idx, tokens
    
def tokernizer(data, morph, stop_words=None, N=10, proc=4):
    """Tokenise the first ``N`` documents of ``data`` in a process pool.

    Parameters
    ----------
    data : sliceable sequence of str
        Documents to tokenise; only ``data[:N]`` is processed.
    morph : object
        Morphological analyzer forwarded to ``wrapMyFunc`` / ``f_tokenizer``.
        It is sent to the workers as a task argument.
    stop_words : iterable of str, optional
        Stop words forwarded to the tokenizer.
    N : int
        Number of documents to process and length of the returned list.
    proc : int
        Number of worker processes.

    Returns
    -------
    list
        List of length ``N``. Slot ``i`` holds the token list of document
        ``i``; it stays ``None`` when the document produced no tokens, when
        its task failed, or when ``data`` has fewer than ``N`` items.
    """
    items = data[:N]
    sentences = [None] * N  # result list of correct size
    failures = []

    def update(ans):
        # ans is the (idx, tokens) pair returned by wrapMyFunc.
        if len(ans[1]) != 0:
            sentences[ans[0]] = ans[1]
        pbar.update()

    def on_error(exc):
        # Without an error callback a failed task is dropped silently and the
        # progress bar never reaches its total.
        failures.append(exc)
        pbar.update()

    # Size the bar by what is actually submitted so it completes even when
    # data is shorter than N.
    pbar = tqdm(total=len(items))
    pool = Pool(proc)
    try:
        for idx, item in enumerate(items):
            pool.apply_async(
                wrapMyFunc,
                args=(idx, item, morph, stop_words),
                callback=update,
                error_callback=on_error,
            )
        pool.close()
        pool.join()
    finally:
        # No-op after a clean close()/join(); on an interruption it makes sure
        # the worker processes do not outlive the call.
        pool.terminate()
        pbar.close()

    if failures:
        # Best effort is kept: report the failures but still return the rest.
        print("tokernizer: {} of {} documents failed; first error: {!r}".format(
            len(failures), len(items), failures[0]))

    return sentences


In [2]:
import pickle

def save_obj(obj, file_name):
    """Pickle ``obj`` into ``file_name``, overwriting any existing file."""
    with open(file_name, 'wb') as sink:
        pickle.Pickler(sink).dump(obj)

def load_obj(file_name):
    """Read and return the object pickled in ``file_name``.

    Note: unpickling can execute arbitrary code, so only load files that
    this notebook (or another trusted source) produced.
    """
    with open(file_name, 'rb') as source:
        return pickle.Unpickler(source).load()

In [36]:
words_other = tokernizer(data=df_tmp_other['text'], morph=morph, stop_words=stop_words,  N=df_tmp_other['text'].shape[0], proc=8)

100%|██████████| 594534/594534 [3:29:15<00:00, 47.35it/s]  


In [None]:
save_obj(words_other, file_name="./data/words_other" )

In [42]:
words_tests = tokernizer(data=df_tmp_test['text'], morph=morph, stop_words=stop_words,  N=df_tmp_test['text'].shape[0], proc=8)

100%|██████████| 170179/170179 [1:01:44<00:00, 32.30it/s]


In [None]:
save_obj(words_tests, file_name="./data/words_tests" )

In [36]:
words_train = tokernizer(data=df_train['text'], morph=morph, stop_words=stop_words,  N=df_train['text'].shape[0], proc=8)

100%|██████████| 200000/200000 [1:12:21<00:00, 46.07it/s]


In [37]:
save_obj(words_train, file_name="./data/words_train" )

In [1]:
from operator import is_not
from functools import partial

def _load_tokens(path):
    """Load a pickled list of token lists, replacing ``None`` entries with ["NONE"]."""
    return [doc if doc is not None else ["NONE"] for doc in load_obj(path)]


# Documents left as None by the tokenizer get a ["NONE"] placeholder so every
# corpus entry is a list.
words_train = _load_tokens("./data/words_train")
words_tests = _load_tokens("./data/words_tests")
words_other = _load_tokens("./data/words_other")

NameError: name 'load_obj' is not defined

In [4]:
# Training corpus for the embedding model: test, other and train documents,
# concatenated in that order.
sentences = [*words_tests, *words_other, *words_train]

In [12]:
len(words_train)

TypeError: object of type 'filter' has no len()

In [5]:
# Sanity check: record the position of every corpus entry that is not exactly
# a `list` (for example a None left over from tokenization).
idx = 0
vec_nune = []  # positions of the non-list entries found in `sentences`
for sentence in sentences:
    if type(sentence) is not list:
        vec_nune.append(idx)
    idx += 1

In [6]:
len(vec_nune)

0

In [8]:
# Build the vocabulary: every word is mapped to its embedding vector.
# logging.info("create model'")
# logging.info(">>> word2vec point")
from gensim.models import word2vec
# Train on the combined corpus of token lists.
# NOTE(review): `size=` and `init_sims` look like the gensim < 4.0 API (later
# releases use `vector_size`) — confirm the gensim version this notebook is pinned to.
model = word2vec.Word2Vec(sentences, size=300, window=10, workers=8, sample=1e-3)
# NOTE(review): presumably replace=True normalises the vectors in place and
# discards the raw ones, so the model can no longer be trained further — verify
# against the gensim docs.
model.init_sims(replace=True)

In [9]:
# logging.info("save model as 'vec_model'")
model.save('vec_model_without_morph')

In [10]:
!ls -l 'vec_model_without_morph'

-rw-r--r-- 1 jovyan users 5129852 Mar 15 23:31 vec_model_without_morph
