# Vacancy Classification SF02
Практикуем работу с текстами

## Description
Мы владельцы специфического Job-сайта и нам дали большой датасет вакансий. Одни вакансии нам интересны по своей тематике, другие не интересны (target 1 и 0 соответственно). Часть вакансий была размечена людскими ресурсами.

Ваша задача обучить классификатор, который на основе размеченной выборки умеет определять интересные вакансии для нашего сайта.

* Метрика качества __ROC_AUC__.
* ИСПОЛЬЗОВАТЬ ВНЕШНИЕ ДАННЫЕ С JOB-сайтов = __ЗАПРЕЩЕНО__
* ИСПОЛЬЗОВАТЬ другие ВНЕШНИЕ ДАННЫЕ = только с разрешения организатора (смотри Discussion)
* Результат засчитывается только при наличии кода, который этот результат повторяет
* Участие индивидуальное

### Описание данных
__train.csv__ - данные для обучения<br>
__test.csv__ - данные для подготовки сабмита и проверки<br>
 __sampleSubmission.csv__ - пример корректного, но бесполезного сабмита<br>
 __other.csv__ - необязательные данные для доп.статистик и прочих извращений (например обучение word2vec-а)
### Описание полей
__id__ - внутренний идентификатор<br>
__name__ - название вакансии<br>
__description__ - текст вакансии<br>
__target__ - класс заинтересованности

In [1]:
import re # Регулярные выражения.
from bs4 import BeautifulSoup # Превращалка html в текст.
import pymorphy2 # Морфологический анализатор.
import pandas as pd
from collections import Counter # Не считать же частоты самим.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from gensim.models.word2vec import Word2Vec # Собственно модель.
from gensim.models.word2vec import LineSentence # Выравнивание текста по предложениям.
from gensim.models import KeyedVectors # Семантические вектора.
# На самом деле, нам потребуется только последняя.
import numpy as np # Вектора.
import csv, os



In [2]:
# Suffix appended to a lemma for each part-of-speech tag that is kept.
pos_conv = {'ADJF': '_ADJF', 'NOUN': '_NOUN', 'VERB': '_VERB'}
# HTML elements whose text is extracted as a separate "sentence".
# The list previously contained 'td' twice; the duplicate entry is removed.
html_containers = ['p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'caption', 'th', 'title']
cache_keys = ['name', 'description', 'sentences', 'id', 'target']
# Single shared morphological analyzer instance.
morph = pymorphy2.MorphAnalyzer()

In [3]:
# Master switch for reading/writing the on-disk cache files.
Use_Data_Cache = True
# Prefix of every cache file name. Built with os.path.join so that it points
# into the 'cache' directory on every platform; the previous literal
# 'cache\cache.' relied on an invalid '\c' escape and on the Windows separator.
Cache_Path_Prefix = os.path.join('cache', 'cache.')

### Красивый прогресс бар с оценкой времени

In [4]:
from ipywidgets import IntProgress, HTML, VBox
from IPython.display import display
from datetime import timedelta, datetime

def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while updating a notebook progress widget.

    Args:
        sequence: iterable to wrap.
        every: refresh the widget on every ``every``-th item. Required when
            the length of ``sequence`` cannot be determined.
        size: number of items; taken from ``len(sequence)`` when omitted.
        name: label shown in front of the counter.

    Yields:
        The items of ``sequence``, unchanged and in order.
    """
    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        # Refresh on every item for short inputs, otherwise every 0.5%.
        every = 1 if size <= 200 else int(size / 200)

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    started_at = datetime.now()

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            refresh = (index == 1) or (index % every == 0)
            if refresh and is_iterator:
                label.value = '{name}: {index} / ?'.format(name=name, index=index)
            elif refresh:
                progress.value = index
                # Linear extrapolation of the finish time from the pace so far.
                finish_at = started_at + (datetime.now() - started_at) / index * size
                label.value = u'{name}: {index} / {size} red line at {tmto}'.format(
                    name=name,
                    index=index,
                    size=size,
                    tmto=finish_at.strftime('%d %H:%M'),
                )
            yield record
    except BaseException:
        # Any interruption (including closing the generator) marks the bar red.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(name=name, index=str(index or '?'))

### Немного итераторов оберток для чтения текстов
и экономии памяти, и кеширования для ускорения повторов

In [10]:

class Base_Iterator:
    """Shared helpers (indexing, slicing, selection flags) for the iterator wrappers.

    The methods here rely on attributes and methods that this class does not
    define itself: ``__iter__``, ``__next__`` and ``self.selected`` (an iterator
    of flags read by ``is_selected``). The private helpers call
    ``self.__next__(True)``, so they need a ``__next__`` that accepts one
    positional argument.
    """
    
    @classmethod
    def yield_true(self):
        """Yield ``True`` forever."""
        while True:
            yield True
    
    def is_selected(self):
        """Return the next flag from ``self.selected`` (this consumes one flag)."""
        return self.selected.__next__()
    
    def next(self):
        """Alias for ``__next__``."""
        return self.__next__()  
    
    def __go_to(self, index):
        """Restart via ``_iter_lite`` and skip forward until ``index`` flags were truthy."""
        self._iter_lite()
        while index > 0:
            self.__next__(True)
            # Only positions whose flag is truthy count towards the index.
            if self.is_selected(): index -= 1
    
    def __get_single_item(self, index):
        """Return the item found after skipping to ``index``.

        Raises:
            IndexError: if a ``StopIteration`` is raised while seeking or fetching.
        """
        try:
            self.__go_to(index)
            item =  self.__next__()
            self._iter__stop()
            return item
        except StopIteration:
            raise IndexError
    
    def __skip_items(self, count):
        """Advance with ``__next__(True)`` until ``count`` flags were truthy."""
        while count > 0:
            self.__next__(True)
            if self.is_selected(): count -= 1
    
    def __slice_items(self, slicer):
        """Collect the items described by the slice object ``slicer`` into a list.

        Missing ``start`` and ``step`` default to 0 and 1. A missing ``stop`` is
        treated as 0, so the collecting loop does not run for a slice without
        an explicit stop. A ``StopIteration`` ends collection early and the
        items gathered so far are returned.
        """
        
        i = 0 if (slicer.start is None) else slicer.start
        stop = 0 if (slicer.stop is None) else slicer.stop
        step = 1 if (slicer.step is None) else slicer.step
        skip = step-1
        items = []
        
        try:
            self.__go_to(i)
            while i < stop:
                # NOTE(review): when the flag is falsy nothing advances except the
                # flag stream itself - confirm this is the intended behaviour.
                if self.is_selected(): 
                    items.append(self.__next__())
                    i += step
                    if skip > 0: 
                        self.__skip_items(skip)
            
            self._iter__stop()
            
        except StopIteration:
            pass
        
        return items
    
    def __getitem__(self, key):
        """Return one item for an ``int`` key or a list of items for a ``slice`` key.

        Raises:
            IndexError: for any other key type, or when the index cannot be reached.
        """
        if isinstance(key, int):
            return self.__get_single_item(key)
        if isinstance(key, slice):
            return self.__slice_items(key)
        else:
            raise IndexError
    
    def _iter_lite(self):
        """Restart iteration; this default simply calls ``__iter__``."""
        return self.__iter__()
    
    def _iter__stop(self):
        """Hook called when indexed access is finished; a no-op here."""
        pass
    
class Cross_Iterator(Base_Iterator):
    """Advance several iterators in lockstep and map each tuple of items through ``func``.

    ``func`` and ``callback`` may each be a callable or a sequence whose first
    element is the callable and whose remaining elements are extra positional
    arguments. ``callback`` is invoked whenever a ``StopIteration`` is seen.
    """

    def __init__(self, iterators, func, callback=None, filters=None):
        self.iterator_maker = iterators if isinstance(iterators, list) else [iterators]

        if callable(func):
            self.func, self.func_args = func, []
        else:
            self.func, self.func_args = func[0], func[1:]

        if callable(callback) or (callback is None):
            self.callback, self.callback_args = callback, []
        else:
            self.callback, self.callback_args = callback[0], callback[1:]

        self.filters = filters

    def __iter__(self):
        """(Re)create the underlying iterators and the selection flag stream."""
        started = []
        for source in self.iterator_maker:
            # A callable source is a factory: call it to obtain the iterable.
            iterable = source() if callable(source) else source
            started.append(iterable.__iter__())
        self.iterators = started

        if self.filters is not None:
            self.selected = self.filters.__iter__()
        else:
            self.selected = self.yield_true().__iter__()
        return self

    def __next__(self, skip=False):
        """Return the next mapped item that is selected and not ``None``.

        With ``skip=True`` every underlying iterator is advanced once (also in
        skip mode) and ``None`` is returned without calling ``func``.
        """
        try:
            if skip:
                for source in self.iterators:
                    source.__next__(True)
                return None

            while True:
                values = [source.__next__() for source in self.iterators]
                if not self.is_selected():
                    continue
                result = self.func(*values, *self.func_args)
                if result is not None:
                    return result

        except StopIteration:
            if self.callback is not None:
                self.callback(*self.callback_args)
            raise StopIteration()
    
class File_Iterator(Base_Iterator):
    """Iterate over a UTF-8 text file, mapping every record through ``func``.

    Args:
        path: path of the file to read.
        func: callable applied to each record, or a sequence
            ``(callable, *extra_args)``.
        skip_first: when true, the first record (a header) is discarded.
        callback: optional callable (or ``(callable, *args)``) invoked from
            ``_iter__stop``.
        filters: optional iterable of flags; records whose flag is falsy are
            dropped. When omitted every record is selected.

    Fixes compared to the previous version:
        * ``make_iterator`` returned a 1-tuple (stray trailing comma) instead
          of the file object, so the base class could not iterate lines.
        * the file opened only to count lines was never closed.
        * the header skip ran outside the ``try`` block, so an empty file
          raised ``StopIteration`` without closing the reader.
    """

    def __init__(self, path, func, skip_first=False, callback=None, filters=None):
        self.path = path
        if callable(func):
            self.func, self.func_args = func, []
        else:
            self.func, self.func_args = func[0], func[1:]
        self.skip_first = skip_first
        if callable(callback) or (callback is None):
            self.callback, self.callback_args = callback, []
        else:
            self.callback, self.callback_args = callback[0], callback[1:]
        self.filters = filters
        self.reader_file = None

    def __iter__(self):
        """Restart reading from the beginning, wrapped in a progress bar."""
        self._iter__stop()

        self.skip = self.skip_first
        # Count physical lines up front so the progress bar knows its maximum.
        with open(file=self.path, mode='r', encoding='utf-8') as counted_file:
            self.size = sum(1 for _ in counted_file)
        self.reader_file = open(file=self.path, mode='r', encoding='utf-8')

        self.iterator = log_progress(
            self.make_iterator(),
            size=self.size,
            name='read file {}'.format(self.path)
        )

        self.selected = self.filters.__iter__() if (self.filters is not None) else self.yield_true().__iter__()

        return self

    def _iter_lite(self):
        """Restart reading without counting lines or showing a progress bar."""
        self._iter__stop()

        self.skip = self.skip_first
        self.reader_file = open(file=self.path, mode='r', encoding='utf-8')

        self.iterator = self.make_iterator()

        self.selected = self.filters.__iter__() if (self.filters is not None) else self.yield_true().__iter__()

        return self

    def make_iterator(self):
        """Return the record iterator over the open file (subclasses may override)."""
        return self.reader_file

    def __next__(self, skip=False):
        """Return the next selected, non-``None`` mapped record.

        With ``skip=True`` one record is consumed and ``None`` is returned
        without calling ``func``.

        Raises:
            StopIteration: when the file is exhausted; the reader is closed first.
        """
        try:
            if self.skip:
                # Discard the header record exactly once per (re)start.
                self.skip = False
                self.iterator.__next__()

            if skip:
                self.iterator.__next__()
                return None

            while True:
                record = self.iterator.__next__()
                if not self.is_selected():
                    continue
                item = self.func(record, *self.func_args)
                if item is not None:
                    return item

        except StopIteration:
            self._iter__stop()
            raise StopIteration()

    def _iter__stop(self):
        """Close the reader (if any) and invoke the callback (if any).

        NOTE(review): this also runs at the start of ``__iter__`` and
        ``_iter_lite``, so the callback fires on every restart - confirm
        that is intended.
        """
        if self.reader_file is not None:
            self.reader_file.close()
        if self.callback is not None:
            self.callback(*self.callback_args)
    
class CSV_Iterator(File_Iterator):
    """File_Iterator whose records are rows parsed by ``csv.reader`` with a tab delimiter."""

    def make_iterator(self):
        """Return a tab-delimited csv reader over the currently open file."""
        tsv_reader = csv.reader(self.reader_file, delimiter='\t')
        return tsv_reader

class Unpack_Iterators(Base_Iterator):
    """Flatten an iterable of iterables (or of factories returning iterables).

    Each element of ``iterators`` is either an iterable or a callable that
    returns one; the elements are consumed one after another.

    Fixes compared to the previous version:
        * ``__iter__`` now resets the current inner iterator, so restarting
          iteration no longer continues from a stale inner iterator.
        * ``__next__`` accepts the optional ``skip`` argument that the
          ``Base_Iterator`` helpers pass.
        * the ``next`` override that merely duplicated the inherited method
          was dropped (the inherited one is used).
    """

    def __init__(self, iterators):
        self.iterators = iterators
        self.iterator = None

    def __iter__(self):
        """Restart from the first inner iterable."""
        self.selected = self.yield_true().__iter__()
        self.super_iterator = self.iterators.__iter__()
        # Forget any partially consumed inner iterator from a previous pass.
        self.iterator = None
        return self

    def __next__(self, skip=False):
        """Return the next item, moving on to the next inner iterable as needed.

        With ``skip=True`` the item is consumed and ``None`` is returned.

        Raises:
            StopIteration: when the outer iterable is exhausted.
        """
        if self.iterator is None:
            self._unpack_iterator(self.super_iterator.__next__())

        while True:
            try:
                item = self.iterator.__next__()
            except StopIteration:
                # Current inner iterable is exhausted: advance to the next one.
                self._unpack_iterator(self.super_iterator.__next__())
            else:
                return None if skip else item

    def _unpack_iterator(self, iterator):
        """Start iterating ``iterator`` (calling it first when it is a factory)."""
        iterable = iterator() if callable(iterator) else iterator
        self.iterator = iterable.__iter__()

class Vacancies_Iterator:
    """Lazy, optionally cached views over a tab-separated vacancies file.

    Indexing with a string returns an iterator over a derived view
    (``'sentences'``, ``'texts'``, ``'is_russian'``) or over a raw column;
    indexing with a list of booleans returns a filtered ``Vacancies_Iterator``.

    Args:
        path: path of the tab-separated source file.
        headers: column names; read from the first row of the file when omitted.
        filters: optional list of booleans selecting rows.

    Fixes compared to the previous version:
        * words with an unhandled part of speech are skipped instead of
          re-appending the previously processed word;
        * ``_cache_write`` is a no-op when no writer is open (filtered views
          used to raise ``KeyError``);
        * ``sentences(keep_words=...)`` no longer writes filtered output into
          the shared sentences cache;
        * an empty boolean mask is accepted by ``__getitem__``;
        * the language flag is always a ``bool``;
        * ``read_headers`` closes the file even on error;
        * ``text_to_vec`` no longer rebuilds the vocabulary set per document;
        * the cache directory is derived from the cache path.
    """

    russian_language_threashold = 0.4

    def __init__(self, path, headers=None, filters=None):
        self.path = path
        self.headers = headers if (headers is not None) else self.read_headers()
        # attribute name -> [open file object, "something already written" flag]
        self._cache_files = {}
        self.filters = filters

    def __getitem__(self, key):
        """Return a view for a string key or a filtered copy for a boolean mask.

        Raises:
            KeyError: for unsupported key types.
        """
        if isinstance(key, str):
            derived_views = {
                'sentences': self.sentences,
                'is_russian': self.is_russian_language,
                'texts': self.texts,
            }
            if key in derived_views:
                return derived_views[key]()
            return self._items(key)

        # An empty list is a valid (empty) mask; previously it raised IndexError.
        if isinstance(key, list) and (not key or isinstance(key[0], bool)):
            if self.filters is not None:
                key = [x and self.filters[idx] for idx, x in enumerate(key)]
            return Vacancies_Iterator(path=self.path, headers=self.headers, filters=key)

        raise KeyError(key)

    def read_headers(self):
        """Return the first row of the source file as the list of column names."""
        with open(file=self.path, mode='r', encoding='utf-8') as file_reader:
            return next(csv.reader(file_reader, delimiter='\t'))

    def parse_line(self, line):
        """Map a parsed row (list of fields) to a ``{header: field}`` dict."""
        items = {}
        for idx, item in enumerate(line):
            items[self.headers[idx]] = item
        return items

    def _cache_path(self, attribute):
        """Return the cache file path for ``attribute`` of this source file."""
        return '{prefix}{path}.{attr}.txt'.format(
            prefix=Cache_Path_Prefix,
            path=self.path,
            attr=attribute
        )

    def _cache_make_writer(self, attribute):
        """Open (truncating) the cache file for ``attribute`` and register its writer."""
        cache_path = self._cache_path(attribute)
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        self._cache_files[attribute] = [open(cache_path, 'w', encoding='utf-8'), False]

    def _cache_write(self, attribute, line):
        """Append ``line`` to the cache of ``attribute``; no-op when no writer is open."""
        entry = self._cache_files.get(attribute)
        if entry is None:
            return
        if entry[1]:
            # Lines are separated by a leading newline so the file never ends with one.
            line = '\n' + line
        else:
            entry[1] = True
        entry[0].write(line)

    def _cache_close_writer(self, attribute):
        """Close and unregister the cache writer of ``attribute``."""
        self._cache_files[attribute][0].close()
        del self._cache_files[attribute]

    def _cache_make_writer_and_get_closer(self, attribute):
        """Open a cache writer if allowed and return ``(closer, attribute)``, else ``None``."""
        if self._cache_can_write():
            self._cache_make_writer(attribute)
            return self._cache_close_writer, attribute
        return None

    def _cache_check(self, attribute):
        """Return whether caching is enabled and the cache file for ``attribute`` exists."""
        return Use_Data_Cache and os.path.isfile(self._cache_path(attribute))

    def _cache_can_write(self):
        """Caches are only written for unfiltered views with caching enabled."""
        return Use_Data_Cache and (self.filters is None)

    def vacancies(self):
        """Return an iterator of ``{header: value}`` dicts, one per selected row."""
        return CSV_Iterator(self.path, self._iter_vacancies, True, filters=self.filters)

    def _iter_vacancies(self, line):
        return self.parse_line(line)

    def _items(self, key):
        """Return an iterator over the raw values of column ``key``."""
        return Cross_Iterator(self.vacancies, (self._iter_items, key))

    def _iter_items(self, vacancy, key):
        return vacancy[key]

    def sentences(self, update_cache=False, keep_words=None):
        """Return an iterator of per-vacancy lists of normalized-word sentences.

        Args:
            update_cache: recompute even when a cache file exists.
            keep_words: optional collection of words to keep; others are dropped.

        Raises:
            TypeError: if ``update_cache`` is not a bool.
        """
        attribute = 'sentences'
        if not isinstance(update_cache, bool):
            raise TypeError('update_cache must be a bool')
        if (keep_words is not None) and (not isinstance(keep_words, set)):
            keep_words = set(keep_words)

        if (not update_cache) and self._cache_check(attribute):
            return CSV_Iterator(
                self._cache_path(attribute),
                (self._iter_sentences_from_cache, keep_words),
                filters=self.filters
            )

        # Only unfiltered output may populate the cache: later readers expect
        # the full sentences and apply their own keep_words filter.
        closer = None
        if keep_words is None:
            closer = self._cache_make_writer_and_get_closer(attribute)
        return Cross_Iterator(
            self.vacancies,
            (self._iter_sentences_from_vacancies, keep_words),
            callback=closer
        )

    def _iter_sentences_from_cache(self, line, keep_words):
        """Decode one cache row: sentences separated by '|', words by spaces."""
        if len(line) > 0:
            return [
                sentence.split(' ') if keep_words is None
                else [w for w in sentence.split(' ') if w in keep_words]
                for sentence in line[0].split('|')
            ]
        return []

    def _iter_sentences_from_vacancies(self, vacancy, keep_words):
        """Compute the sentences of one vacancy and mirror them to the open cache."""
        content = self._sentences_from_vacancy(vacancy['name'], vacancy['description'], keep_words)
        if Use_Data_Cache:
            self._cache_write(
                'sentences',
                '|'.join([' '.join(s) for s in content])
            )

        return content

    def _sentences_from_vacancy(self, name, description, keep_words):
        """Return the name sentence followed by the non-empty sentences of the HTML description."""
        sentences = [self._sentence_to_normalized_words(name, keep_words)]
        for node in BeautifulSoup(description, 'lxml').find_all(name=html_containers):
            text = node.get_text().strip()
            if not text:
                continue
            words = self._sentence_to_normalized_words(text, keep_words)
            if words:
                sentences.append(words)
        return sentences

    def _sentence_to_normalized_words(self, sentence, keep_words):
        """Return the lemmas of ``sentence``, tagged with a POS suffix where configured.

        Words tagged with a part of speech listed in ``pos_conv`` get the
        matching suffix, words without a POS tag are kept as their normal
        form, and all other words are dropped.
        """
        words = [a[0] for a in re.findall(r"([A-Za-zА-ЯЁа-яё]+(-[A-Za-zА-ЯЁа-яё]+)*)", sentence)]
        reswords = []

        for w in words:
            wordform = morph.parse(w)[0]
            pos = wordform.tag.POS
            if pos in pos_conv:
                word = wordform.normal_form + pos_conv[pos]
            elif pos is None:
                word = wordform.normal_form
            else:
                # Unhandled part of speech: skip it. Previously the word from the
                # preceding iteration was appended a second time here.
                continue
            if (keep_words is None) or (word in keep_words):
                reswords.append(word)

        return reswords

    def texts(self):
        """Return an iterator of vacancies rendered as one space-joined string each."""
        return Cross_Iterator(
            self.sentences,
            self._iter_texts
        )

    def _iter_texts(self, sentences):
        return ' '.join([' '.join(sentence) for sentence in sentences])

    def is_russian_language(self, update_cache=False):
        """Return an iterator of booleans: is the vacancy text mostly Cyrillic?

        Raises:
            TypeError: if ``update_cache`` is not a bool.
        """
        attribute = 'is_russian'
        if not isinstance(update_cache, bool):
            raise TypeError('update_cache must be a bool')
        if (not update_cache) and self._cache_check(attribute):
            return CSV_Iterator(
                self._cache_path(attribute),
                self._iter_language_from_cache,
                filters=self.filters
            )

        return Cross_Iterator(
            self.texts,
            self._iter_is_russian_language_from_sentences,
            callback=self._cache_make_writer_and_get_closer(attribute)
        )

    def _iter_language_from_cache(self, line):
        return int(line[0]) == 1

    def _iter_is_russian_language_from_sentences(self, text):
        """Return whether the share of lowercase Cyrillic letters exceeds the threshold."""
        letters = text.replace('_NOUN', '').replace('_ADJF', '').replace('_VERB', '').replace(' ', '').replace('-', '')
        if letters:
            cyrillic = sum(1 for ch in letters if ('а' <= ch <= 'я') or (ch == 'ё'))
            is_rus = (cyrillic / len(letters)) > self.russian_language_threashold
        else:
            # Always a bool, so the result can be used as a mask in __getitem__.
            is_rus = False

        if Use_Data_Cache:
            self._cache_write(
                'is_russian',
                '1' if is_rus else '0'
            )

        return is_rus

    def split_by_language(self):
        """Return ``(russian, other)`` filtered views of this dataset."""
        is_rus = list(self.is_russian_language())
        return self[is_rus], self[[not x for x in is_rus]]

    def semantics(self, w2v_model=None, update_cache=False):
        """Return an iterator of ``[id, vector]`` pairs, one per vacancy.

        Args:
            w2v_model: word-vector model used when the vectors must be computed.
            update_cache: recompute even when a cache file exists.

        Raises:
            TypeError: if ``update_cache`` is not a bool.
        """
        attribute = 'semantics'
        if not isinstance(update_cache, bool):
            raise TypeError('update_cache must be a bool')
        if (not update_cache) and self._cache_check(attribute):
            return CSV_Iterator(
                self._cache_path(attribute),
                self._iter_semantics_from_cache,
                filters=self.filters
            )

        return Cross_Iterator(
            [self['id'], self.sentences],
            (self._iter_semantics_from_texts, w2v_model),
            callback=self._cache_make_writer_and_get_closer(attribute)
        )

    def _iter_semantics_from_cache(self, line):
        """Decode one cache row: the id followed by the vector components."""
        return [int(line[0]), [float(x) for x in line[1:]]]

    def _iter_semantics_from_texts(self, idv, sentences, w2v_model):
        """Compute the vector of one vacancy and mirror it to the open cache."""
        semantics = self.text_to_vec(sentences, w2v_model)

        if Use_Data_Cache:
            self._cache_write(
                'semantics',
                '\t'.join([idv, '\t'.join([str(s) for s in semantics])])
            )

        return [int(idv), semantics]

    def text_to_vec(self, sentences, model):
        """Return the count-weighted mean vector of the known words in ``sentences``.

        Words missing from the model are ignored; a zero vector is returned
        when no word is known.
        """
        text_vec = np.zeros((model.vector_size,), dtype="float32")
        n_words = 0
        vectors = model.wv
        # Membership is tested on the vectors object directly instead of
        # rebuilding a set of the whole vocabulary for every document.
        for word, count in Counter(Unpack_Iterators(sentences)).items():
            if word in vectors:
                n_words = n_words + count
                text_vec = np.add(text_vec, vectors[word] * count)

        if n_words != 0:
            text_vec /= n_words
        return text_vec

In [6]:
# использовал для предварительного кэширования с целью последующего ускорения, 
# для целей восстановления эксперимента - не обязательно

def cache_update(files, attributes):
    """Pre-build the cache files of ``attributes`` for every file in ``files``.

    Both arguments may be a single value or a list. Nothing happens when
    caching is disabled; attributes whose cache already exists are skipped.
    """
    if not Use_Data_Cache:
        return

    paths = files if isinstance(files, list) else [files]
    wanted = attributes if isinstance(attributes, list) else [attributes]

    for path in paths:
        vacancies = Vacancies_Iterator(path)
        for attribute in wanted:
            if vacancies._cache_check(attribute):
                continue
            # Exhausting the view is what writes its cache file.
            for _ in vacancies[attribute]:
                pass

cache_update(['test.csv', 'train.csv', 'other.csv'], ['sentences', 'is_russian'])

In [11]:
# One lazy dataset view per input file.
test, train, other = (
    Vacancies_Iterator(path) for path in ('test.csv', 'train.csv', 'other.csv')
)

In [7]:
# tf-idf фильтрация слов в текстах

In [7]:
# Split every dataset into a (mostly Russian, other) pair of filtered views.
(test_rus, test_eng), (train_rus, train_eng), (other_rus, other_eng) = (
    dataset.split_by_language() for dataset in (test, train, other)
)

In [33]:
def get_count_vectorizer(texts, min_df):
    """Return ``(count_matrix, terms)`` for ``texts``.

    ``terms`` is the fitted vocabulary ordered by column index, so
    ``terms[i]`` names column ``i`` of the matrix. Case is preserved.
    """
    vectorizer = CountVectorizer(lowercase=False, min_df=min_df)
    counts = vectorizer.fit_transform(texts)
    vocabulary = vectorizer.vocabulary_
    terms = sorted(vocabulary, key=vocabulary.get)
    return counts, terms

def get_idf(texts, min_df):
    """Return ``(idf, words)``: the smoothed IDF weight of every vocabulary word.

    Args:
        texts: iterable of documents (strings).
        min_df: minimum document frequency passed to the count vectorizer.

    Returns:
        The fitted ``idf_`` array and the list of words in matching order.
    """
    vectors, words = get_count_vectorizer(texts, min_df)
    tfidf_transformer = TfidfTransformer(norm=None, smooth_idf=True)
    # Only the fitted idf_ is needed, so fit() replaces the previous
    # fit_transform() whose transformed matrix was thrown away.
    tfidf_transformer.fit(vectors)
    return tfidf_transformer.idf_, words

In [34]:
# IDF weights over all three datasets, computed separately per language.
# min_df=1 keeps every term (each vocabulary term occurs in at least one
# document), exactly like the previous integer 0, which is outside the
# documented range for an integer min_df and rejected by recent scikit-learn.
idf_rus, words_rus = get_idf(Unpack_Iterators(
    [test_rus.texts, train_rus.texts, other_rus.texts]
), 1)

idf_eng, words_eng = get_idf(Unpack_Iterators(
    [test_eng.texts, train_eng.texts, other_eng.texts]
), 1)

In [35]:
# Vocabulary sizes of the Russian and the non-Russian corpora.
for vocabulary in (words_rus, words_eng):
    print(len(vocabulary))
# NOTE(review): this prints the 0.2th percentile, whereas the filtering
# threshold is taken at the 20th percentile - confirm which one is meant.
print(np.percentile(idf_rus, 0.2))

205066
40017
4.360214616574416


In [36]:
# Drop the 20% of words with the lowest IDF (the most common ones) per language.
idf_rus_threshold = np.percentile(idf_rus, 20)
words_rus_filtered = [w for w, idf in zip(words_rus, idf_rus) if idf >= idf_rus_threshold]

idf_eng_threshold = np.percentile(idf_eng, 20)
# Fixed: the English words were previously compared against idf_rus values.
words_eng_filtered = [w for w, idf in zip(words_eng, idf_eng) if idf >= idf_eng_threshold]

words_filtered = words_rus_filtered + words_eng_filtered

In [11]:
def get_w2v_model(force_update=False):
    """Load the cached word2vec vectors, or train and cache a new model.

    Args:
        force_update: retrain even when cached vectors exist.

    Returns:
        ``KeyedVectors`` when loaded from the cache, otherwise the freshly
        trained ``Word2Vec`` model.
    """
    w2v_vec_cache = Cache_Path_Prefix + 'w2v_model.vec'
    w2v_vcb_cache = Cache_Path_Prefix + 'w2v_model.vcb'

    if os.path.isfile(w2v_vec_cache) and (not force_update):
        return KeyedVectors.load_word2vec_format(w2v_vec_cache, binary=True)

    # Sentences of all three datasets, restricted to the IDF-filtered vocabulary.
    sentences = Unpack_Iterators(Unpack_Iterators(
        [
            test.sentences(keep_words=words_filtered),
            train.sentences(keep_words=words_filtered),
            other.sentences(keep_words=words_filtered)
        ]
    ))

    w2v_model = Word2Vec(sentences, size=600, min_count=5, workers=8)

    # Make sure the target directory exists before saving; it may be missing
    # when no other cache file has been written yet.
    cache_dir = os.path.dirname(w2v_vec_cache)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    w2v_model.wv.save_word2vec_format(w2v_vec_cache, w2v_vcb_cache, binary=True)
    return w2v_model

In [12]:
# Load the word2vec model (from the cache file if present, otherwise train it).
model = get_w2v_model()

In [13]:
# Sanity-check the model: nearest neighbours of a known word.
model.wv.most_similar('реклама_NOUN')

  


[('рекламный_ADJF', 0.5489732027053833),
 ('бигборд_NOUN', 0.5254246592521667),
 ('маркетинг_NOUN', 0.514053225517273),
 ('pos_NOUN', 0.49582794308662415),
 ('фраер_NOUN', 0.47938379645347595),
 ('smm', 0.4720318913459778),
 ('googleadwords', 0.46984875202178955),
 ('прес_NOUN', 0.46643054485321045),
 ('таргентинговый_ADJF', 0.4655117988586426),
 ('yandexdirect', 0.464684396982193)]

In [154]:
# Recompute the vacancy vectors with the current model (update_cache=True);
# exhausting the iterator is what produces the cache.
for dataset in (train, test):
    for _ in dataset.semantics(model, True):
        pass



KeyboardInterrupt: 

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def predict():
    """Fit a logistic regression on the train vectors and score the test set.

    Returns:
        A list of ``(vacancy_id, class_probabilities)`` pairs for the test set.
    """
    X_train = [row[1] for row in train.semantics()]
    y_train = [int(label) for label in train['target']]

    lr = LogisticRegression()
    lr.fit(X_train, y_train)

    # Read the test vectors once, collecting ids and features together
    # (previously the source was read twice, once for each list).
    X_id = []
    X_test = []
    for vacancy_id, vector in test.semantics():
        X_id.append(vacancy_id)
        X_test.append(vector)

    y_hat = lr.predict_proba(X_test)

    return list(zip(X_id, y_hat))

In [13]:
# (id, class probabilities) pairs for the test set.
predict_probe = predict()

In [38]:
def write_predicts():
    """Write ``predict_probe`` to ``test_predict_probe.csv`` as ``id,target`` rows.

    ``target`` is the second entry of each probability row. The file is
    written without a trailing newline and is closed even if writing fails.
    """
    with open('test_predict_probe.csv', 'w', encoding='utf-8') as writer:
        writer.write('id,target')
        for idp, p in predict_probe:
            writer.write('\n{},{}'.format(idp, p[1]))
    

In [39]:
# Write the submission file.
write_predicts()