In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt
import pandas as pd


In [2]:
from bs4 import BeautifulSoup

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [4]:
from scipy.stats import randint as sp_randint
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

In [5]:
# import logging

# set up logging to file - see previous section for more details
# logging.basicConfig(level=logging.DEBUG,
#                     format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
#                     datefmt='%m-%d %H:%M',
#                     filename='./myapp.log',
#                     filemode='w')
# define a Handler which writes INFO messages or higher to the sys.stderr
# console = logging.StreamHandler()
# console.setLevel(logging.INFO)
# set a format which is simpler for console use
# formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
# tell the handler to use this format
# console.setFormatter(formatter)
# add the handler to the root logger
# logging.getLogger('').addHandler(console)

In [6]:
# отключаем warnings
import warnings

warnings.filterwarnings('ignore')

In [17]:
import re

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
    

In [7]:
def prepare_data(dataframe, drop_columns=None):
#     logging.info("prepare_data of dataframe")
    df = dataframe.copy()
    # clearhtml
    df['description'] = df['description'].apply(lambda x : cleanhtml(x) )
    # replace &quot to "
    df['description'] = df['description'].str.replace('&quot;','"')
    df['description'] = df['description'].str.replace('\u200b','')
    # uni name and description
    df['text'] = df['name'] + ' ' + df['description'] 
    # drop
    df = df.drop(drop_columns, axis=1)
    
    return df
    

In [8]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [36]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(w2v.values())))

    def fit(self, X):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


In [9]:
class tfidf_vectorizer(object):
    def __init__(self, word2vec, stopwords=None):
        self.stopwords = stopwords
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(w2v.values())))

    def fit(self, X):
#         logging.info("tfidf_vectorizer fit")
#         tfidf = TfidfVectorizer(analyzer=lambda x: x, stop_words=self.stopwords, tokenizer=f_tokenizer)
#         tfidf.fit(X)
#         max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
#         logging.info("tfidf_vectorizer fit end")
        return self

    def transform(self, X):
#         logging.info("tfidf_vectorizer transform")
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [10]:
from sklearn.metrics import roc_auc_score

def get_auc_lr_valid(X, y, tests,  seed=123, ratio=0.8):
    # разделим выборку на обучающую и валидационную
    idx = round(X.shape[0] * ratio)
    print(idx)
    # обучение классификатора
    x_train = X[:idx,:]
    x_valid = X[idx:,:]
    y_train = y[:idx,:]
    y_valid = y[idx:,:]
    print(x_train.shape[0])
    print(y_train.shape[0])
#     logging.info("start RandomForestClassifier")
    clf = RandomForestClassifier(n_estimators=20, random_state=seed)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_valid)
    score = roc_auc_score(y_valid, y_pred)
#     logging.info("score {}".format(score))
    predictions = clf.predict(tests)
    
    return score, predictions

In [11]:
import pickle

def save_obj(obj, file_name ):    
    with open(file_name,'wb' ) as f:
        pickle.dump(obj, f)

def load_obj(file_name):
    with open(file_name,'rb' ) as f:
        obj = pickle.load(f)
    return obj

In [12]:
# подключим word2vec
# logging.info(">>> word2vec point")
from gensim.models import word2vec

In [13]:
from multiprocessing import Pool
from functools import partial
from tqdm import tqdm
import pymorphy2 # Морфологический анализатор.
morph = pymorphy2.MorphAnalyzer()

In [14]:
df_train_raw = pd.read_csv('data/train.csv', sep='\t')

In [18]:
%%time 
df_train = prepare_data(df_train_raw[:], drop_columns=["name", "description", "id"])

CPU times: user 10.6 s, sys: 1.04 s, total: 11.7 s
Wall time: 11.7 s


In [69]:
df_test = pd.read_csv('data/test.csv', sep='\t')
df_tmp_test = prepare_data(df_test[:], drop_columns=["name", "description", "id"])

In [70]:
df_id = df_test[:].drop(['name', 'description'], axis=1)

In [19]:
model = word2vec.Word2Vec.load('vec_model_without_morph')

In [52]:
from operator import is_not
from functools import partial

words_train = load_obj("./data/words_train")
words_train = list(map(lambda x : x if x is not None else ["NONE"] , words_train))
words_tests = load_obj("./data/words_tests")
words_tests = list(map(lambda x : x if x is not None else ["NONE"] , words_tests))
words_other = load_obj("./data/words_other")
words_other = list(map(lambda x : x if x is not None else ["NONE"] , words_other))

In [53]:
len(words_train)

200000

In [54]:
# logging.info("create w2v")
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [55]:
label = 'target'
RND_SEED = 123
idx_features = df_train.columns != label

In [56]:
X = words_train
y = df_train.loc[:, ~idx_features].values
tests = words_tests

In [57]:
data_mean = MeanEmbeddingVectorizer(w2v).fit(X).transform(X)

In [62]:
data_mean.shape

(200000, 300)

In [59]:
tests_mean = MeanEmbeddingVectorizer(w2v).fit(tests).transform(tests)

In [63]:
tests_mean.shape

(170179, 300)

In [43]:
y.shape

(200000, 1)

In [64]:
score, predictions = get_auc_lr_valid(data_mean, y, tests_mean, seed=123)

160000
160000
160000


In [65]:
predictions

array([1, 1, 1, ..., 0, 0, 0])

In [66]:
score

0.95896448353214125

In [71]:
df_id['target'] = pd.DataFrame(predictions.tolist())
df_id.shape

(170179, 2)

In [72]:
df_id.head()

Unnamed: 0,id,target
0,200000,1
1,200001,1
2,200002,1
3,200003,1
4,200004,0


In [73]:
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d_%H-%M')
file_name = './submission/submission_{today}.cvs'.format(today=today)
df_id.to_csv(path_or_buf=file_name, index=False, sep=',')

In [74]:
competition='vacancy-classification-sf01'
submission=file_name
message='word2vec with new vocabulary and without morph'

!kaggle competitions submit -c $competition -f $submission -m "$message"

/bin/sh: 1: kaggle: not found


In [75]:
file_name

'./submission/submission_2018-03-16_00-13.cvs'