In [1]:
from scipy.stats import mode

%pylab inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn

import re
import os, sys
import json
import pickle

from urllib.parse import urlparse
from urllib.request import urlretrieve, unquote

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
import glob
from nltk.stem import SnowballStemmer
import pymorphy2
import nltk 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import re

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    
    stemmer = SnowballStemmer("russian")
    morph = pymorphy2.MorphAnalyzer()

    en_stop_words = set(stopwords.words('english')) 
    ru_stop_words = set(stopwords.words('russian')) 
    
    word_tokens = word_tokenize(cleantext.lower()) 
    filtered_sentence = [w for w in word_tokens if not w in en_stop_words]
    filtered_sentence = [w for w in filtered_sentence if not w in ru_stop_words]
    
    filtered_sentence = [morph.parse(word)[0].normal_form for word in filtered_sentence]
    stemm_txt = " ".join(stemmer.stem(word) for word in filtered_sentence)
    
    return "".join(word for word in stemm_txt)

Populating the interactive namespace from numpy and matplotlib


# Train

In [2]:
file_path = '/data/share/lab05data/base_*.txt'
data = pd.DataFrame()
ids = []

for filename in glob.iglob(file_path, recursive=True):
    #print(filename)
    
    with open(filename, 'r') as the_file:
        text__ = cleanhtml(the_file.read())
        ids.append(int(filename.replace('/data/share/lab05data/base_', '').replace('.txt', '')))

    data = pd.concat([data, pd.DataFrame([text__])]).reset_index(drop=True)
data.columns = ['text']
data['id_'] = ids
data['target'] = 'train'

# Test

In [3]:
%%time
numbers =  [2561, 2049, 4, 2565, 2562, 2062, 3600, 3603, 3588, 534, 3607, 24, 25, 3612, 3102, 2591, 3804, 3627, 1580, 46, 1074, 
 2100, 3125, 56, 1593, 2485, 2621, 63, 3139, 2316, 610, 2121, 1098, 1099, 588, 2126, 2639, 88, 2140, 607, 1634, 2067, 
 612, 3175, 3178, 531, 1647, 1136, 1137, 1139, 1146, 1151, 22, 1710, 2692, 1160, 1674, 1560, 3729, 623, 3224, 153, 2206, 
 1183, 160, 3235, 3748, 684, 1651, 1711, 30, 1714, 2079, 695, 2232, 698, 189, 1227, 1025, 711, 3784, 2253, 2763, 3788, 
 1741, 2255, 1232, 3283, 383, 2775, 2265, 1829, 1244, 3300, 1767, 1257, 1320, 1747, 3830, 1152, 1279, 3030, 913, 3332, 
 1798, 2311, 386, 1289, 3340, 269, 1806, 274, 1425, 789, 2863, 1305, 2332, 800, 1825, 2339, 292, 293, 2855, 3368, 562, 
 2861, 3886, 2351, 2864, 307, 2356, 822, 1019, 3722, 2101, 3387, 2688, 829, 3904, 833, 2279, 3398, 2887, 336, 337, 3473, 
 854, 3420, 2398, 3426, 3939, 3940, 357, 2918, 1387, 2924, 879, 3442, 1396, 2421, 2427, 1407, 1345, 2438, 1928, 1419, 
 2456, 1054, 2030, 1937, 402, 2628, 2968, 409, 1437, 1956, 421, 1963, 943, 2481, 3509, 955, 2635, 2496, 455, 972, 3025, 
 1188, 3542, 3544, 3550, 482, 3699, 2812, 2534, 3566, 1521, 3570, 1525, 2043]

file_paths = ['/data/share/lab05data/test_'+ str(number) +'.txt' for number in numbers]

test_data = pd.DataFrame()
ids = []
    
for filename in file_paths:
    
    with open(filename, 'r') as the_file:
        text__ = cleanhtml(the_file.read())
        ids.append(int(filename.replace('/data/share/lab05data/test_', '').replace('.txt', '')))

    test_data = pd.concat([test_data, pd.DataFrame([text__])]).reset_index(drop=True)
test_data.columns = ['text']
test_data['id_'] = ids
test_data['target'] = 'test'

CPU times: user 22.9 s, sys: 1.82 s, total: 24.7 s
Wall time: 1min 5s


In [119]:
all_data = pd.concat([data, test_data])

text = " ".join([ch for ch in all_data.text.values])
words = nltk.tokenize.word_tokenize(text)
fdist = FreqDist(words)
stop_words = [words[0] for words in fdist.most_common(9) if words not in ['javascript', 'sql', 'jav', '+', '#', 'git']] +\
['что', 'это', 'кто','то', 'тот', 'мы', 'хотеть', 'хотя', 'вряд', 'ный', 'также', 'такой', 'помимо', 'лишь', 'среди', 
 'вакансия', 'ваш', 'companies', 'company', 'candidate', 'career', 'etc', 'компан', 'итд', 
 'работ']
stop_words

# buzzwords он же тут stop_words всеже лучше вычислить, а не в ручную прописывать/корректировать
# в боевой модели, никто не будет  каждый раз при обучении прописывать новые исключения.

[',',
 '.',
 ';',
 ':',
 'работ',
 'оп',
 ')',
 '(',
 'знан',
 'что',
 'это',
 'кто',
 'то',
 'тот',
 'мы',
 'хотеть',
 'хотя',
 'вряд',
 'ный',
 'также',
 'такой',
 'помимо',
 'лишь',
 'среди',
 'вакансия',
 'ваш',
 'companies',
 'company',
 'candidate',
 'career',
 'etc',
 'компан',
 'итд',
 'работ']

In [120]:
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(all_data['text'])
print(vectorizer.get_feature_names())

# все комментарии что делаем кончились. Через неделю автор и не вспомнит, что он тут делал...

['00', '000', '000т', '02', '06', '09', '10', '100', '10000', '10к', '10тба', '11', '115', '12', '13', '130', '14', '1420', '15', '150', '15000', '153', '16', '17', '18', '1800', '19', '1900', '1989', '1992', '1994', '19ч', '1c', '1с', '20', '2000', '2003', '2005', '2008', '2008r2', '2010', '2011', '2012', '2013', '2014', '2015', '2015г', '21', '22', '23', '25', '25000', '26', '264', '265', '28', '29', '2d', '2гис', '2е', '2миллион', '2стабильн', '2х', '2х2', '30', '300', '30000', '32', '320', '32000', '3222487201', '35', '38', '38442', '39', '3d', '3д', '3х', '40', '40000', '41', '42', '45', '46', '50', '500', '5000', '50000', '500гб', '505', '55', '550', '568', '59', '60', '60000', '64', '70', '70000', '711', '72гб', '80', '80000', '8000грн', '84', '86', '900', '99', 'aac', 'abilit', 'abilities', 'about', 'abroad', 'access', 'actel', 'actionscript3', 'activ', 'activities', 'add', 'ado', 'adob', 'advantag', 'adventur', 'ag', 'agil', 'agile', 'air', 'ajax', 'akk', 'alter', 'altium', 'a

In [130]:
cosine_matrix = cosine_similarity(X[20 :], X[ : 20], dense_output=True)

array([[0.11399299, 0.0132791 , 0.07749268, ..., 0.11944907, 0.06719652,
        0.01532518],
       [0.07413765, 0.        , 0.10737387, ..., 0.07257144, 0.08995961,
        0.00663333],
       [0.10060706, 0.        , 0.12661206, ..., 0.09969255, 0.15882033,
        0.        ],
       ...,
       [0.04050248, 0.        , 0.00216642, ..., 0.02851245, 0.00360767,
        0.        ],
       [0.05905044, 0.        , 0.02080912, ..., 0.03634972, 0.00902765,
        0.        ],
       [0.08307553, 0.0262753 , 0.08536526, ..., 0.08082966, 0.08312448,
        0.04732588]])

In [156]:
test_doc = cosine_matrix.sum(axis=1)
test_doc.mean()

1.0623238781519262

In [162]:
cosine_matrix = pd.DataFrame(cosine_matrix)
cosine_matrix.columns = data.id_.values
cosine_matrix.index = test_data.id_.values

defined = cosine_matrix.index[np.where(test_doc > test_doc.mean())[0]]
undefined = test_data[~test_data.id_.isin(defined)].id_.values.astype(int)
undefined

array([2562, 2062,   25, 1580,   46, 2100, 3125,   56, 1593, 2485,   63,
        610, 2121,  588, 2126, 1634, 2067,  612, 3175, 3178, 1146, 1151,
       1710, 1560, 3729,  623, 2206, 1183,  160, 3235, 1651, 1711, 1714,
       2079,  695, 2232,  698,  189, 1227, 1025,  711, 2253, 3788, 2255,
       1232,  383, 2775, 2265, 1829, 3300, 1257, 1320,  913, 1798,  386,
       1289,  269, 1806,  274, 1425, 1305, 2339,  293,  562,  307, 2356,
       1019, 3722, 2101, 2688, 2279, 3398,  337, 3473, 3420, 2398, 3939,
       1396, 1407, 1345, 2438, 1419, 2456, 1054, 2968,  409, 1437, 1956,
       2481, 3509, 2635, 2496, 3025, 3550,  482, 3699, 2812, 1521, 3570,
       1525])

In [163]:
defined

Int64Index([2561, 2049,    4, 2565, 3600, 3603, 3588,  534, 3607,   24, 3612,
            3102, 2591, 3804, 3627, 1074, 2621, 3139, 2316, 1098, 1099, 2639,
              88, 2140,  607,  531, 1647, 1136, 1137, 1139,   22, 2692, 1160,
            1674, 3224,  153, 3748,  684,   30, 3784, 2763, 1741, 3283, 1244,
            1767, 1747, 3830, 1152, 1279, 3030, 3332, 2311, 3340,  789, 2863,
            2332,  800, 1825,  292, 2855, 3368, 2861, 3886, 2351, 2864,  822,
            3387,  829, 3904,  833, 2887,  336,  854, 3426, 3940,  357, 2918,
            1387, 2924,  879, 3442, 2421, 2427, 1928, 2030, 1937,  402, 2628,
             421, 1963,  943,  955,  455,  972, 1188, 3542, 3544, 2534, 3566,
            2043],
           dtype='int64')

In [None]:
# а где *.json пишем?