# workshop.g-sh.tech

In [1]:
import os

os.environ['MKL_NUM_THREADS']='1'
os.environ['NUMEXPR_NUM_THREADS']='1'
os.environ['OMP_NUM_THREADS']='1'

import pandas as pd
import numpy as np

from itertools import chain

from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm_pandas

from string import punctuation, whitespace, digits

import json

from matplotlib import pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode()

%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, AgglomerativeClustering

from gensim.matutils import sparse2full

from sklearn.metrics import jaccard_similarity_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, make_scorer, precision_score, roc_auc_score

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore, FastText, TfidfModel

In [396]:
raw_posts = pd.read_json('./_shared/stackexchange_posts.jsonl', lines=True)

In [None]:
import spacy
import re
import string

posts = pd.read_json('./_shared/stackexchange_posts.jsonl', lines=True)

cleanr = re.compile('<.*?>')
# cleanr = re.compile('<[^>]+>')

def strip_html(doc):
    cleantext = re.sub(cleanr, '', doc)
    #nopunct = "".join(l for l in cleantext if l not in string.punctuation)
    return cleantext #re.sub('\n', ' ', nopunct)

nlp = spacy.load('en')
nlp.remove_pipe('parser')
# nlp.remove_pipe('tagger')
nlp.remove_pipe('ner')


body_processed = []
for doc in tqdm(nlp.pipe(posts.Body.apply(strip_html), batch_size=500, )):
    body_processed.append([t.lemma_ for t in doc])
    
posts['body_processed'] = body_processed

In [None]:
answers_processed = []

for answers in tqdm(posts.Answers):
    if answers is None:
        answers_processed.append(None)
        continue
        
    prepared_bodies = map(lambda x: strip_html(x['Body']), answers)
    bodies_processed = []
    for doc in nlp.pipe(prepared_bodies, batch_size=50):
        bodies_processed.append([t.lemma_ for t in doc])
    answers_processed.append(bodies_processed)

In [None]:
posts['answers_processed'] = posts.Answers.copy()

In [None]:
for_update = []

for i in tqdm(range(len(posts.answers_processed))):
    if answers_processed[i] is None:
        for_update.append([])
        continue
    
    res = posts.answers_processed[i].copy()
    for j, processed_anser_body in enumerate(answers_processed[i]):
        res[j]['Body'] = processed_anser_body
    for_update.append(res)
        

In [None]:
posts.answers_processed = for_update

In [None]:
posts.to_json('posts.processed.jsonl', orient='records', lines=True, force_ascii=False)

## Загружаем обработанное и работаем

In [4]:
posts = pd.read_json('posts.processed.jsonl', lines=True)

In [5]:
hub2id = {name:i for i, name in enumerate(posts.Hub.unique())}

In [398]:
for n, (answers, hub) in posts[['answers_processed', 'Hub']].iterrows():
    if answers is None:
        continue
    for answer in answers:
        answer['Hub'] = hub
        answer['aId'] = answer['Id']*10 + hub2id[answer['Hub']]

In [7]:
posts['aId'] = posts.Id*10 + posts.Hub.map(hub2id.get)

In [8]:
posts.set_index('aId', inplace=True)

In [10]:
answers_df = pd.DataFrame(list(chain.from_iterable(filter(None, posts.answers_processed))))

In [11]:
answers_df.set_index('aId', inplace=True)

In [49]:
from collections import defaultdict

In [50]:
just_texts = []

id2path = {}
current_pointer = 0

answers_texts = []
questions_texts = []

true_answers = defaultdict(list)

for n, (question, answers) in tqdm(posts[['body_processed', 'answers_processed']].iterrows()):      
    just_texts.append(question)
    questions_texts.append(question)
    id2path[current_pointer] = ('post', n)
    current_pointer += 1
    just_texts.extend(_['Body'] for _ in answers)
    answers_texts.extend(_['Body'] for _ in answers)
    true_answers[len(questions_texts) - 1].extend(range(len(answers_texts) - len(answers), len(answers_texts)))
    for answer in answers:
        id2path[current_pointer] = ('ans', answer['aId'])
        current_pointer += 1
        




In [252]:
gdict = Dictionary(documents=just_texts)

2018-04-05 07:13:24,030 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-04-05 07:13:26,056 : INFO : adding document #10000 to Dictionary(49997 unique tokens: ['\n', '\n\n', '"', ',', '-PRON-']...)
2018-04-05 07:13:28,257 : INFO : adding document #20000 to Dictionary(99535 unique tokens: ['\n', '\n\n', '"', ',', '-PRON-']...)
2018-04-05 07:13:30,663 : INFO : adding document #30000 to Dictionary(157415 unique tokens: ['\n', '\n\n', '"', ',', '-PRON-']...)
2018-04-05 07:13:32,598 : INFO : built Dictionary(231092 unique tokens: ['\n', '\n\n', '"', ',', '-PRON-']...) from 39001 documents (total 7985131 corpus positions)


In [253]:
gdict.filter_extremes(no_below=1, no_above=0.3, keep_n=1000000)

2018-04-05 07:13:48,017 : INFO : discarding 39 tokens: [('\n', 37457), ('\n\n', 30674), (',', 35053), ('-PRON-', 37425), ('.', 37261), ('?', 17130), ('a', 32917), ('as', 18404), ('be', 36999), ('but', 17140)]...
2018-04-05 07:13:48,019 : INFO : keeping 231053 tokens which were in no less than 1 and no more than 11700 (=30.0%) documents
2018-04-05 07:13:48,348 : INFO : resulting dictionary: Dictionary(231053 unique tokens: ['"', 'backprop', 'backpropagation', 'basically', 'different']...)


In [263]:
def is_trash_token(args, trash_is=punctuation+whitespace):
    token = args
    return all(map(lambda ch: ch in trash_is, token))

def is_trash_token_fragile(args, trash_is=punctuation+whitespace+'()+-/*$'):
    token = args
    return any(map(lambda ch: ch in trash_is, token))

def clean_token_fragile(args, trash_is=punctuation+whitespace+'()+-/*$'):
    token = args
    return ''.join(filter(lambda ch: ch not in trash_is, token))

In [264]:
tokens_to_remove, token_ids_to_remove = zip(*filter(is_trash_token, gdict.token2id.items()))

In [266]:
print('Top 10(total: %s) tokens to remove:'%len(tokens_to_remove))

for t, tid in zip(tokens_to_remove[:10], token_ids_to_remove):
    print('\t Token: %20s; \t\t token id: %5s'%(json.dumps(t), tid))

Top 10(total: 1326) tokens to remove:
	 Token:                 "\""; 		 token id:     0
	 Token:             "\n\n\n"; 		 token id:    41
	 Token:                  "&"; 		 token id:    42
	 Token:                  "'"; 		 token id:    43
	 Token:                  ";"; 		 token id:    66
	 Token:           "\n\n\n  "; 		 token id:   204
	 Token:                 "--"; 		 token id:   253
	 Token:                  " "; 		 token id:   303
	 Token:                 ":)"; 		 token id:   304
	 Token:                "\n "; 		 token id:   408


In [267]:
gdict.filter_tokens(token_ids_to_remove)

In [268]:
gcorpus = [gdict.doc2bow(doc) for doc in just_texts]

In [424]:
tfidf = TfidfModel(gcorpus,id2word=gdict)

2018-04-05 16:38:39,916 : INFO : collecting document frequencies
2018-04-05 16:38:39,920 : INFO : PROGRESS: processing document #0
2018-04-05 16:38:40,145 : INFO : PROGRESS: processing document #10000
2018-04-05 16:38:40,349 : INFO : PROGRESS: processing document #20000
2018-04-05 16:38:40,575 : INFO : PROGRESS: processing document #30000
2018-04-05 16:38:40,775 : INFO : calculating IDF weights for 39001 documents and 229726 features (2583428 matrix non-zeros)


In [269]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

topics_number = 150

lda = LdaMulticore(gcorpus,num_topics=topics_number, id2word=gdict, workers=10, passes=25, chunksize=5000 )

In [None]:
for topic_id, representation in lda.show_topics(num_topics=topics_number, num_words=7):
    print("#{}: {}".format(topic_id, representation))

In [638]:
clear_texts = [[wrd for wrd in doc if not is_trash_token(wrd)] for doc in just_texts]

In [None]:
fsttxt = FastText(clear_texts, word_ngrams=0,
    sg=1,
    size=25, 
    workers=10, 
    negative=350, 
    min_count=1, 
    min_n=3, 
    max_n=3, 
    window=10, 
    hs=1, 
    iter=50, 
    batch_words=100000)

In [928]:
graph_vec = fsttxt.wv.get_vector('model')
text_vec = fsttxt.wv.get_vector('deep')
graph_gerald = fsttxt.wv.get_vector('error')

In [929]:
tot = 10
for i in range(tot+1):
    print(fsttxt.wv.similar_by_vector((i*graph_vec + (tot-i)*graph_gerald)/tot)[0])

('error', 0.9999999403953552)
('error', 0.9972934722900391)
('error', 0.98841392993927)
('error', 0.9723179340362549)
('error', 0.9482318162918091)
('get', 0.9408340454101562)
('test', 0.9425910711288452)
('model', 0.9666866660118103)
('model', 0.9857156872749329)
('model', 0.9965918660163879)
('model', 1.0)


In [933]:
def embed_doc(doc, model=fsttxt, tf_model=tfidf, gensim_dict=gdict, print_report=False, return_report=False):
    embs = []
    ws = 0
    
    report = ''
    
    for _ in doc:
        if _ in gdict.token2id:
            w = tf_model.idfs[gdict.token2id[_]]
        elif not is_trash_token(_):
            w = 2
        else:
            w=0.1
        try:
            report += ' %3.2f*%s'%(w, _)
            
            embs.append(w*model.wv.get_vector(_))
            ws += w
        except KeyError:
            pass
    if print_report:
        print(report)
    if return_report:
        return np.sum(embs, axis=0)/ws, report
    return np.sum(embs, axis=0)/ws

In [94]:
vecs = list(map(lambda x: list(sparse2full(x, topics_number)), tqdm(lda[gcorpus])))

vecs = np.array(vecs)




In [935]:
vecs = np.array(list(embed_doc(_) for _ in tqdm(just_texts)))




In [1504]:
vecs = np.array(list(vanilla_embed_doc(_) for _ in tqdm(just_texts)))

In [1505]:
to_draw_num = 5000

In [1506]:
clusterizer = DBSCAN(metric='cosine', p=1, n_jobs=1, eps=0.08, min_samples=5, algorithm='brute')

In [1507]:
clusterizer = AgglomerativeClustering(n_clusters=200, affinity='cosine', linkage='complete')

In [1508]:
clusters = clusterizer.fit_predict(vecs[:to_draw_num])

In [1509]:
proj = TSNE(2)

In [1510]:
vecs2draw = proj.fit_transform(vecs[:to_draw_num])

In [1]:
plt.figure(figsize=(20,15))

for cluster_id in set(clusters) - set([-1]):
    plt.scatter(vecs2draw[clusters == cluster_id, 0], vecs2draw[clusters == cluster_id, 1], s=10, alpha=0.5, label='%s cl'%cluster_id)

plt.scatter(vecs2draw[clusters == -1, 0], vecs2draw[clusters == -1, 1], s=5, c='lightblue', alpha=0.5, label='unlabeled')
# plt.legend()

NameError: name 'plt' is not defined

In [1384]:
def get_repr_of(id_to_repr):
    index_type, index_id = id2path[id_to_repr]
    if index_type == 'post':
        return '[%s] post[sc: %s]: %s'%(index_id, posts.loc[index_id].Score, posts.loc[index_id].Title)
    elif index_type == 'ans':
        words = answers_df.loc[index_id].Body
        return '[%s] ans[sc: %s]: %s'%(index_id, answers_df.loc[index_id].Score, ' '.join(words[:min(10, len(words))]))
    else:
        raise Exception()

In [1385]:
colors = [ 'red' if id2path[_][0] == 'post' else 'green' for _ in range(vecs2draw.shape[0])]

In [1386]:
topic_labels = [get_repr_of(_) for _ in tqdm(range(vecs2draw.shape[0]))]

In [None]:
#### colors = ['hsl({}, 70%, 50%)'.format((43 * hash(row)) % 360) for row in topic_labels]

trace = go.Scatter(
    x = vecs2draw[:, 0],
    y = vecs2draw[:, 1],
    text = topic_labels,
    mode = 'markers',
    marker=dict(size=4, color=colors, opacity=0.3),
)

py.iplot([trace])


In [107]:
from sklearn.neighbors import NearestNeighbors

In [1615]:
indexer = NearestNeighbors(n_neighbors=20, metric='cosine', p=1, algorithm='brute', n_jobs=1)

In [None]:
indexer.fit(np.array(list(map(lambda x: list(sparse2full(x, topics_number)), tqdm(lda[[gdict.doc2bow(doc) for doc in answers_texts]])))))

In [None]:
indexer.fit(list(embed_doc(_) for _ in tqdm(answers_texts)))

In [1616]:
indexer.fit(list(vanilla_embed_doc(_) for _ in tqdm(answers_texts)))




NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=20, p=1, radius=1.0)

In [122]:
dists, names = indexer.kneighbors(np.array(list(map(lambda x: list(sparse2full(x, topics_number)), tqdm(lda[[gdict.doc2bow(doc) for doc in questions_texts[:10]]])))), n_neighbors=100)




In [950]:
dists, names = indexer.kneighbors(list(embed_doc(_) for _ in tqdm(questions_texts[:10])), n_neighbors=10)




In [1617]:
dists, names = indexer.kneighbors(list(vanilla_embed_doc(_) for _ in tqdm(questions_texts[:10])), n_neighbors=10)




In [1606]:
from itertools import islice

In [1618]:
for (name, question), dist_to, name_ids in zip(islice(raw_posts.iterrows(), 10), dists, names):
    print(question.Body)
    for dist, name in zip(dist_to, name_ids):
        print('\t %s, %s'%(dist, ', '.join(_.strip() for _ in answers_texts[name])))

<p>What does "backprop" mean? I've Googled it, but it's showing backpropagation.</p>

<p>Is the "backprop" term basically the same as "backpropagation" or does it have a different meaning?</p>

	 0.00675398, if, -PRON-, understand, that, post, right, (, -PRON-, just, skim, through, ,, so, -PRON-, be, possible, -PRON-, miss, some, detail, ), ,, -PRON-, be, use, several, predictor, on, the, input, with, several, rectangle, ., this, basically, mean, separate, ,, not, share, ,, weight, to, detect, each, rectangle, ., , -PRON-, be, fairly, likely, that, these, neuron, will, adjust, to, certain, area, on, the, input, image, ., as, far, as, the, neuron, be, concern, ,, -PRON-, be, ", punish, ", if, -PRON-, detect, the, wrong, rectangle, ,, as, if, the, other, rectangle, be, the, noise, that, the, network, should, learn, to, ignore, ., as, a, result, ,, the, ", left, ", predictor, learn, to, fire, more, actively, when, -PRON-, detect, a, rectangle, on, the, left, ,, because, there, be, high, c

In [1619]:
jaccards = []

for i, found in (enumerate(names)):
    jaccards.append(len(set(true_answers[i]) & set(found))/len(set(true_answers[i]) | set(found)))

In [1620]:
jaccards

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [175]:
vectors_questions = np.array(list(map(lambda x: list(sparse2full(x, topics_number)), tqdm(lda[[gdict.doc2bow(doc) for doc in questions_texts]]))))




In [1448]:
vectors_questions = np.array(list(vanilla_embed_doc(_) for _ in tqdm(questions_texts)))

In [957]:
ml_encoder = MultiLabelBinarizer()

In [958]:
labels = ml_encoder.fit_transform(posts.Tags.map(lambda x: [_ for _ in x.replace('>', ' ').replace('<', ' ').split() if tags_counter[_] > 5]))

In [959]:
kfold = KFold(vectors_questions.shape[0], n_folds=5)

In [960]:
def choose(arra):
    
    res = arra.copy()
    res[res>0.1] = 1
    
    print(arra)
    print(arra.shape)
    print(' '.join(map(str, res.sum(axis=1))))
    
    return res

In [None]:
estimations = defaultdict(list)

for i, (train_ids, test_ids) in enumerate(kfold):
    print('%s...'%i)
    classifier = RandomForestClassifier(n_estimators=100, n_jobs=12, max_depth=6, verbose=1)
#     classifier = KNeighborsClassifier(n_neighbors=10, metric='cosine', algorithm='brute', n_jobs=1,)
    classifier.fit(vectors_questions[train_ids, :], labels[train_ids, :])
    
    print('\t counting recall')
    estimations['recall'].append(recall_score(labels[test_ids, :], classifier.predict(vectors_questions[test_ids, :]), average='micro'))
    print('\t counting precision')
    estimations['precision'].append(precision_score(labels[test_ids, :], classifier.predict(vectors_questions[test_ids, :]), average='micro'))
    print('\t counting rocauc')
    estimations['roc-auc'].append(roc_auc_score(labels[test_ids, :], np.array(list(map(lambda x: x[:, -1], classifier.predict_proba(vectors_questions[test_ids, :])))).T, average='micro'))

In [1450]:
estimations

defaultdict(list,
            {'precision': [0.20792079207920791],
             'recall': [0.0025684931506849314],
             'roc-auc': [0.82721699711539309]})

In [1124]:
import fastText as vanilla_fastText

In [1165]:
from sklearn.cross_validation import train_test_split

In [1236]:
posts_train, posts_test = train_test_split(posts, test_size=0.1, random_state=42)

In [1422]:
for set_name, dataset in [('train', posts_train), ('test', posts_test)]:
    with open('exchange.posts.fastext.%s.txt'%set_name, 'w') as outcome:
        for name, row in dataset.iterrows():
            outcome.write('%s %s %s\n'%(
                ' '.join(['__label__%s'%_.replace(whitespace, '_') for _ in row.Tags.replace('>', ' ').replace('<', ' ').split()]),
                ' '.join(clean_token_fragile(_) for _ in row.body_processed),
                ' '.join(clean_token_fragile(_) for _ in chain.from_iterable(map(lambda x: x['Body'], row.answers_processed)))
            ))

In [1632]:
v_fsttxt = vanilla_fastText.train_supervised(
    'exchange.posts.fastext.train.txt', 
    dim=20, 
    neg=20,
    minn=3,
    maxn=5,
    wordNgrams=1,
    t=0.0001,
    loss='softmax',
#    pretrainedVectors='./_shared/wiki-news-300d-1M-subword.vec',
    ws=12,
    epoch=85)

print(ws)
print(v_fsttxt.test('exchange.posts.fastext.test.txt', k=3))

10
(1711, 0.2415741281901422, 0.27115678985348785)


In [1633]:
def vanilla_embed_doc(doc, model=v_fsttxt):
       
    return v_fsttxt.get_sentence_vector(' '.join(clean_token_fragile(_) for _ in doc))

In [1636]:
def vanilla_embed_doc(doc, model=v_fsttxt, tf_model=tfidf, gensim_dict=gdict, print_report=False, return_report=False):
    embs = []
    ws = 0
    
    report = ''
    
    for _ in doc:
        if _ in gdict.token2id:
            w = tf_model.idfs[gdict.token2id[_]]
        elif not is_trash_token(_):
            w = 2
        else:
            w=0.1
        try:
            report += ' %3.2f*%s'%(w, _)
            
            embs.append(w*model.get_word_vector(clean_token_fragile(_)))
            ws += w
        except KeyError:
            pass
    if print_report:
        print(report)
    if return_report:
        return np.sum(embs, axis=0)/ws, report
    return np.sum(embs, axis=0)/ws

In [1558]:
vwords_index = NearestNeighbors(algorithm='brute', metric='cosine')

In [1559]:
vwords_index.fit(v_fsttxt.get_input_matrix())

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [1564]:
vanilla_embed_doc('what is backpropagation'.split())

array([-0.1590645 ,  0.03249418,  0.22335471,  0.49984992,  0.0355899 ,
       -0.05659072,  0.14868015,  0.26408234,  0.24661049,  0.1608884 ], dtype=float32)

In [1562]:
vanilla_embed_doc('backpropagation is the same as backprob'.split())

array([-0.3252534 ,  0.0479237 ,  0.12801714,  0.36829898,  0.01719299,
       -0.03883877,  0.4273454 ,  0.10028217,  0.25121862,  0.34242961], dtype=float32)

In [1634]:
cosine(v_fsttxt.get_sentence_vector('what is backpropagation'), v_fsttxt.get_sentence_vector('backpropagation is the same as backprob'))

0.055971163409385238

In [1637]:
cosine(vanilla_embed_doc('what is backpropagation'.split()), vanilla_embed_doc('backpropagation is the same as backprob'.split()))

0.15402217531549145

In [1565]:
from scipy.spatial.distance import cosine

In [1553]:
wdists, wids = vwords_index.kneighbors([v_fsttxt.get_sentence_vector('what is backpropagation')], n_neighbors=20)
print('\n'.join('%3.3f\t %s'%(wdist, v_fsttxt.f.getVocab()[0][wid]) for wdist, wid in zip(wdists[0], wids[0]) if wid < len(v_fsttxt.get_words())))

0.054	 overweight
0.058	 simple


In [None]:
def build_vanilla_fasttext_similar_word_search():
    pass

In [1546]:
wdists, wids = vwords_index.kneighbors([v_fsttxt.get_word_vector('nlp')], n_neighbors=20)
print('\n'.join('%3.3f\t %s'%(wdist, v_fsttxt.f.getVocab()[0][wid]) for wdist, wid in zip(wdists[0], wids[0]) if wid < len(v_fsttxt.get_words())))

0.000	 nlp
0.015	 extract
0.017	 sentence
0.019	 word
0.027	 textual
0.028	 parse
0.028	 entity
0.029	 text
0.035	 processing
0.037	 university
0.040	 nltk


In [1547]:
v_fsttxt.test('exchange.posts.fastext.test.txt', k=3)

(1711, 0.2458601207870641, 0.275967636125082)

In [None]:
id2look = 379

tags = '#'+', #'.join(v_fsttxt.predict(' '.join(questions_texts[id2look]).replace('\n', ' '), k=5)[0])
print(tags)
print('='*len(tags))
print(posts.iloc[test_ids[id2look]].Body.replace('<p>','').replace('</p>', '\n'))
print(posts.iloc[test_ids[id2look]].Tags)

In [151]:
tags_counter = Counter(chain.from_iterable(posts.Tags.map(lambda x: x.replace('>', ' ').replace('<', ' ').split())))

hist = np.array(list(map(lambda x: x[-1], tags_counter.items())))

tags = np.array(list(map(lambda x: x[0], tags_counter.items())))

In [162]:
tags = tags[np.argsort(hist)][::-1]
hist = hist[np.argsort(hist)][::-1]