In [35]:
import os
import pickle 
import re
import gc

import numpy as np

from pymorphy2 import MorphAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

try:
    from pymongo import MongoClient
except:
    # there is no mongo on  AU server
    pass

from collections import defaultdict

import tqdm

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def read_stops():
    res = set()
    with open('stops.txt') as in_file:
        for line in in_file:
            res.add(line.strip())
    
    return res

In [3]:
morpher = MorphAnalyzer()

russian_stopwords = set(stopwords.words('russian'))
english_stopwords = set(stopwords.words('english'))
custom_stops = {'br', 'ask', 'fm', 'http', 'https', 'www', 'ru', 'com', 'vk', 'view',
                'vkontakte', 'd1', 'd0', 'amp', 'utm_source',  'utm_medium', 'utm_campaign'}

stops = russian_stopwords | english_stopwords | custom_stops | read_stops()

tokenizer = RegexpTokenizer(r'\w+')

In [4]:
polit_views = {
                1 : 'Communist',
                2 : 'Socialist',
                3 : 'Moderate',
                4 : 'Liberal',
                5 : 'Conservative',
                6 : 'Monarchist',
                7 : 'Ultraconservative',
                8 : 'Apathetic',
                9 : 'Libertian'
                }

In [21]:
def create_or_load_ids_dict(db):
    if os.path.exists('ids_indices_dict.pickle'):
        with open('ids_indices_dict.pickle', 'rb') as handle:
            return pickle.load(handle)

    ids = [user['uid'] for user in db.users.find()]
    indices = list(range(len(ids)))

    dictionary = dict(zip(ids, indices))
    dictionary.update(zip(indices, ids))

    with open('ids_indices_dict.pickle', 'wb') as handle:
        pickle.dump(dictionary, handle)

    return dictionary

def create_or_load_users_to_posts(db, ids_indices_dict):
    if os.path.exists('users_to_posts.pickle'):
        with open('users_to_posts.pickle', 'rb') as handle:
            return pickle.load(handle)

    users_to_posts = defaultdict(str)

    for user_post in tqdm.tqdm_notebook(db.wall_posts.find(), total=db.wall_posts.count()):
        users_to_posts[ids_indices_dict[user_post['from_id']]] += user_post['text']

    with open('users_to_posts.pickle', 'wb') as handle:
        pickle.dump(users_to_posts, handle)

    return users_to_posts

def is_number(s):
    try:
        float(s)
        return True
    except:
        return False

def is_valid(w):
    return not(w.startswith('id') or is_number(w) or w in stops 
               or w.startswith('club') or w.startswith('app') or set(w) == {'_'})
    
def stemming(text):    
    words = [morpher.parse(w.lower())[0].normal_form for w in tokenizer.tokenize(text)]
    words = [word for word in words if is_valid(word)]
    
    return ' '.join(words)

def links_content_map():
    res = defaultdict(list)
    
    for link in db.links_content.find():
        res[link['url']].append(link)
    
    return res


def user_links_map():
    res = defaultdict(list)
    
    for user_links in db.links.find():
        res[user_links['uid']].append(user_links)
    
    return res


def process_user(db, uid, users_to_posts, ids_indices_dict, links_content, user_links_m):
    text = ''

    def not_none(value):
        return value if value is not None else ''

    for user_links in user_links_m[uid]:
        for user_link in user_links['links']:
            for link in links_content[user_link]:
                if link['type'] == 'sprashivai':
                    text += ' '.join(not_none(link['answers']))
                elif link['type'] == 'livejournal' or link['type'] == 'pikabu':
                    text += ' ' + not_none(link['title'])
                    text += ' ' + not_none(link['text'])
                elif link['type'] == 'youtube':
                    text += ' ' + not_none(link['description'])
                    text += ' '.join(not_none(link['tags']))
                    text += ' ' + not_none(link['name'])
                elif link['type'] == 'ali':
                    text += ' ' + not_none(link['name'])
                elif link['type'] == 'ask':
                    text += ' '.join(not_none(link['answers']))
                elif link['type'] == 'unknown':
                    text += ' ' + not_none(link['description'])
                    text += ' ' + not_none(link['title'])
                    
    for u in db.user_info.find({'uid':uid}):
        u =  defaultdict(str, u)
        
        text += ' ' + u['about']
        text += ' ' + u['quotes']
        text += ' ' + u['activities']
        text += ' ' + u['interests']
        text += ' ' + u['music']
        text += ' ' + u['movies']
        text += ' ' + u['tv']
        text += ' ' + u['books']
        
    text += ' ' + users_to_posts[ids_indices_dict[uid]]
    
    return stemming(text)

In [6]:
client = MongoClient()
db = client.ir_project
ids = create_or_load_ids_dict(db)
users_to_posts = create_or_load_users_to_posts(db, ids)

In [22]:
def read_or_create_texts():
    try:
        with open('users_texts.bin', 'rb') as in_file:
            return pickle.load(in_file)
    except:
        links_content = links_content_map()
        user_links = user_links_map()
        
        print('Total links parsed: {}'.format(len(links_content)))
        
        users_texts = dict()

        for u in tqdm.tqdm_notebook(db.users.find(no_cursor_timeout=True), total=db.users.count()):
            users_texts[u['uid']] = process_user(db, u['uid'], users_to_posts, ids, links_content, user_links)
            
        with open('users_texts.bin', 'wb') as out:
            pickle.dump(users_texts, out)
            
        return users_texts

In [23]:
users_texts = read_or_create_texts()

Total links parsed: 62124






In [24]:
users_texts = {uid:text for uid, text in users_texts.items() if len(text) > 0}

In [25]:
only_texts = [t for t in users_texts.values()]

In [26]:
TOP_WORDS = 10
TOPICS_COUNT = 25

In [27]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [28]:
tf_vectorizer = CountVectorizer(min_df = 7, max_df = 0.9, stop_words=stops)
tf = tf_vectorizer.fit_transform(only_texts)

In [29]:
print(tf.shape)

(43988, 103106)


In [31]:
lda = LatentDirichletAllocation(n_topics=TOPICS_COUNT, max_iter=40,
                                learning_method='online', learning_offset=50.,
                                random_state=0, verbose=1, n_jobs=1).fit(tf)



iteration: 1 of max_iter: 40
iteration: 2 of max_iter: 40
iteration: 3 of max_iter: 40
iteration: 4 of max_iter: 40
iteration: 5 of max_iter: 40
iteration: 6 of max_iter: 40
iteration: 7 of max_iter: 40
iteration: 8 of max_iter: 40
iteration: 9 of max_iter: 40
iteration: 10 of max_iter: 40
iteration: 11 of max_iter: 40
iteration: 12 of max_iter: 40
iteration: 13 of max_iter: 40
iteration: 14 of max_iter: 40
iteration: 15 of max_iter: 40
iteration: 16 of max_iter: 40
iteration: 17 of max_iter: 40
iteration: 18 of max_iter: 40
iteration: 19 of max_iter: 40
iteration: 20 of max_iter: 40
iteration: 21 of max_iter: 40
iteration: 22 of max_iter: 40
iteration: 23 of max_iter: 40
iteration: 24 of max_iter: 40
iteration: 25 of max_iter: 40
iteration: 26 of max_iter: 40
iteration: 27 of max_iter: 40
iteration: 28 of max_iter: 40
iteration: 29 of max_iter: 40
iteration: 30 of max_iter: 40
iteration: 31 of max_iter: 40
iteration: 32 of max_iter: 40
iteration: 33 of max_iter: 40
iteration: 34 of ma

In [32]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, TOP_WORDS)

Topic #0:
репост сделать запись конкурс группа победитель место друг розыгрыш свой
Topic #1:
50рубль рубль 40рубль 100рубль 300рубль 200рубль 30рубль 120рубль 250рубль 10рубль
Topic #2:
дом автомобиль квартира ремонт машина метр комната дверь система материал
Topic #3:
любить свой друг хороший самый любовь знать женщина мужчина большой
Topic #4:
святой секс украина господь бог молиться ангел ти мина ми
Topic #5:
игра gt получить уровень набрать бонус заходить открытка очки новый
Topic #6:
аниме дрова сорняк пётр story двор pikabu идти кун маркер
Topic #7:
мир русский война россия стать свой страна история земля век
Topic #8:
фильм жанр сша история хороший дом любовь самый комедия реж
Topic #9:
рубль цена купить магазин cc заказ товар скидка доставка сайт
Topic #10:
ряд плата лицо петлить сантиметр вязать петля лицевой спица сторона
Topic #11:
язык слово английский буква урок ctrl страница текст alt извинить
Topic #12:
работа рубль свой компания сайт проект работать россия деньга бизнес

In [33]:
with open('vect.bin', 'wb') as out:
    pickle.dump(tf_vectorizer, out)
with open('lda.bin', 'wb') as out:
    pickle.dump(lda, out)

In [34]:
topics = lda.transform(tf)

In [36]:
topics_tsne = TSNE(n_components=2).fit_transform(topics)
topics_pca = PCA(n_components=2).fit_transform(topics)