In [1]:
import os
import pickle 
import re
import gc

import numpy as np

from pymorphy2 import MorphAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from collections import defaultdict

# from pymongo import MongoClient

import tqdm

In [16]:
def read_stops(path='stops.txt'):
    """Load additional stop words from a text file, one word per line.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``'stops.txt'`` in the working directory,
        which is what the original zero-argument call used.

    Returns
    -------
    set of str
        Every non-empty line of the file with surrounding whitespace removed.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Decode explicitly instead of relying on the platform default encoding;
    # 'utf-8-sig' also tolerates a leading byte-order mark.
    # NOTE(review): assumes the file is stored as UTF-8 - confirm.
    with open(path, encoding='utf-8-sig') as in_file:
        stripped_lines = (line.strip() for line in in_file)
        # Blank lines would otherwise put the empty string into the stop set.
        return {word for word in stripped_lines if word}

In [20]:
# Tokenizer that keeps runs of word characters and discards everything else.
tokenizer = RegexpTokenizer(r'\w+')

# Morphological analyzer; stemming() uses it to obtain each word's normal form.
morpher = MorphAnalyzer()

# Hand-picked noise tokens that are filtered in addition to the NLTK lists.
custom_stops = {'br', 'ask', 'fm', 'http', 'https', 'www', 'ru', 'com', 'vk', 'view'}

russian_stopwords, english_stopwords = (
    set(stopwords.words(language)) for language in ('russian', 'english')
)

# Combined stop list: NLTK Russian + NLTK English + custom tokens + stops.txt.
stops = set().union(russian_stopwords, english_stopwords, custom_stops, read_stops())

In [18]:
# Human-readable labels for the numeric political-view codes 1..9.
# NOTE(review): presumably these are the codes of the VK profile "political"
# field - confirm against the data source.
polit_views = {
                1 : 'Communist',
                2 : 'Socialist',
                3 : 'Moderate',
                4 : 'Liberal',
                5 : 'Conservative',
                6 : 'Monarchist',
                7 : 'Ultraconservative',
                8 : 'Apathetic',
                9 : 'Libertarian'  # was misspelled 'Libertian'
                }

In [4]:
def create_or_load_ids_dict(db):
    """Return the two-way uid <-> positional-index mapping, cached on disk.

    When ``ids_indices_dict.pickle`` exists in the working directory it is
    unpickled and returned as is. Otherwise the mapping is built from
    ``db.users.find()``, written to that file and returned.

    The result is a single dict holding both directions: ``uid -> index``
    and ``index -> uid``. Because both key spaces share one dict, a uid that
    is numerically equal to some index has its ``uid -> index`` entry
    overwritten by the ``index -> uid`` entry.

    Parameters
    ----------
    db : object
        Only ``db.users.find()`` is used; each yielded record must have a
        ``'uid'`` key. Not touched when the cache file exists.
    """
    cache_path = 'ids_indices_dict.pickle'

    if not os.path.exists(cache_path):
        uids = [record['uid'] for record in db.users.find()]

        # Forward direction first, then the reverse direction on top of it.
        mapping = {uid: position for position, uid in enumerate(uids)}
        mapping.update(enumerate(uids))

        with open(cache_path, 'wb') as handle:
            pickle.dump(mapping, handle)

        return mapping

    with open(cache_path, 'rb') as handle:
        return pickle.load(handle)

def create_or_load_users_to_posts(db, ids_indices_dict):
    """Return the concatenated wall-post text per user index, cached on disk.

    When ``users_to_posts.pickle`` exists in the working directory it is
    unpickled and returned unchanged. A cache written before posts were
    separated by spaces therefore stays as it was until the file is deleted.
    Otherwise every record of ``db.wall_posts.find()`` is grouped by
    ``ids_indices_dict[record['from_id']]`` and the texts of each group are
    joined with a single space.

    Parameters
    ----------
    db : object
        ``db.wall_posts.find()`` and ``db.wall_posts.count()`` are used; each
        record must provide ``'from_id'`` and ``'text'``.
    ids_indices_dict : dict
        Mapping looked up with each post's ``'from_id'``.

    Returns
    -------
    collections.defaultdict(str)
        Joined post text per mapped author; unknown keys yield ``''``.

    Raises
    ------
    KeyError
        If a post's ``'from_id'`` is missing from ``ids_indices_dict``.
    """
    if os.path.exists('users_to_posts.pickle'):
        # NOTE: unpickling runs arbitrary code; only load cache files this
        # notebook produced itself.
        with open('users_to_posts.pickle', 'rb') as handle:
            return pickle.load(handle)

    # Collect the pieces first and join once: this avoids repeated string
    # re-allocation and lets posts be separated by a space, so the last word
    # of one post is no longer glued to the first word of the next.
    posts_by_user = defaultdict(list)

    for user_post in tqdm.tqdm_notebook(db.wall_posts.find(), total=db.wall_posts.count()):
        posts_by_user[ids_indices_dict[user_post['from_id']]].append(user_post['text'])

    users_to_posts = defaultdict(str)
    for user_index, posts in posts_by_user.items():
        users_to_posts[user_index] = ' '.join(posts)

    with open('users_to_posts.pickle', 'wb') as handle:
        pickle.dump(users_to_posts, handle)

    return users_to_posts

def is_number(s):
    """Tell whether ``float(s)`` succeeds.

    Anything ``float`` accepts counts as a number, which includes strings
    such as ``'1e5'``, ``'nan'`` and ``'inf'``.

    Parameters
    ----------
    s : object
        Value to probe, normally a token string.

    Returns
    -------
    bool
        ``True`` if the conversion works, ``False`` if it raises
        ``TypeError``, ``ValueError`` or ``OverflowError``.
    """
    try:
        float(s)
    # Only conversion failures mean "not a number"; KeyboardInterrupt and
    # SystemExit must keep propagating instead of being reported as False.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def stemming(text):
    """Tokenize, filter and normalize ``text``; return the words space-joined.

    Steps: split with the module-level ``tokenizer``, lower-case every token,
    drop tokens that are ``id<digits>`` references, numbers (per
    ``is_number``) or members of ``stops``, then replace each remaining token
    by the ``normal_form`` of the first parse from ``morpher``.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    str
        Normalized tokens joined by single spaces (``''`` if none survive).
    """
    def is_valid(w):
        # Match only "id" followed by digits. A plain startswith('id') test
        # also discarded ordinary words such as "idea" or "ideal".
        return not (re.fullmatch(r'id\d+', w) or is_number(w) or w in stops)

    lowered = (token.lower() for token in tokenizer.tokenize(text))

    # The stop-word check runs on the lower-cased surface form, before
    # normalization.
    return ' '.join(
        morpher.parse(word)[0].normal_form for word in lowered if is_valid(word)
    )

def process_user(db, uid, users_to_posts, ids_indices_dict):
    """Gather all text known for one user and return it after ``stemming``.

    Sources, in order:

    1. For every ``db.links`` record of the user, every url in its
       ``'links'`` list is looked up in ``db.links_content``; which fields
       are taken depends on the content record's ``'type'``.
    2. The free-text fields of every ``db.user_info`` record of the user.
    3. The user's wall posts, ``users_to_posts[ids_indices_dict[uid]]``.

    All fragments are separated by a space. Previously the joined answers
    and tags were appended without a leading separator, so their first item
    was glued to the preceding word.

    Parameters
    ----------
    db : object
        Provides ``links``, ``links_content`` and ``user_info``, each with a
        ``find(query)`` method.
    uid : hashable
        User id used for the queries and as key of ``ids_indices_dict``.
    users_to_posts : mapping
        Post text indexed by ``ids_indices_dict[uid]``.
    ids_indices_dict : mapping
        Maps ``uid`` to the key used in ``users_to_posts``.

    Returns
    -------
    str
        Result of ``stemming`` applied to the collected text.

    Raises
    ------
    KeyError
        If a content record lacks a field required for its type, or ``uid``
        is missing from ``ids_indices_dict``.
    """
    profile_fields = ('about', 'quotes', 'activities', 'interests',
                      'music', 'movies', 'tv', 'books')

    def not_none(value):
        return value if value is not None else ''

    def joined(values):
        # Items are joined one by one; a missing (None) collection yields ''.
        return ' '.join(not_none(values))

    parts = []

    for user_links in db.links.find({'uid': uid}):
        for user_link in user_links['links']:
            for link in db.links_content.find({'url': user_link}):
                link_type = link['type']
                if link_type in ('sprashivai', 'ask'):
                    parts.append(joined(link['answers']))
                elif link_type in ('livejournal', 'pikabu'):
                    parts.append(not_none(link['title']))
                    parts.append(not_none(link['text']))
                elif link_type == 'youtube':
                    parts.append(not_none(link['description']))
                    parts.append(joined(link['tags']))
                    parts.append(not_none(link['name']))
                elif link_type == 'ali':
                    parts.append(not_none(link['name']))
                elif link_type == 'unknown':
                    parts.append(not_none(link['description']))
                    parts.append(not_none(link['title']))
                # Any other type contributes no text.

    for info in db.user_info.find({'uid': uid}):
        # Absent fields and fields stored as None both count as empty text.
        parts.extend(not_none(info.get(field)) for field in profile_fields)

    parts.append(users_to_posts[ids_indices_dict[uid]])

    return stemming(' '.join(parts))

In [5]:
# client = MongoClient()
# db = client.ir_project

In [6]:
def read_or_create_texts():
    """Return ``{uid: processed text}`` for all users, cached on disk.

    If ``users_texts.bin`` can be unpickled, its content is returned.
    Otherwise the texts are rebuilt with ``process_user`` for every user in
    ``db.users``, written to ``users_texts.bin`` and returned.

    The rebuild path reads the module-level name ``db``, which must be
    defined before calling this function on a cache miss.

    Returns
    -------
    dict
        Processed text keyed by user id.
    """
    cache_path = 'users_texts.bin'

    try:
        with open(cache_path, 'rb') as in_file:
            return pickle.load(in_file)
    # Only a missing, unreadable or corrupt cache triggers a rebuild; a bare
    # `except:` would also swallow KeyboardInterrupt and programming errors.
    except (OSError, EOFError, pickle.UnpicklingError):
        pass

    ids = create_or_load_ids_dict(db)
    users_to_posts = create_or_load_users_to_posts(db, ids)

    # `ids` maps uid -> index AND index -> uid, so iterating its keys would
    # process every positional index as if it were a user. Take the real
    # uids from the users collection and keep those present in the mapping.
    user_ids = [user['uid'] for user in db.users.find() if user['uid'] in ids]

    users_texts = dict()
    for uid in tqdm.tqdm_notebook(user_ids, total=len(user_ids)):
        users_texts[uid] = process_user(db, uid, users_to_posts, ids)

    with open(cache_path, 'wb') as out:
        pickle.dump(users_texts, out)

    return users_texts

In [7]:
# Processed text per user: read from users_texts.bin, or rebuilt when that
# file cannot be loaded.
users_texts = read_or_create_texts()

In [8]:
# Keep only the users whose processed text is non-empty.
users_texts = dict(filter(lambda item: len(item[1]) > 0, users_texts.items()))

In [9]:
# Corpus for the vectorizer: one document per remaining user, in dict order.
only_texts = list(users_texts.values())

In [10]:
# Number of highest-weighted words printed for each topic.
TOP_WORDS = 10
# Number of topics the LDA model is fitted with.
TOPICS_COUNT = 20

In [11]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic #<i>:`` header is
    printed, followed by one line with the ``n_top_words`` feature names of
    largest weight in descending order. A single blank line is printed after
    the last topic.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one weight row per topic).
    feature_names : sequence of str
        Feature name for each column index of ``components_``.
    n_top_words : int
        How many names to list per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # Ascending argsort, reversed, then cut to the requested length.
        strongest = weights.argsort()[::-1][:n_top_words]
        print(" ".join(feature_names[column] for column in strongest))
    print()

In [21]:
# Bag-of-words counts for the corpus. Terms occurring in fewer than 7
# documents or in more than 90% of them are dropped, as are the stop words.
# stop_words is documented as a list and recent scikit-learn versions reject
# other container types, so the set is converted; sorting only makes the
# order deterministic and does not change which words are removed.
tf_vectorizer = CountVectorizer(min_df=7, max_df=0.9, stop_words=sorted(stops))
tf = tf_vectorizer.fit_transform(only_texts)

In [22]:
# Size of the document-term matrix: (number of documents, vocabulary size).
print(tf.shape)

(44074, 104334)


In [23]:
# Fit an online-learning LDA topic model with TOPICS_COUNT topics on the
# count matrix; random_state pins the result, verbose=1 prints one line per
# iteration, n_jobs=10 asks for ten parallel workers.
# NOTE(review): newer scikit-learn releases name the topic-count argument
# `n_components` instead of `n_topics` - confirm against the installed version.
lda = LatentDirichletAllocation(n_topics=TOPICS_COUNT, max_iter=10,
                                learning_method='online', learning_offset=50.,
                                random_state=0, verbose=1, n_jobs=10).fit(tf)



iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [24]:
# Map matrix columns back to vocabulary terms and list the TOP_WORDS
# strongest terms of every fitted topic.
# NOTE(review): newer scikit-learn releases provide `get_feature_names_out()`
# in place of `get_feature_names()` - confirm against the installed version.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, TOP_WORDS)

Topic #0:
заклинание орхидея раб магия боязнь противник гарри тысяча порча проклятие
Topic #1:
репост запись конкурс группа наш сделать весь победитель место розыгрыш
Topic #2:
d0 amp альбом co d1 музыка music песня английский love
Topic #3:
любить секс украина club59394712 ти го мина ми мена що
Topic #4:
ряд плата лицо amp петля сбн utm_source utm_medium utm_campaign social
Topic #5:
весь это человек свой жизнь любить который хотеть мочь друг
Topic #6:
год весь это который день работа рубль наш свой россия
Topic #7:
игра получить уровень amp app3882511 набрать бонус очки заходить ad_id
Topic #8:
спасибо весь день свадьба москва лето фото instagram любимый очень
Topic #9:
ребёнок весь год это очень мочь дом помочь помощь собака
Topic #10:
это нога рука растение упражнение который год весь вода каждый
Topic #11:
это свой который человек мочь весь ваш жизнь время самый
Topic #12:
gt открытка свой отправить друг фото сделать смотреть новый vkontakte
Topic #13:
масло год литр соль минута д