In [1]:
import os
import pickle 
import re

from pymorphy2 import MorphAnalyzer

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from collections import defaultdict

from pymongo import MongoClient

import tqdm

In [2]:
# Morphological analyzer used by `stemming` to reduce tokens to their normal form.
morpher = MorphAnalyzer()

russian_stopwords = set(stopwords.words('russian'))
english_stopwords = set(stopwords.words('english'))
# Extra tokens to discard; presumably URL / markup residue ('http', 'www', 'br', ...).
custom_stops = {'br', 'ask', 'fm', 'http', 'https', 'www', 'ru', 'com', 'vk', 'view'}

# A token is a stopword if it appears in ANY of the three sets.
# The Russian and English lists have essentially nothing in common, so
# intersecting them would leave only `custom_stops` in effect.
stops = russian_stopwords | english_stopwords | custom_stops

# Tokens are maximal runs of word characters; everything else is a separator.
tokenizer = RegexpTokenizer(r'\w+')

In [3]:
# Human-readable labels for the numeric political-view codes 1-9.
# NOTE(review): the codes presumably mirror the VK profile field
# `personal.political` -- confirm against the data source.
polit_views = {
    1: 'Communist',
    2: 'Socialist',
    3: 'Moderate',
    4: 'Liberal',
    5: 'Conservative',
    6: 'Monarchist',
    7: 'Ultraconservative',
    8: 'Apathetic',
    9: 'Libertarian',
}

In [4]:
def create_or_load_ids_dict(db):
    """Return the bidirectional uid <-> index lookup table, building it once.

    If ``ids_indices_dict.pickle`` exists in the working directory it is
    unpickled and returned as-is (``db`` is not touched). Otherwise every
    document of ``db.users`` is read, each ``uid`` is numbered in iteration
    order starting from 0, the table is pickled to that file and returned.

    The result is ONE dict holding both directions: ``uid -> index`` and
    ``index -> uid``. Index entries are written last, so a uid that happens
    to equal some index is overwritten by the ``index -> uid`` entry.

    NOTE: the cache is read with ``pickle.load``; only use a cache file you
    created yourself.
    """
    cache_path = 'ids_indices_dict.pickle'

    if not os.path.exists(cache_path):
        uid_list = [user['uid'] for user in db.users.find()]

        # Forward direction first, then the reverse one on top of it.
        lookup = {uid: position for position, uid in enumerate(uid_list)}
        lookup.update({position: uid for position, uid in enumerate(uid_list)})

        with open(cache_path, 'wb') as handle:
            pickle.dump(lookup, handle)

        return lookup

    with open(cache_path, 'rb') as handle:
        return pickle.load(handle)

def create_or_load_users_to_posts(db, ids_indices_dict):
    """Return the wall-post text of every user, keyed by user index.

    If ``users_to_posts.pickle`` exists in the working directory it is
    unpickled and returned unchanged. Otherwise all documents of
    ``db.wall_posts`` are scanned, the ``text`` of each post is attributed to
    ``ids_indices_dict[post['from_id']]``, and the result is pickled to that
    file.

    Posts of one user are joined with a single space so that the last word
    of one post cannot fuse with the first word of the next.

    Args:
        db: database handle exposing a ``wall_posts`` collection.
        ids_indices_dict: mapping that translates a post's ``from_id`` into
            the key used in the result.

    Returns:
        ``defaultdict(str)``: user index -> concatenated post text; users
        without posts yield ``''``.

    Raises:
        KeyError: if a post's ``from_id`` is missing from ``ids_indices_dict``.

    NOTE: the cache is read with ``pickle.load``; only use a cache file you
    created yourself.
    """
    cache_path = 'users_to_posts.pickle'

    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as handle:
            return pickle.load(handle)

    wall_posts = db.wall_posts
    # NOTE(review): Collection.count() is deprecated in newer PyMongo releases
    # in favour of count_documents({}); kept for the driver version this
    # notebook was written against. It only feeds the progress bar.
    total_posts = wall_posts.count()

    # Collect fragments per user and join once, instead of repeated `+=`.
    posts_by_user = defaultdict(list)
    for user_post in tqdm.tqdm_notebook(wall_posts.find(), total=total_posts):
        posts_by_user[ids_indices_dict[user_post['from_id']]].append(user_post['text'])

    users_to_posts = defaultdict(str)
    for user_index, posts in posts_by_user.items():
        users_to_posts[user_index] = ' '.join(posts)

    with open(cache_path, 'wb') as handle:
        pickle.dump(users_to_posts, handle)

    return users_to_posts

def is_number(s):
    """Return True if ``float(s)`` succeeds, False otherwise.

    Anything ``float`` accepts counts as a number, which includes forms such
    as ``'1e5'``, ``'nan'`` and ``'inf'``. Values ``float`` rejects
    (non-numeric strings, ``None``, integers too large for a float) give
    False.

    Only conversion failures are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` propagate and a long-running loop stays interruptible.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def stemming(text):
    """Turn raw text into a space-separated string of normalized tokens.

    The text is split with the module-level ``tokenizer`` and lower-cased.
    Tokens are dropped when they are in ``stops``, when ``is_number`` accepts
    them, or when they look like a user id (``id`` followed only by digits).
    Each remaining token is replaced by the ``normal_form`` of its first
    ``morpher.parse`` result.

    Filtering happens on the lower-cased token, before normalization.

    Returns:
        str: the surviving normal forms joined by single spaces (``''`` when
        nothing survives).
    """
    def is_valid(w):
        # Match only id-like tokens such as 'id12345'; ordinary words that
        # merely begin with 'id' ('idea', 'identity') must be kept.
        return not (re.fullmatch(r'id\d+', w) or is_number(w))

    lowered = (w.lower() for w in tokenizer.tokenize(text))
    normal_forms = [
        morpher.parse(word)[0].normal_form
        for word in lowered
        if word not in stops and is_valid(word)
    ]

    return ' '.join(normal_forms)

# For each handled `links_content` type: the fields whose text is collected,
# in the order they are appended. Other types contribute nothing.
_LINK_TEXT_FIELDS = {
    'sprashivai': ('answers',),
    'livejournal': ('title', 'text'),
    'pikabu': ('title', 'text'),
    'youtube': ('description', 'tags', 'name'),
    'ali': ('name',),
    'ask': ('answers',),
    'unknown': ('description', 'title'),
}

# Free-text fields collected from each `user_info` document, in order.
_PROFILE_TEXT_FIELDS = (
    'about', 'quotes', 'activities', 'interests',
    'music', 'movies', 'tv', 'books',
)


def _text_fragments(value):
    """Flatten a stored field (missing/None, a string, or a sequence of strings) into a list of strings."""
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    # NOTE(review): assumes sequence items are strings -- confirm against the stored data.
    return [item for item in value if item]


def process_user(db, uid, users_to_posts, ids_indices_dict):
    """Gather all text known for one user and return it normalized.

    Sources, in order:
      1. for every URL in the user's ``db.links`` documents, the matching
         ``db.links_content`` documents (fields depend on the document's
         ``type``);
      2. the free-text fields of the user's ``db.user_info`` documents;
      3. the user's wall posts, ``users_to_posts[ids_indices_dict[uid]]``.

    Every fragment is separated from its neighbours by a space, so list
    fields (``answers``, ``tags``) no longer fuse with the text before them.
    Missing or ``None`` fields are skipped instead of raising.

    Args:
        db: database handle exposing ``links``, ``links_content`` and
            ``user_info`` collections.
        uid: user id to look up.
        users_to_posts: mapping of user index -> wall-post text.
        ids_indices_dict: mapping used to translate ``uid`` to its index.

    Returns:
        str: the result of ``stemming`` applied to the collected text.

    Raises:
        KeyError: if ``uid`` is not present in ``ids_indices_dict``.
    """
    fragments = []

    for user_links in db.links.find({'uid': uid}):
        for user_link in user_links.get('links') or ():
            for link in db.links_content.find({'url': user_link}):
                for field in _LINK_TEXT_FIELDS.get(link.get('type'), ()):
                    fragments.extend(_text_fragments(link.get(field)))

    for profile in db.user_info.find({'uid': uid}):
        for field in _PROFILE_TEXT_FIELDS:
            fragments.extend(_text_fragments(profile.get(field)))

    # .get() avoids inserting empty entries when `users_to_posts` is a defaultdict.
    fragments.append(users_to_posts.get(ids_indices_dict[uid], ''))

    return stemming(' '.join(fragments))

In [5]:
# Connect to MongoDB with the driver's default settings and select the
# `ir_project` database that the helper functions query.
client = MongoClient()
db = client['ir_project']

In [6]:
# uid <-> index lookup table (read from its pickle cache when present).
ids = create_or_load_ids_dict(db=db)
# Wall-post text per user index (read from its pickle cache when present).
users_to_posts = create_or_load_users_to_posts(db=db, ids_indices_dict=ids)

In [8]:
# Normalized text per user, keyed by uid.
# `ids` stores both uid -> index and index -> uid entries, so iterating its
# keys would also process every index as if it were a uid (twice the work,
# junk entries, and a progress total that does not match). Take the uids
# from the users collection and skip any that the lookup table lacks.
user_uids = [
    user['uid']
    for user in db.users.find({}, {'uid': 1})
    if user['uid'] in ids
]

users_texts = dict()
for uid in tqdm.tqdm_notebook(user_uids, total=len(user_uids)):
    users_texts[uid] = process_user(db, uid, users_to_posts, ids)

KeyboardInterrupt: 