In [4]:
# coding: utf-8

from collections import defaultdict
from datetime import datetime
import time
import json

import gensim

import numpy as np
import pandas as pd

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import pymorphy2

In [3]:
!echo $PYTHONPATH

/root/python:/root/python:


In [5]:
%%time
model = gensim.models.KeyedVectors.load_word2vec_format('web_0_300_20.bin.gz', binary=True)

CPU times: user 13 s, sys: 336 ms, total: 13.3 s
Wall time: 13.3 s


In [6]:
%%time
w2v = dict(zip(model.index2word, model.syn0))

CPU times: user 148 ms, sys: 40 ms, total: 188 ms
Wall time: 188 ms


In [7]:
morph = pymorphy2.MorphAnalyzer()

In [8]:
sentenses = [unicode('мама мыла раму папа мыл паркет', encoding='utf-8'),
             unicode('ехал грека через реку видит грека в реке хуй', encoding='utf-8')]

In [9]:
mapping = {
    'ADJF': '_ADJ',
    'NOUN': '_NOUN',
    'INFN': '_VERB',
}

def prepare_word(w, mapping, word2vec_model):
    w = morph.parse(w)[0]
    if w.tag.POS in mapping:
        new_w = '{}{}'.decode('utf-8').format(w.normal_form, mapping[w.tag.POS])
        v = vector(new_w, word2vec_model)  
        return v, new_w
    return (None, None)

def w2v_transformation(data, word2vec_model):
    """
    transform the data - it can be pandas format or just array
    :param data: data - pandas, numpy, list
    :param word2vec_model
    :return: transformed_data
    """
    dim = len(word2vec_model.itervalues().next())

    train_w2v_tfidf = []
    for i, s in enumerate(data):
        words = s.split(' ')

        words_vector = None
        words_count = 0

        new_s = []
        sentence_vector = []
        for w in words:
            v, new_w = prepare_word(w, mapping, word2vec_model)
            if new_w:
                new_s.append(new_w)
            
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(new_s)
            
        max_idf = max(tfidf.idf_)
        word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()]
        )
            
        train_w2v_tfidf.append(
            np.array(
                np.mean(
                    [word2vec_model[w] * word2weight[w] for w in new_s if w in word2vec_model]
                    or [np.zeros(dim)],
                    axis=0
                )
            )
        )

#     train_vectors = np.array(train_vectors)
    return np.array(train_w2v_tfidf)

In [10]:
def vector(q, word2vec_model):
    """
    transform a single word to word2vec model - 500 digits
    :param q: just a word from w2v vocabulary
    :param word2vec_model: word2vec model
    :return: vector from 500 digits, numpy
    """
    qf = q

    if q not in word2vec_model:
        candidates_set = set()

        candidates_set.add(q.upper())
        candidates_set.add(q.lower())
        candidates_set.add(q.capitalize())

        no_results = True
        for candidate in candidates_set:
            if candidate in word2vec_model:
                qf = candidate
                no_results = False
                break

        if no_results:
            # obvious that not all elements
            # in corpus will from our vocabulary
            return None

    raw_vector = word2vec_model[qf]
    return raw_vector

In [11]:
w2v_tfidf = w2v_transformation([sentenses[0]], w2v)

In [12]:
o = json.loads("""{
    "friends": {
        "count": 0,
        "items": []
    },
    
    "basic": {
        "id": 30343797,
        "first_name": "Artash",
        "last_name": "Muradyan",
        "sex": 2,
        "country": {
            "id": 1,
            "title": "Russia"
        },
        "photo": "https://vk.com/images/camera_50.png",
        "university": 0,
        "university_name": "",
        "faculty": 0,
        "faculty_name": "",
        "graduation": 0,
        "home_town": "",
        "relation": 0,
        "about": ""
    },
    "wall": {
        "count": 0,
        "items": []
    },
    "id": 30343797,
    "groups": null
}""")

In [13]:
from pprint import pprint

pprint(o)

{u'basic': {u'about': u'',
            u'country': {u'id': 1, u'title': u'Russia'},
            u'faculty': 0,
            u'faculty_name': u'',
            u'first_name': u'Artash',
            u'graduation': 0,
            u'home_town': u'',
            u'id': 30343797,
            u'last_name': u'Muradyan',
            u'photo': u'https://vk.com/images/camera_50.png',
            u'relation': 0,
            u'sex': 2,
            u'university': 0,
            u'university_name': u''},
 u'friends': {u'count': 0, u'items': []},
 u'groups': None,
 u'id': 30343797,
 u'wall': {u'count': 0, u'items': []}}


In [14]:
name_mapping = {}
interest_mapping = {}
name_counter = 0


def get_main_ohe(o):
    v = []
    
    basics = o['basic']
    
    v.append(basics['id'])
    
#     if basics['first_name'] in mapping:
#         v.append(mapping[basics['first_name']])
#     else:
#         name_counter += 1
#         mapping[basics['first_name']] = name_counter
#         v.append(name_counter)
        
    v.append(basics['sex'])
    
    city = basics['city']['id'] if 'city' in basics else -1
    v.append(city)

    country = basics['country']['id'] if 'country' in basics else -1
    v.append(country)
    
    photo = 1 if 'photo' in basics else 0
    v.append(photo)
    
    birthdate = (time.mktime(datetime.strptime(basics['bdate'], '%d.%m.%Y').timetuple())
                 if 'bdate' in basics else 788907600.0)
    v.append(birthdate)
    
    relation = basics['relation'] if 'relation' in basics else -1
    v.append(relation)
    
    university = basics['university'] if 'university' in basics else -1
    
    about = w2v_transformation(basics['about'], w2v) if 'about' in basics else np.zeros
    if not about:
        about = np.zeros((300,))

    v += list(about)
    
    return np.array(v)

In [17]:
!echo -en $(curl http://138.201.80.213:5005/get_user_info?uid=25553)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   452  100   452    0     0    858      0 --:--:-- --:--:-- --:--:--   857
-en {"interests": ["\u042e\u043c\u043e\u0440", "\u041d\u043e\u0432\u043e\u0441\u0442\u0438", "\u041b\u0438\u0442\u0435\u0440\u0430\u0442\u0443\u0440\u0430", "\u041f\u0443\u0442\u0435\u0448\u0435\u0441\u0442\u0432\u0438\u044f", "\u041c\u0443\u0437\u044b\u043a\u0430"], "mbti": {"probs": {"intro_extra": 0.2446371465921402, "sensor_intuit": 0.25243809819221497, "logic_ethic": 0.049992892891168594, "irratio_ratio": 0.41522619128227234}, "psy_type": "ENFP"}}


In [15]:
def datafuel_api(user_id):
     pass

In [18]:
print '\xd0\x94\xd0\xb5\xd1\x82\xd0\xb8 - \xd0\xbc\xd0\xb0\xd0\xbc\xd1\x8b'

Дети - мамы


In [24]:
%%timeit
!echo -en $(curl http://138.201.80.213:5005/get_user_info?uid=25553&fields=interests)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   452  100   452    0     0    845      0 --:--:-- --:--:-- --:--:--   844
-en {"interests": ["\u042e\u043c\u043e\u0440", "\u041d\u043e\u0432\u043e\u0441\u0442\u0438", "\u041b\u0438\u0442\u0435\u0440\u0430\u0442\u0443\u0440\u0430", "\u041f\u0443\u0442\u0435\u0448\u0435\u0441\u0442\u0432\u0438\u044f", "\u041c\u0443\u0437\u044b\u043a\u0430"], "mbti": {"probs": {"intro_extra": 0.2446371465921402, "sensor_intuit": 0.25243809819221497, "logic_ethic": 0.049992892891168594, "irratio_ratio": 0.41522619128227234}, "psy_type": "ENFP"}}
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   452  100   452    0     0    741      0 --:--:-- --:--:-- --:--:--   742
-en {"interests": ["\u042e\u043c\u043e\u0440", "\u041d\u043e\u0432\u0