1. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)<br>

2. Повторить п.2, но используя уже не медиану, а max<br>

3. (опциональное) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес.<br>

4. Сформировать на выходе единую таблицу, сравнивающую качество 3 разных метода получения эмбедингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score<br>

5. Сделать самостоятельные выводы и предположения о том, почему тот или ной способ оказался эффективнее остальных

In [25]:
#предобработка текстов
import re
import numpy as np
from nltk.corpus import stopwords

from razdel import tokenize
from gensim.corpora.dictionary import Dictionary

import pymorphy2

import pandas as pd


#обучение
from gensim.models import LdaModel
from gensim.test.utils import datapath

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, roc_auc_score, precision_score,\
                            classification_report, precision_recall_curve, confusion_matrix
import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [26]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [27]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [28]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [29]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [30]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [31]:
%%time
#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

Wall time: 35 s


In [32]:
%%time
#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 4min 42s


In [33]:
news['title']

0        [заместитель, председатель, правительство, рф,...
1        [матч, финал, кубок, россия, футбол, приостано...
2        [форвард, авангард, томаш, заборский, прокомме...
3        [главный, тренер, кубань, юрий, красножанин, п...
4        [решение, попечительский, совет, владивостокск...
                               ...                        
26995    [учёный, токийский, университет, морской, наук...
26996    [глава, кафедра, отечественный, история, xx, в...
26997    [американский, учёный, уточнить, возраст, расп...
26998    [последний, год, тропический, углеродный, цикл...
26999    [жить, примерно, тыс, год, назад, территория, ...
Name: title, Length: 27000, dtype: object

In [34]:
#сформируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [35]:
common_dictionary[10]

'ватутин'

In [36]:
%%time
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

Wall time: 41.3 s


In [37]:
# Save model to disk.
temp_file = datapath("model.lda")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

In [39]:
# Create a new corpus, made of previously unseen documents.
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(1, 0.11892904), (16, 0.58197117), (18, 0.1767135), (19, 0.103507645)]

In [40]:
x=lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Below Code Prints Only Words 
for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: дело nn статья рейс группа область суд
topic_1: налог устойчивый тур дания остров платёжный австралия
topic_2: год суд рубль решение размер выплата который
topic_3: компания организм производитель испытание вицепремьер исследование лаборатория
topic_4: рекорд кг резерв явиться игра втб допустимый
topic_5: фонд наука проверка великобритания британский налоговый лондон
topic_6: млн год тыс составить цена стоимость сумма
topic_7: статья планета век обращение фильм японский небо
topic_8: препарат след вирус арбитраж лечение просрочить соколов
topic_9: ракета экипаж запуск поверхность станция доллар км
topic_10: это человек который год свой сотрудник данные
topic_11: мозг космонавт автомобиль год врач азия разместить
topic_12: авария лекарство музыка студия королевский таблетка сустав
topic_13: операция год управление рекомендовать глава отряд должность
topic_14: вуз лётчик полоса рт гостиница таиланд макаров
topic_15: украина украинский киев донбасс параметр киевский украинец
topi

In [41]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [42]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.0,0.047357,0.0,0.0,0.0,0.0,...,0.0,0.024128,0.0,0.0,0.072179,0.693133,0.0,0.0,0.0,0.0
1,4896,0.132933,0.289799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.118925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.581989,0.0,0.176627,0.10358,0.0,0.0,0.0,0.0,0.0
3,4898,0.0,0.077618,0.0,0.11976,0.0,0.0,0.021819,0.0,0.0,...,0.0,0.0,0.0,0.410329,0.044728,0.0,0.0,0.0,0.164451,0.0
4,4899,0.0,0.183766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.159288,0.0959,0.166225,0.0,0.0,0.0


In [43]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [44]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [45]:
doc_dict[6]

array([0.        , 0.        , 0.        , 0.        , 0.04735743,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.15609956, 0.        ,
       0.        , 0.02412784, 0.        , 0.        , 0.07217939,
       0.69313335, 0.        , 0.        , 0.        , 0.        ])

In [46]:
def get_user_embedding(user_articles_list, agg='mean'):
    if agg not in ['mean', 'max', 'median', 'tfidf']:
        raise AttributeError("Вводимое значение должно быть одним из следующих:"\
                            "'mean', 'max', 'median', 'tfidf'")
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    if agg == 'mean':
        user_vector = np.mean(user_vector, 0)
    elif agg == 'max':
        user_vector = np.nanmax(user_vector, 0)
    elif agg == 'median':
        user_vector = np.quantile(user_vector, 0.5, 0)
#     elif agg == 'tfidf':
#         user_vector = np.mean(user_vector, 0)
    return user_vector

In [47]:
users

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"
3,u101138,"[5933, 6186, 5055, 6977, 5206, 488389]"
4,u108248,"[707, 1144, 2532, 2928, 3133, 324592]"
...,...,...
7995,u107463,"[323918, 323362, 323704, 323452, 324291, 251]"
7996,u101241,"[5532, 5860, 7755, 7140, 5182, 488337]"
7997,u106486,"[322811, 323898, 321858, 323345, 323491, 2193]"
7998,u102220,"[5436, 6092, 6891, 7045, 5320, 487379]"


In [57]:
users['articles']

0       [293672, 293328, 293001, 293622, 293126, 1852]
1               [3405, 1739, 2972, 1158, 1599, 322665]
2               [1845, 2009, 2356, 1424, 2939, 323389]
3               [5933, 6186, 5055, 6977, 5206, 488389]
4                [707, 1144, 2532, 2928, 3133, 324592]
                             ...                      
7995     [323918, 323362, 323704, 323452, 324291, 251]
7996            [5532, 5860, 7755, 7140, 5182, 488337]
7997    [322811, 323898, 321858, 323345, 323491, 2193]
7998            [5436, 6092, 6891, 7045, 5320, 487379]
7999    [294096, 293759, 294178, 293544, 293921, 2909]
Name: articles, Length: 8000, dtype: object

In [61]:
user_embeddings.loc[6:]

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
6,u105949,0.063571,0.005586,0.002045,0.000000,0.003755,0.005814,0.027507,0.114626,0.0000,...,0.010307,0.048565,0.000000,0.147070,0.000000,0.020754,0.203656,0.000000,0.123902,0.007548
7,u102457,0.000000,0.044619,0.004176,0.000000,0.006760,0.000000,0.102154,0.110414,0.0000,...,0.000000,0.151291,0.000000,0.217958,0.044891,0.000000,0.000000,0.006841,0.088840,0.137171
8,u104124,0.058239,0.001871,0.009045,0.018944,0.000000,0.000000,0.007100,0.005970,0.0000,...,0.000000,0.000000,0.004141,0.119692,0.174384,0.284640,0.062326,0.008315,0.056484,0.000000
9,u101386,0.018544,0.110281,0.000000,0.002727,0.039177,0.024418,0.012186,0.001962,0.0000,...,0.000000,0.140535,0.004700,0.222739,0.002239,0.040996,0.118028,0.029091,0.085794,0.039961
10,u104519,0.030788,0.000000,0.044310,0.016724,0.000000,0.005584,0.037724,0.009926,0.0000,...,0.000000,0.003365,0.044048,0.036756,0.206946,0.285798,0.075345,0.000000,0.066301,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,u107463,0.083365,0.000000,0.082444,0.000000,0.000000,0.000000,0.000000,0.020821,0.0000,...,0.000000,0.008023,0.001795,0.044720,0.188742,0.177589,0.071953,0.031403,0.046584,0.002267
7996,u101241,0.010217,0.088096,0.018859,0.000000,0.000000,0.000000,0.046899,0.035532,0.0000,...,0.000000,0.045709,0.000000,0.161967,0.007890,0.076902,0.007039,0.000000,0.159086,0.034259
7997,u106486,0.144862,0.003672,0.019190,0.011719,0.000000,0.002015,0.000000,0.007427,0.0024,...,0.002271,0.000000,0.030107,0.053403,0.116248,0.164812,0.063479,0.008534,0.102912,0.000000
7998,u102220,0.000000,0.010211,0.000000,0.109225,0.023753,0.000000,0.073622,0.000000,0.0000,...,0.000000,0.006878,0.000000,0.170922,0.000000,0.005064,0.018755,0.088939,0.027629,0.119891


In [60]:
X_test.sort_index()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
6,0.063571,0.005586,0.002045,0.000000,0.003755,0.005814,0.027507,0.114626,0.000000,0.042907,...,0.010307,0.048565,0.000000,0.147070,0.000000,0.020754,0.203656,0.000000,0.123902,0.007548
8,0.058239,0.001871,0.009045,0.018944,0.000000,0.000000,0.007100,0.005970,0.000000,0.000000,...,0.000000,0.000000,0.004141,0.119692,0.174384,0.284640,0.062326,0.008315,0.056484,0.000000
12,0.138246,0.000000,0.047743,0.012009,0.000000,0.002518,0.010022,0.000000,0.000000,0.003988,...,0.009690,0.022336,0.020486,0.006221,0.120896,0.206522,0.090502,0.031182,0.059831,0.005943
14,0.140228,0.000000,0.040497,0.013700,0.000000,0.000000,0.000000,0.009632,0.000000,0.000000,...,0.008522,0.000000,0.011919,0.039401,0.089273,0.301797,0.121270,0.029652,0.057732,0.003663
15,0.089023,0.000000,0.092599,0.026526,0.000000,0.006654,0.014281,0.002463,0.000000,0.014362,...,0.010377,0.000000,0.023746,0.016530,0.153574,0.215735,0.070592,0.030916,0.001683,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7983,0.077860,0.000000,0.018898,0.014874,0.000000,0.008644,0.011970,0.000000,0.002204,0.007831,...,0.006457,0.002705,0.006908,0.030040,0.113395,0.237724,0.085464,0.017167,0.044855,0.004422
7987,0.011474,0.301211,0.000000,0.027976,0.020211,0.000000,0.000000,0.007295,0.000000,0.006656,...,0.000000,0.097372,0.028393,0.207414,0.006151,0.020607,0.016471,0.000000,0.178368,0.018782
7992,0.068111,0.000000,0.013235,0.006944,0.000000,0.000000,0.000000,0.016017,0.000000,0.000000,...,0.009166,0.036338,0.004354,0.117346,0.098691,0.149606,0.175234,0.021110,0.077169,0.000000
7998,0.000000,0.010211,0.000000,0.109225,0.023753,0.000000,0.073622,0.000000,0.000000,0.052176,...,0.000000,0.006878,0.000000,0.170922,0.000000,0.005064,0.018755,0.088939,0.027629,0.119891


In [63]:
agg_list = ['mean', 'median', 'max']

target = pd.read_csv("users_churn.csv")

for agg in agg_list:
    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, agg), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
    
    X = pd.merge(user_embeddings, target, 'left')
    X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                        X['churn'], random_state=0)
    logreg = LogisticRegression() 
    logreg.fit(X_train, y_train)
    preds = logreg.predict_proba(X_test)[:, 1]
    
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    print('Agg method used:', agg)
    print('Best Threshold=%f, Roc-AUC=%.3f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                                          roc_auc_score(y_test, preds),
                                                                                          fscore[ix],
                                                                                          precision[ix],
                                                                                          recall[ix]))
    print('------------')
    
#     roc_auc, precision, recall, f_score

Agg method used: mean
Best Threshold=0.236742, Roc-AUC=0.940, F-Score=0.657, Precision=0.568, Recall=0.780
------------
Agg method used: median
Best Threshold=0.254772, Roc-AUC=0.956, F-Score=0.704, Precision=0.617, Recall=0.820
------------
Agg method used: max
Best Threshold=0.329860, Roc-AUC=0.969, F-Score=0.742, Precision=0.707, Recall=0.780
------------


Предположительно предсказание с использованием среднего показало самый плохой результат из-за того, что часто находились случаи, когда встречались "выбросы" тем, что могло влиять сильно на общую картину и не учитывались какие-то другие значимые факторы.<br>

В то же время предсказание по медиане сглаживало выделяющиеся значения и больше было нацелено на предсказание какого-то общего тренда.<br>

Достаточно высокие значения в предсказаниях на основе максимальных можно интерпретировать как то, что у нас, вероятно, существует одна или небольшое множествотем, после которых наблюдался реальный частый отток.