## Урок 2. 

### Практическое задание

In [1]:
#предобработка текстов
import ast
import itertools
import re

import numpy as np
import pandas as pd
import pymorphy2 
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath
from nltk.corpus import stopwords
from razdel import tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split

### 1. Загружаем данные

In [2]:
# Load the article corpus; the displayed head shows a `doc_id` column and the
# article text stored in `title`.
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


Загрузим пользователей и списки последних прочитанных новостей

In [3]:
# Load users; `articles` holds each user's read article ids as a text-encoded list.
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [4]:
# Load the churn label (0/1 in the displayed rows) for each user id.
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


Итак, нам нужно получить векторные представления пользователей на основе прочитанных ими новостей и самих новостей.

### 2. Получаем векторные представления новостей

In [5]:
# Base Russian stop-word list from NLTK; it is extended from stopwords.txt below.
stopword_ru = stopwords.words('russian')
# NOTE(review): this expression is not the last line of the cell, so its value is not displayed.
len(stopword_ru)

# Morphological analyzer used by lemmatization().
morph = pymorphy2.MorphAnalyzer()

In [6]:
# Extend the NLTK list with project-specific stop words, one per line.
# The encoding is explicit so Cyrillic text does not depend on the platform default
# (assumes the file is UTF-8 — TODO confirm).
with open('stopwords.txt', encoding='utf-8') as f:
    # Strip before filtering: a raw line still carries its newline, so `if w`
    # alone would let blank lines through as empty-string stop words.
    additional_stopwords = [w.strip() for w in f if w.strip()]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [7]:
def clean_text(text):
    '''
    Normalize raw text before tokenization.

    Steps: cast to str, lowercase, drop CRLF line breaks, remove digits and
    punctuation, turn remaining line breaks / soft hyphens into spaces and
    collapse whitespace runs.

    Parameters
    ----------
    text : any
        Raw text; non-string values are converted with ``str()``.

    Returns
    -------
    str
        Lowercased text without digits or punctuation, words separated by
        single spaces, no leading/trailing whitespace.
    '''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    # NOTE(review): the first alternative only matches when a literal '|' sits
    # between two "hyphen + whitespace + CRLF" groups; it looks like a typo for
    # plain alternation, but is kept as-is to avoid changing which text is joined.
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)

    # One explicit character class instead of `[[]|[]]|...|`: same characters
    # removed, but no "possible nested set" FutureWarning and no empty alternative.
    text = re.sub(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]", '', text)
    # Line breaks, and the literal two-character sequences "\s" / "\n", become spaces.
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    # The original `[\s+]` was a character class (whitespace or '+'), so runs of
    # whitespace were never collapsed; collapse them together with soft hyphens.
    text = re.sub(r"[\xad\s]+", ' ', text).strip()

    return text

# Memoizes token -> normal form across calls, so each distinct token is parsed once.
cache = {}

def lemmatization(text):
    '''
    Tokenize text and return its lemmas with stop words removed.

        [0] cast non-`str` input to `str`
        [1] tokenize with razdel
        [2] drop a leading '-' from a token
        [3] skip tokens of one character or less
        [4] reuse the cached lemma if the token was seen before
        [5] otherwise lemmatize with pymorphy2 and cache the result
        [6] drop lemmas present in `stopword_ru`

    Returns a list of lemmatized tokens.
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)

    # A set gives O(1) membership tests; scanning the list for every token is
    # linear in the number of stop words. Built per call so later changes to
    # `stopword_ru` are still honoured.
    stopword_set = set(stopword_ru)

    words_lem = []
    for token in tokenize(text):  # [1]
        w = token.text
        if w.startswith('-'):  # [2]
            w = w[1:]
        if len(w) <= 1:  # [3]
            continue
        lemma = cache.get(w)  # [4]
        if lemma is None:  # [5]
            lemma = cache[w] = morph.parse(w)[0].normal_form
        if lemma not in stopword_set:  # [6]
            words_lem.append(lemma)

    return words_lem

In [8]:
%%time
#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


CPU times: user 18.4 s, sys: 616 ms, total: 19 s
Wall time: 19 s


In [9]:
%%time
#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 2min 21s, sys: 36.2 ms, total: 2min 21s
Wall time: 2min 21s


In [10]:
# Each entry of news['title'] is already a list of lemmas, so the column itself
# is the tokenized corpus (no further splitting is done here).
texts = list(news['title'].values)

# Build the token <-> id dictionary, then encode every document as a bag of words.
common_dictionary = Dictionary(texts)
common_corpus = list(map(common_dictionary.doc2bow, texts))

Запускаем обучение

In [11]:
%%time
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

CPU times: user 1min 8s, sys: 832 ms, total: 1min 9s
Wall time: 22.2 s


In [12]:
# Save model to disk.
# NOTE(review): datapath() is imported from gensim.test.utils, so this path
# presumably points into gensim's bundled test-data directory — confirm that
# this is the intended (and writable) location for the model file.
temp_file = datapath("model.lda")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

In [13]:
# Re-encode the first three articles as bag-of-words to demonstrate inference.
# These rows come from the training corpus, so they are not actually unseen.
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

# Show the tokens of the third article and the topic mixture the model assigns to it.
unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'играть', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'хороший']


[(1, 0.32786116),
 (3, 0.14677003),
 (5, 0.26634985),
 (21, 0.17293012),
 (22, 0.064920194)]

In [14]:
# Top-7 (word, weight) pairs per topic; keep only the words.
x = lda.show_topics(num_topics=25, num_words=7, formatted=False)
topics_words = []
for tp in x:
    topics_words.append((tp[0], [wd[0] for wd in tp[1]]))

# Print one line per topic with its words.
for topic, words in topics_words:
    print("topic_{}: ".format(topic) + " ".join(words))

topic_0: наука снижение мозг писать болезнь километр белый
topic_1: nn выяснить день первый медведев двигатель смерть
topic_2: статья ракета журнал nn писать первый опубликовать
topic_3: nn россия научный гражданин новый земля всё
topic_4: риск доллар женщина обнаружить исследование nn метод
topic_5: рост рынок экономика млрд экономический система развитие
topic_6: вирус пресссекретарить песок констатировать сон армения захватить
topic_7: население земля спрос россия nn объект ресурс
topic_8: фонд фестиваль конкурс супруг употребление рекорд автор
topic_9: исследование млн тыс рубль цена составить банк
topic_10: район территория взрыв министерство граница пострадать армия
topic_11: тело nn всё операция москва информация помощь
topic_12: иск собственность лётчик пища бизнесмен подать рассмотрение
topic_13: авария атмосферный гражданство вдвое архитектор ночью новак
topic_14: санкция рубль закон решение документ торговый российский
topic_15: россия российский газ путин владимир система п

In [15]:
# функция, которая возвращает векторное представление новости

def get_lda_vector(text, num_topics=None):
    """Return the dense topic-weight vector of one tokenized document.

    Parameters
    ----------
    text : list of str
        Document tokens, encoded with ``common_dictionary.doc2bow``.
    num_topics : int, optional
        Length of the returned vector. Defaults to ``lda.num_topics`` so the
        vector always matches the trained model instead of a hard-coded 25.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``num_topics``; topics the model does not
        report for this document stay at 0.
    """
    if num_topics is None:
        num_topics = lda.num_topics

    bow = common_dictionary.doc2bow(text)
    output_vector = np.zeros(num_topics)
    # lda[bow] yields (topic_id, weight) pairs only for the topics it reports.
    for topic_id, weight in lda[bow]:
        # Ignore ids beyond the requested length, as the fixed-range loop did.
        if topic_id < num_topics:
            output_vector[topic_id] = weight
    return output_vector

In [16]:
# One row per article: doc_id followed by its 25 topic weights.
topic_columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix = pd.DataFrame(
    [get_lda_vector(text) for text in news['title'].values],
    columns=topic_columns,
)
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.026695,0.0,0.025547,0.0,0.0,0.0,...,0.202015,0.0,0.0,0.0,0.0,0.335942,0.0,0.0,0.402059,0.0
1,4896,0.0,0.0,0.208716,0.0,0.0,0.0,0.0,0.144851,0.029217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.161182,0.0,0.0,0.0
2,4897,0.0,0.327962,0.0,0.146798,0.0,0.266277,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.172923,0.064871,0.0,0.0
3,4898,0.0,0.0,0.0,0.080533,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.048343,0.338113,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.164345,0.0,0.0,0.0,0.0,0.408288,0.158845,0.0,0.243784,0.0


### 3. Получаем векторные представления пользователей

In [17]:
# Lookup table: doc_id -> array of that article's 25 topic weights.
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [18]:
def get_user_embedding_mean(user_articles_list):
    """Return a user's embedding as the per-topic mean over the articles they read.

    Parameters
    ----------
    user_articles_list : str or sequence
        Article ids, either as the text form stored in the CSV
        (e.g. ``"[1, 2, 3]"``) or as an already parsed sequence.

    Returns
    -------
    numpy.ndarray
        Per-topic mean of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    KeyError
        If an article id is missing from ``doc_dict``.
    """
    # literal_eval parses Python literals only; eval() would execute whatever
    # expression the CSV cell happens to contain.
    if isinstance(user_articles_list, str):
        user_articles_list = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    return np.mean(article_vectors, 0)

In [19]:
def get_user_embedding_median(user_articles_list):
    """Return a user's embedding as the per-topic median over the articles they read.

    Parameters
    ----------
    user_articles_list : str or sequence
        Article ids, either as the text form stored in the CSV
        (e.g. ``"[1, 2, 3]"``) or as an already parsed sequence.

    Returns
    -------
    numpy.ndarray
        Per-topic median of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    KeyError
        If an article id is missing from ``doc_dict``.
    """
    # literal_eval parses Python literals only; eval() would execute whatever
    # expression the CSV cell happens to contain.
    if isinstance(user_articles_list, str):
        user_articles_list = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    return np.median(article_vectors, 0)

In [20]:
def get_user_embedding_max(user_articles_list):
    """Return a user's embedding as the per-topic maximum over the articles they read.

    Parameters
    ----------
    user_articles_list : str or sequence
        Article ids, either as the text form stored in the CSV
        (e.g. ``"[1, 2, 3]"``) or as an already parsed sequence.

    Returns
    -------
    numpy.ndarray
        Per-topic maximum of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    KeyError
        If an article id is missing from ``doc_dict``.
    """
    # literal_eval parses Python literals only; eval() would execute whatever
    # expression the CSV cell happens to contain.
    if isinstance(user_articles_list, str):
        user_articles_list = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    return np.max(article_vectors, 0)

In [21]:
# Mean-pooled topic embedding per user: uid followed by 25 topic columns.
# The function is passed directly to apply(); the former positional `1` was
# bound to Series.apply's `convert_dtype` parameter, not to an axis.
topic_cols = ['topic_{}'.format(i) for i in range(25)]
user_embeddings_mean = pd.DataFrame(
    users['articles'].apply(get_user_embedding_mean).tolist(),
    columns=topic_cols,
)
user_embeddings_mean.insert(0, 'uid', users['uid'].values)
user_embeddings_mean.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.019822,0.024867,0.011748,0.130624,0.009628,0.020358,0.013538,0.060713,0.012651,...,0.146272,0.008467,0.0,0.0,0.066743,0.009467,0.0,0.105135,0.010542,0.056762
1,u108690,0.0257,0.026998,0.027575,0.191078,0.020237,0.029264,0.006586,0.022884,0.004329,...,0.10299,0.0,0.0,0.002871,0.072655,0.002825,0.0,0.034723,0.074819,0.106745
2,u108339,0.00667,0.025765,0.059174,0.093718,0.09479,0.020678,0.0,0.01163,0.002804,...,0.102278,0.0,0.005728,0.021689,0.0,0.010831,0.00474,0.002224,0.105665,0.081976


In [22]:
# Median-pooled topic embedding per user: uid followed by 25 topic columns.
# The function is passed directly to apply(); the former positional `1` was
# bound to Series.apply's `convert_dtype` parameter, not to an axis.
topic_cols = ['topic_{}'.format(i) for i in range(25)]
user_embeddings_median = pd.DataFrame(
    users['articles'].apply(get_user_embedding_median).tolist(),
    columns=topic_cols,
)
user_embeddings_median.insert(0, 'uid', users['uid'].values)
user_embeddings_median.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.0,0.0,0.10957,0.0,0.0,0.0,0.036145,0.0,...,0.114503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,u108690,0.015978,0.0,0.032542,0.205003,0.015243,0.023707,0.0,0.009773,0.0,...,0.08824,0.0,0.0,0.0,0.024343,0.0,0.0,0.028016,0.041679,0.106351
2,u108339,0.0,0.017732,0.036691,0.094177,0.056844,0.009491,0.0,0.005302,0.0,...,0.103979,0.0,0.0,0.0,0.0,0.011013,0.0,0.0,0.107659,0.043351


In [23]:
# Max-pooled topic embedding per user: uid followed by 25 topic columns.
# The function is passed directly to apply(); the former positional `1` was
# bound to Series.apply's `convert_dtype` parameter, not to an axis.
topic_cols = ['topic_{}'.format(i) for i in range(25)]
user_embeddings_max = pd.DataFrame(
    users['articles'].apply(get_user_embedding_max).tolist(),
    columns=topic_cols,
)
user_embeddings_max.insert(0, 'uid', users['uid'].values)
user_embeddings_max.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.062458,0.1492,0.070489,0.368434,0.057768,0.06339,0.053217,0.154243,0.075903,...,0.473023,0.050801,0.0,0.0,0.381234,0.056804,0.0,0.326242,0.063252,0.216378
1,u108690,0.06511,0.128389,0.035792,0.267281,0.049584,0.069999,0.027299,0.088207,0.01354,...,0.282101,0.0,0.0,0.017223,0.201547,0.016952,0.0,0.090096,0.234978,0.152305
2,u108339,0.029191,0.06067,0.136773,0.167857,0.268446,0.065789,0.0,0.048514,0.016823,...,0.226171,0.0,0.034367,0.130136,0.0,0.022103,0.028439,0.013342,0.191896,0.229473


In [24]:
# Attach the churn label to the mean-pooled embeddings (left join keeps every user row).
X_mean = user_embeddings_mean.merge(target, how='left')

In [25]:
# Attach the churn label to the median-pooled embeddings (left join keeps every user row).
X_median = user_embeddings_median.merge(target, how='left')

In [26]:
# Attach the churn label to the max-pooled embeddings (left join keeps every user row).
X_max = user_embeddings_max.merge(target, how='left')

In [27]:
# Split the mean-pooled data into train/test with a fixed seed.
feature_cols = ['topic_{}'.format(i) for i in range(25)]
X_mean_train, X_mean_test, y_mean_train, y_mean_test = train_test_split(
    X_mean[feature_cols],
    X_mean['churn'],
    random_state=0,
)

In [28]:
# Split the median-pooled data into train/test with a fixed seed.
feature_cols = ['topic_{}'.format(i) for i in range(25)]
X_median_train, X_median_test, y_median_train, y_median_test = train_test_split(
    X_median[feature_cols],
    X_median['churn'],
    random_state=0,
)

In [29]:
# Split the max-pooled data into train/test with a fixed seed.
feature_cols = ['topic_{}'.format(i) for i in range(25)]
X_max_train, X_max_test, y_max_train, y_max_test = train_test_split(
    X_max[feature_cols],
    X_max['churn'],
    random_state=0,
)

In [30]:
# Logistic regression (default settings) on the mean-pooled user embeddings.
logreg_mean = LogisticRegression()
# Fit on the training split.
logreg_mean.fit(X_mean_train, y_mean_train)

LogisticRegression()

In [31]:
# Logistic regression (default settings) on the median-pooled user embeddings.
logreg_median = LogisticRegression()
# Fit on the training split.
logreg_median.fit(X_median_train, y_median_train)

LogisticRegression()

In [32]:
# Logistic regression (default settings) on the max-pooled user embeddings.
logreg_max = LogisticRegression()
# Fit on the training split.
logreg_max.fit(X_max_train, y_max_train)

LogisticRegression()

In [33]:
# Test-set predictions: column 1 of predict_proba for the mean-pooled model.
preds_mean = logreg_mean.predict_proba(X_mean_test)[:, 1]

In [34]:
# Test-set predictions: column 1 of predict_proba for the median-pooled model.
preds_median = logreg_median.predict_proba(X_median_test)[:, 1]

In [35]:
# Test-set predictions: column 1 of predict_proba for the max-pooled model.
preds_max = logreg_max.predict_proba(X_max_test)[:, 1]

### 4. Рассчитаем Precision, Recall, F_score

In [36]:
def result(user_embeddings, y_true=None, preds=None):
    """Summarize test metrics for one user-embedding variant.

    Picks the point of the precision-recall curve with the highest F-score
    and reports precision, recall and F-score there, plus ROC AUC.

    Parameters
    ----------
    user_embeddings : str
        Variant label used in the output. When `y_true`/`preds` are not
        given it must be 'mean', 'median' or 'max', and the matching
        notebook globals (`y_<name>_test`, `preds_<name>`) are used.
    y_true : array-like, optional
        True binary labels; overrides the notebook globals together with `preds`.
    preds : array-like, optional
        Predicted scores for the positive class.

    Returns
    -------
    str
        One formatted line with RocAuc, Precision, Recall and F-Score.

    Raises
    ------
    ValueError
        If no data is passed and `user_embeddings` is not a known variant.
    """
    if y_true is None or preds is None:
        if user_embeddings == 'mean':
            y_true, preds = y_mean_test, preds_mean
        elif user_embeddings == 'median':
            y_true, preds = y_median_test, preds_median
        elif user_embeddings == 'max':
            y_true, preds = y_max_test, preds_max
        else:
            raise ValueError(
                "user_embeddings must be 'mean', 'median' or 'max' "
                "when y_true/preds are not given, got {!r}".format(user_embeddings)
            )

    precision, recall, _ = precision_recall_curve(y_true, preds)

    # Where precision + recall == 0 the plain ratio is 0/0 = NaN, and
    # np.argmax would then pick that NaN; define the F-score as 0 there.
    denom = precision + recall
    fscore = np.divide(2 * precision * recall, denom,
                       out=np.zeros_like(denom), where=denom > 0)

    ix = np.argmax(fscore)
    roc_auc = roc_auc_score(y_true, preds)

    return f'User_embeddings_{user_embeddings} - RocAuc = {roc_auc:.3f}, Precision = {precision[ix]:.3f}, Recall = {recall[ix]:.3f}, F-Score = {fscore[ix]:.3f}'

#     return fscore[ix], precision[ix], recall[ix], roc_auc

# Report the metrics of the three pooling strategies, one line each.
print(result('mean'))
print(result('median'))
print(result('max'))

User_embeddings_mean - RocAuc = 0.931, Precision = 0.570, Recall = 0.678, F-Score = 0.619
User_embeddings_median - RocAuc = 0.967, Precision = 0.670, Recall = 0.853, F-Score = 0.750
User_embeddings_max - RocAuc = 0.981, Precision = 0.838, Recall = 0.824, F-Score = 0.831


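Предсказания, построенные на "User_embeddings_max", показали лучший результат (ROC AUC = 0.981, F-Score = 0.831): агрегация по максимуму сохраняет для каждой темы её наибольшую выраженность среди прочитанных пользователем статей, то есть учитывает статьи, максимально интересные для читателя, тогда как среднее и медиана сглаживают этот сигнал.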