In [1]:
import ast
import itertools
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pymorphy2
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath
from nltk.corpus import stopwords
from razdel import tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Raw inputs: `news` is read from materials.csv, `users` from users_articles.csv.
news = pd.read_csv("materials.csv")
users = pd.read_csv("users_articles.csv")

In [3]:
# Fetch the NLTK stop-word corpus and take its Russian word list.
nltk.download('stopwords')
stopword_ru = stopwords.words('russian')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Shared morphological analyzer; lemmatization() calls morph.parse(...) on every uncached word.
morph = pymorphy2.MorphAnalyzer()

In [5]:
# Extra stop words are read from stopwords.txt, one entry per line.
with open('stopwords.txt', encoding='utf8') as f:
    # Strip first, then filter: every line read from the file still carries
    # its newline, so testing the raw line never skipped blank lines and
    # empty strings ended up in the stop-word list.
    additional_stopwords = [w for w in (line.strip() for line in f) if w]

# NOTE: extends the NLTK list in place, so re-running this cell appends the
# same words again.
stopword_ru += additional_stopwords

In [6]:
def clean_text(text):
    '''
    Normalize raw text before tokenization.

    Non-string input is converted with str(). The text is lower-cased,
    digits and the listed punctuation characters are removed, and line
    breaks, whitespace characters and soft hyphens are replaced by a space
    (runs of spaces are not collapsed).

    Returns the cleaned string.
    '''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    # Raw string: "\s" and "\|" are regex escapes, not (invalid) Python
    # string escapes.
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)

    # A single character class replaces the chain "[...]|[[]|[]]|[/]|":
    # the brackets are escaped, which removes the "Possible nested set"
    # FutureWarning, and the trailing empty alternative is gone.
    text = re.sub(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad\s+]', ' ', text.strip())
    # NOTE(review): this blanks every Latin "n" (e.g. "linux" -> "li ux").
    # Kept unchanged to preserve existing output — confirm it is intended.
    text = re.sub("n", ' ', text)

    return text

# word -> normal form, shared across calls so each distinct word is parsed once
cache = {}

def lemmatization(text):
    '''
    Lemmatize a text and drop stop words.

    Steps:
        [0] non-str input is converted with str()
        [1] the text is tokenized with razdel's tokenize
        [2] one leading '-' is removed from a token
        [3] tokens of one character or less are skipped
        [4] the normal form is taken from `cache` when present
        [5] otherwise it is computed with morph.parse and cached
        [6] lemmas found in stopword_ru are filtered out

    Returns the list of remaining lemmas, in token order.
    '''
    source = text if isinstance(text, str) else str(text)  # [0]

    lemmas = []
    for token in list(tokenize(source)):  # [1]
        word = token.text
        if word[0] == '-':  # [2]
            word = word[1:]
        if len(word) <= 1:  # [3]
            continue
        if word not in cache:  # [4] / [5]
            cache[word] = morph.parse(word)[0].normal_form
        lemmas.append(cache[word])

    return [lemma for lemma in lemmas if lemma not in stopword_ru]  # [6]

In [7]:
# news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


In [8]:
# news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

In [9]:
# Shortcut to avoid waiting: reload a previously pickled `news` frame
# (presumably the result of the commented-out clean_text / lemmatization cells — verify).
import pickle
# news.to_pickle('news_to_lemm.pkl')
# NOTE: unpickling can execute arbitrary code; only load pickle files you created or trust.
news = pd.read_pickle('news_to_lemm.pkl')

In [10]:
# One entry per news item, taken from the 'title' column
# (presumably a list of lemma tokens each — confirm against the pickled frame).
texts = list(news['title'].values)

In [11]:
# Build the vocabulary from all documents, then encode each document as a bag-of-words.
common_dictionary = Dictionary(texts)
common_corpus = list(map(common_dictionary.doc2bow, texts))

обучение модели

In [12]:
# Number of LDA topics; also the length of every topic vector and the number of topic_* columns.
N_topic = 20

In [15]:
# lda = LdaModel(common_corpus, num_topics=N_topic, id2word=common_dictionary, random_state=0)

In [16]:
# Load the previously saved LDA model instead of retraining it.
# NOTE(review): datapath is imported from gensim.test.utils — confirm that the
# location it resolves to is where the persisted model is meant to live.
temp_file = datapath("model.lda")
# lda.save(temp_file)
lda = LdaModel.load(temp_file)

In [17]:
def get_lda_vector(text):
    '''
    Turn one document into a dense vector of N_topic topic weights.

    `text` is passed to common_dictionary.doc2bow and the resulting
    bag-of-words is fed to the `lda` model. Topics the model does not
    return for this document get weight 0.

    Returns a numpy array with one entry per topic index 0..N_topic-1.
    '''
    bow = common_dictionary.doc2bow(text)
    topic_weights = {item[0]: item[1] for item in lda[bow]}
    return np.array([topic_weights.get(topic_id, 0) for topic_id in range(N_topic)])

In [18]:
# One row per news item: doc_id followed by its N_topic topic weights.
topic_matrix = pd.DataFrame(
    [get_lda_vector(tokens) for tokens in news['title'].values],
    columns=['topic_{}'.format(i) for i in range(N_topic)],
)
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,6,0.0,0.0,0.0,0.0,0.0,0.052303,0.0,0.01669,0.0,...,0.074118,0.0,0.0,0.05681,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.655769,0.0,0.0,0.320517,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.067836,0.416939,0.0,0.097612,0.0,0.0,0.0,...,0.0,0.0,0.0,0.396015,0.0,0.0,0.0,0.0,0.0,0.0
3,4898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.097021,0.0,...,0.402601,0.0,0.0,0.10668,0.0,0.0,0.0,0.0,0.383401,0.0
4,4899,0.047274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.138709,0.0,0.405803,0.0,0.0,0.0,0.0


векторные представления пользователей

In [19]:
# Lookup table: doc_id -> topic vector (the topic_* values of that row of topic_matrix).
doc_dict = {
    doc_id: topic_vector
    for doc_id, topic_vector in zip(
        topic_matrix['doc_id'].values,
        topic_matrix[['topic_{}'.format(i) for i in range(N_topic)]].values,
    )
}

In [20]:
def get_user_embedding(user_articles_list):
    '''
    Build a user vector as the mean of the topic vectors of the user's articles.

    user_articles_list: string holding a Python list literal of doc ids,
        e.g. "[1, 2, 3]".

    Returns a numpy array: the element-wise mean over the rows looked up
    in doc_dict.

    Raises ValueError or SyntaxError if the string is not a valid literal,
    KeyError if a doc id is missing from doc_dict.
    '''
    # literal_eval only accepts literals, so text coming from the CSV cannot
    # execute code the way eval() would.
    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.mean(article_vectors, 0)

In [21]:
# Mean-based user embeddings: one row per user, uid followed by the topic_* columns.
# Series.apply has no axis argument; the positional `1` that used to follow the
# function was bound to `convert_dtype`, so it is dropped.
user_embeddings = pd.DataFrame(
    users['articles'].apply(get_user_embedding).tolist(),
    columns=['topic_{}'.format(i) for i in range(N_topic)],
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.121398,0.0,0.058165,0.046335,0.002526,0.0,0.0,0.175309,0.015213,...,0.298997,0.032539,0.0,0.035737,0.004055,0.05488,0.0,0.021961,0.041055,0.058991
1,u108690,0.156175,0.0,0.026913,0.075968,0.0,0.0,0.006627,0.023031,0.027461,...,0.378534,0.024746,0.0,0.002596,0.046467,0.047547,0.017869,0.015187,0.088915,0.024648
2,u108339,0.214957,0.020432,0.085885,0.117659,0.0,0.0,0.0,0.06363,0.038892,...,0.145566,0.014012,0.0,0.0,0.025873,0.046994,0.052854,0.0,0.099068,0.020172


модель для предсказания

In [22]:
# Churn labels; merged with the user embeddings below, where the `churn` column is the target.
target = pd.read_csv("users_churn.csv")

In [23]:
# Left join keeps every embedded user; with no `on=` given, pandas joins on the
# columns the two frames have in common.
X = user_embeddings.merge(target, how='left')

In [24]:
# Hold out 25% of the rows for evaluation; the features are the topic_* columns only.
X_train, X_test, y_train, y_test = train_test_split(
    X[['topic_{}'.format(i) for i in range(N_topic)]],
    X['churn'],
    test_size=0.25,
    random_state=0,
)

In [25]:
# Churn classifier; the same estimator object is refitted by the later experiments.
logreg = LogisticRegression(C=1.0, random_state=0)
logreg.fit(X_train, y_train)

In [26]:
# Column 1 of predict_proba for each held-out row (presumably the probability of churn == 1 — confirm label encoding).
preds = logreg.predict_proba(X_test)[:, 1]

In [27]:
# Collects one (roc_auc, precision, recall, f_score) tuple per experiment.
metrics = []

def get_metrics(y_test, preds):
    '''
    Score predicted probabilities at the F-score-optimal threshold.

    Returns a tuple (roc_auc, precision, recall, f_score); the last three
    are taken at the point of the precision-recall curve where the F-score
    is highest.
    '''
    precision, recall, _ = precision_recall_curve(y_test, preds)
    fscore = 2 * precision * recall / (precision + recall)
    best = np.nanargmax(fscore)  # NaN entries are ignored

    roc_auc = roc_auc_score(y_test, preds)
    return roc_auc, precision[best], recall[best], fscore[best]

In [28]:
# Metrics for the mean-based embeddings.
# NOTE: append is not idempotent — re-running this cell adds a duplicate entry.
metrics.append(get_metrics(y_test, preds))
metrics

[(0.9572370486656201,
  0.5970588235294118,
  0.8285714285714286,
  0.6940170940170941)]

__2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)__

In [29]:
def get_user_embedding(user_articles_list, func='mean'):
    '''
    Aggregate the topic vectors of a user's articles into one user vector.

    user_articles_list: string holding a Python list literal of doc ids,
        e.g. "[1, 2, 3]".
    func: 'median' selects np.median, 'max' selects np.max; any other value
        (including the default 'mean') falls back to np.mean.

    Returns a numpy array: the element-wise aggregate over the rows looked
    up in doc_dict.

    Raises ValueError or SyntaxError if the string is not a valid literal,
    KeyError if a doc id is missing from doc_dict.
    '''
    # literal_eval only accepts literals, so text coming from the CSV cannot
    # execute code the way eval() would.
    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])

    if func == 'median':
        return np.median(article_vectors, 0)
    if func == 'max':
        return np.max(article_vectors, 0)
    return np.mean(article_vectors, 0)

In [30]:
# Median-based user embeddings (func='median'): uid followed by the topic_* columns.
# Series.apply has no axis argument; the positional `1` that used to follow the
# function was bound to `convert_dtype`, so it is dropped.
user_embeddings = pd.DataFrame(
    users['articles']
    .apply(lambda articles: get_user_embedding(articles, func='median'))
    .tolist(),
    columns=['topic_{}'.format(i) for i in range(N_topic)],
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.121091,0.0,0.036742,0.031866,0.0,0.0,0.0,0.178294,0.0,...,0.325452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033597,0.034918
1,u108690,0.147623,0.0,0.013384,0.050741,0.0,0.0,0.0,0.010177,0.012465,...,0.410626,0.014944,0.0,0.0,0.027902,0.007021,0.0,0.0,0.069863,0.0
2,u108339,0.195963,0.0,0.061321,0.126738,0.0,0.0,0.0,0.054151,0.027287,...,0.136477,0.011249,0.0,0.0,0.025188,0.038645,0.066556,0.0,0.07578,0.0


In [31]:
# Left join keeps every embedded user; with no `on=` given, pandas joins on the
# columns the two frames have in common.
X = user_embeddings.merge(target, how='left')

In [32]:
# Same split settings as the other experiments: 25% hold-out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X[['topic_{}'.format(i) for i in range(N_topic)]],
    X['churn'],
    test_size=0.25,
    random_state=0,
)

In [33]:
# Refit the shared logreg estimator on the current training split.
logreg.fit(X_train, y_train)

In [34]:
# Column 1 of predict_proba for each held-out row.
preds = logreg.predict_proba(X_test)[:, 1]

In [35]:
# Metrics for the median-based embeddings.
# NOTE: append is not idempotent — re-running this cell adds a duplicate entry.
metrics.append(get_metrics(y_test, preds))
metrics

[(0.9572370486656201,
  0.5970588235294118,
  0.8285714285714286,
  0.6940170940170941),
 (0.9795522995522995,
  0.8089430894308943,
  0.8122448979591836,
  0.8105906313645621)]

__3. Повторить п.2, но используя уже не медиану, а max__

In [36]:
# Max-based user embeddings (func='max'): uid followed by the topic_* columns.
# Series.apply has no axis argument; the positional `1` that used to follow the
# function was bound to `convert_dtype`, so it is dropped.
user_embeddings = pd.DataFrame(
    users['articles']
    .apply(lambda articles: get_user_embedding(articles, func='max'))
    .tolist(),
    columns=['topic_{}'.format(i) for i in range(N_topic)],
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.216782,0.0,0.190576,0.129654,0.015156,0.0,0.0,0.346829,0.057606,...,0.610707,0.149071,0.0,0.12851,0.024329,0.183808,0.0,0.085922,0.105692,0.164139
1,u108690,0.186089,0.0,0.069811,0.184925,0.0,0.0,0.028213,0.083762,0.072371,...,0.492686,0.066254,0.0,0.015578,0.141552,0.241326,0.088162,0.056445,0.194722,0.137228
2,u108339,0.388467,0.096759,0.21523,0.227426,0.0,0.0,0.0,0.111009,0.096752,...,0.240069,0.031635,0.0,0.0,0.055377,0.104862,0.077346,0.0,0.224682,0.08715


In [37]:
# Left join keeps every embedded user; with no `on=` given, pandas joins on the
# columns the two frames have in common.
X = user_embeddings.merge(target, how='left')

In [38]:
# Same split settings as the other experiments: 25% hold-out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X[['topic_{}'.format(i) for i in range(N_topic)]],
    X['churn'],
    test_size=0.25,
    random_state=0,
)

In [39]:
# Refit the shared logreg estimator on the current training split.
logreg.fit(X_train, y_train)

In [40]:
# Column 1 of predict_proba for each held-out row.
preds = logreg.predict_proba(X_test)[:, 1]

In [41]:
# Metrics for the max-based embeddings.
# NOTE: append is not idempotent — re-running this cell adds a duplicate entry.
metrics.append(get_metrics(y_test, preds))
metrics

[(0.9572370486656201,
  0.5970588235294118,
  0.8285714285714286,
  0.6940170940170941),
 (0.9795522995522995,
  0.8089430894308943,
  0.8122448979591836,
  0.8105906313645621),
 (0.9797034711320425,
  0.7803030303030303,
  0.8408163265306122,
  0.8094302554027505)]

__4. (опциональное, если очень хочется) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес.__

In [43]:
# Counts, for each article id, how many user lists contain it.
articles_counter = Counter()

for raw_list in users['articles']:
    # Each article seems to occur only once per list, but deduplicate just in case.
    unique_tokens = set(raw_list.strip('[]').split(', '))
    articles_counter.update(int(token) for token in unique_tokens)

In [45]:
# Convert the counts into idf weights: log(N / count).
# Open question from the author: N could be the total number of articles (27000)
# or the size of the counter (14787); the size of the counter is used.
# NOTE: the counts are overwritten in place, so re-running this cell without
# rebuilding the counter would take the log of idf values instead of counts.
n_articles = len(articles_counter)
for article_id, doc_freq in list(articles_counter.items()):
    articles_counter[article_id] = np.log(n_articles / doc_freq)

In [46]:
# Spot check: the idf weight stored for article 4896.
articles_counter[4896]

7.992065782939387

Далее - взвешенное среднее:
$$\frac{idf_0*v_0 + idf_1*v_1+...+idf_5*v_5}{idf_0 + idf_1 + ... + idf_5}$$

In [47]:
def get_user_embedding_idf(user_articles_list):
    '''
    idf-weighted mean of the topic vectors of a user's articles.

    user_articles_list: string holding a Python list literal of doc ids,
        e.g. "[1, 2, 3]".

    Each article vector from doc_dict is weighted by the value stored for
    that article in articles_counter, and the weighted sum is divided by
    the sum of the weights.

    Returns a numpy array with one entry per topic.

    Raises ValueError or SyntaxError if the string is not a valid literal,
    KeyError if a doc id is missing from doc_dict.
    '''
    # literal_eval only accepts literals, so text coming from the CSV cannot
    # execute code the way eval() would.
    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    # weight vector: one idf value per article
    idf_weights = np.array([articles_counter[doc_id] for doc_id in article_ids])

    # weighted average: sum_i(idf_i * v_i) / sum_i(idf_i)
    return (article_vectors.T @ idf_weights) / np.sum(idf_weights)

In [48]:
# idf-weighted user embeddings (get_user_embedding_idf): uid followed by the topic_* columns.
# Series.apply has no axis argument; the positional `1` that used to follow the
# function was bound to `convert_dtype`, so it is dropped.
user_embeddings = pd.DataFrame(
    users['articles'].apply(get_user_embedding_idf).tolist(),
    columns=['topic_{}'.format(i) for i in range(N_topic)],
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.120811,0.0,0.058922,0.047997,0.002836,0.0,0.0,0.175863,0.015124,...,0.291063,0.03554,0.0,0.037797,0.004031,0.052729,0.0,0.022814,0.041718,0.060294
1,u108690,0.156445,0.0,0.028431,0.075682,0.0,0.0,0.006576,0.0229,0.028303,...,0.380309,0.024548,0.0,0.002575,0.043022,0.052412,0.017534,0.014261,0.08647,0.023312
2,u108339,0.213666,0.018783,0.089106,0.114883,0.0,0.0,0.0,0.062538,0.039958,...,0.14296,0.01429,0.0,0.0,0.025631,0.045276,0.054203,0.0,0.103047,0.021306


In [49]:
# Left join keeps every embedded user; with no `on=` given, pandas joins on the
# columns the two frames have in common.
X = user_embeddings.merge(target, how='left')

In [50]:
# Same split settings as the other experiments: 25% hold-out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X[['topic_{}'.format(i) for i in range(N_topic)]],
    X['churn'],
    test_size=0.25,
    random_state=0,
)

In [51]:
# Refit the shared logreg estimator on the current training split.
logreg.fit(X_train, y_train)

In [52]:
# Column 1 of predict_proba for each held-out row.
preds = logreg.predict_proba(X_test)[:, 1]

In [53]:
# Metrics for the idf-weighted-mean embeddings.
# NOTE: append is not idempotent — re-running this cell adds a duplicate entry.
metrics.append(get_metrics(y_test, preds))
metrics

[(0.9572370486656201,
  0.5970588235294118,
  0.8285714285714286,
  0.6940170940170941),
 (0.9795522995522995,
  0.8089430894308943,
  0.8122448979591836,
  0.8105906313645621),
 (0.9797034711320425,
  0.7803030303030303,
  0.8408163265306122,
  0.8094302554027505),
 (0.960200011628583,
  0.6081871345029239,
  0.8489795918367347,
  0.7086882453151618)]

усреднение idf-взвешенных векторов по числу статей (без нормировки на сумму весов) - более эффективно

In [71]:
def get_user_embedding_idf(user_articles_list):
    '''
    Mean of idf-scaled topic vectors of a user's articles.

    Redefines get_user_embedding_idf: cells run after this one use this
    version. The idf weights are still applied, but the weighted sum is
    divided by the number of articles rather than by the sum of the
    weights, so the result is not a normalized weighted average.

    user_articles_list: string holding a Python list literal of doc ids,
        e.g. "[1, 2, 3]".

    Returns a numpy array with one entry per topic.

    Raises ValueError or SyntaxError if the string is not a valid literal,
    KeyError if a doc id is missing from doc_dict.
    '''
    # literal_eval only accepts literals, so text coming from the CSV cannot
    # execute code the way eval() would.
    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    idf_weights = np.array([articles_counter[doc_id] for doc_id in article_ids])

    # sum_i(idf_i * v_i) / n_articles
    return (article_vectors.T @ idf_weights) / len(article_ids)

In [72]:
# User embeddings from the current get_user_embedding_idf: uid followed by the topic_* columns.
# Series.apply has no axis argument; the positional `1` that used to follow the
# function was bound to `convert_dtype`, so it is dropped.
user_embeddings = pd.DataFrame(
    users['articles'].apply(get_user_embedding_idf).tolist(),
    columns=['topic_{}'.format(i) for i in range(N_topic)],
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,1.033286,0.0,0.503952,0.410516,0.024253,0.0,0.0,1.504139,0.129353,...,2.489436,0.303972,0.0,0.323276,0.034478,0.45099,0.0,0.195123,0.356811,0.515688
1,u108690,1.341131,0.0,0.243728,0.64879,0.0,0.0,0.05637,0.196314,0.242626,...,3.260214,0.210435,0.0,0.022077,0.368805,0.449303,0.150312,0.122249,0.741269,0.199844
2,u108339,1.782292,0.156674,0.743281,0.958296,0.0,0.0,0.0,0.521658,0.333313,...,1.192499,0.119198,0.0,0.0,0.213802,0.377667,0.45213,0.0,0.859565,0.17772


In [73]:
# Left join keeps every embedded user; with no `on=` given, pandas joins on the
# columns the two frames have in common.
X = user_embeddings.merge(target, how='left')

In [74]:
# Same split settings as the other experiments: 25% hold-out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X[['topic_{}'.format(i) for i in range(N_topic)]],
    X['churn'],
    test_size=0.25,
    random_state=0,
)

In [75]:
# Refit the shared logreg estimator on the current training split.
logreg.fit(X_train, y_train)

In [76]:
# Column 1 of predict_proba for each held-out row.
preds = logreg.predict_proba(X_test)[:, 1]

In [77]:
# Metrics for the idf-scaled mean embeddings (sum divided by the number of articles).
# NOTE: append is not idempotent — re-running this cell adds a duplicate entry.
metrics.append(get_metrics(y_test, preds))
metrics

[(0.9572370486656201,
  0.5970588235294118,
  0.8285714285714286,
  0.6940170940170941),
 (0.9795522995522995,
  0.8089430894308943,
  0.8122448979591836,
  0.8105906313645621),
 (0.9797034711320425,
  0.7803030303030303,
  0.8408163265306122,
  0.8094302554027505),
 (0.960200011628583,
  0.6081871345029239,
  0.8489795918367347,
  0.7086882453151618),
 (0.9909390080818652,
  0.8505747126436781,
  0.9061224489795918,
  0.8774703557312253)]

__5. Сформировать на выходе единую таблицу, сравнивающую качество 3 разных метода получения эмбедингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score__

In [78]:
# Summary table: one row per embedding method, in the order the experiments
# were appended to `metrics` (requires exactly five entries).
columns = ['roc_auc_score', 'precision', 'recall', 'f_score']
indexes = ['mean', 'median', 'max', 'idf_w_mean', 'idf_mean']

metrics_df = pd.DataFrame(
    data=metrics,
    index=indexes,
    columns=columns,
)
metrics_df

Unnamed: 0,roc_auc_score,precision,recall,f_score
mean,0.957237,0.597059,0.828571,0.694017
median,0.979552,0.808943,0.812245,0.810591
max,0.979703,0.780303,0.840816,0.80943
idf_w_mean,0.9602,0.608187,0.84898,0.708688
idf_mean,0.990939,0.850575,0.906122,0.87747


__6. Сделать самостоятельные выводы и предположения о том, почему тот или иной способ оказался эффективнее остальных__

По метрикам - наихудшим вариантом является вычисление среднего вектора (при этом остаётся вопрос, насколько значимы различия для остальных методов). 

Так как в модели вероятностное распределение конкретной темы - признак (один из 20), на основе которого делаем предсказание, то разница между эмбеддингами, полученными с использованием медианы или среднего предполагает, что в распределении распределений вероятностей тем для одного пользователя (20 распределений по 6 объектов) медиана и среднее достаточно значимо разнесены друг от друга. То есть, для 6 статей усреднение не даёт "общей" тенденции, а скорее размывает информацию. И, возможно, по этой же причине взятие "максимального" вектора даёт сравнимые с медианой метрики.

Если зайти со стороны смысла: отписаться от чтения можно, если какие-то из последних статей сильно задели/разозлили/огорчили, либо наоборот оказались скучными. Можно предположить, что для большей эмоциональной нагруженности - определённые темы должны быть сильно выражены в тексте (т.е. иметь большую вероятность = большое число слов из темы = тема более полно раскрыта). "Скучные" тексты можно формализовать как те, в которых ни одна тема не представлена полно, либо слишком много тем представлены примерно в равной мере. И тогда более информативным окажется поиск пиков вероятностей, выраженных тем, а не усреднение.

Idf-mean даёт самые высокие метрики (idf со взвешенным усреднением даёт высокую полноту и точность, лишь незначительно превышающую точность у mean). То есть, фактор весов скорее повышает информативность признаков. И опять же, выигрыш в метриках дают способы, которые не сглаживают и не ищут общий центр масс, а скорее заостряют тенденции. И тогда скорее верно предположение о том, что некоторые темы более значимы, чем другие (в том числе - для некоторых пользователей некоторые темы более значимы, чем другие).