In [1]:
import pandas as pd
import numpy as np
import re
import itertools
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from razdel import tokenize
import pymorphy2
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, TfidfModel
from gensim.test.utils import datapath
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

%matplotlib inline


__2 и 3 задание:__ Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке).

Повторить п.2, но используя уже не медиану, а max

Прежде, чем приступить к выполнению этого задания, выполним обработку новостей.

In [2]:
news = pd.read_csv("materials.csv")
news.head(3)

Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
# Стопслова, которые исключаются из обработки
stopword_ru = stopwords.words('russian')
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords

In [4]:
morph = pymorphy2.MorphAnalyzer()

In [5]:
def clean_text(text):
    '''
    очистка текста
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [6]:
# Очистка и лемматизация текста
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


In [7]:
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

In [39]:
#сформируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [40]:
%%time
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

CPU times: total: 1min 9s
Wall time: 1min 3s


Напишем функцию, которая будет нам возвращать векторное представление новости


In [41]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [42]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.022973,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.797235,0.0,0.0
1,4896,0.0,0.0,0.051581,0.0,0.0,0.56986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.0,0.0,0.030714,0.0,0.0,...,0.236366,0.0,0.114211,0.0,0.0,0.0,0.0,0.0,0.128398,0.0
3,4898,0.0,0.0,0.0,0.0,0.0,0.0,0.173547,0.0,0.0,...,0.348599,0.0,0.0,0.0,0.0,0.0,0.080619,0.171179,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.114728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.351396


Следующий шаг - векторные представления пользователей

In [43]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [47]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [48]:
def get_user_embedding(user_articles_list, option):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    if option == 'mean':
        user_vector = np.mean(user_vector, 0)
        return user_vector
    if option == 'median':
        user_vector = np.median(user_vector, 0)
        return user_vector
    if option == 'max':
        user_vector = np.max(user_vector, 0)
        return user_vector
    

In [49]:
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [50]:
class UserProcessor():
    def __init__(self, users, target, option) -> None:
        self.users = users
        self.option = option
        self.target = target
        self.user_embeddings = None

    def fit(self):
        self.user_embeddings = pd.DataFrame([i for i in self.users['articles'].apply(lambda x: get_user_embedding(x, self.option), 1)])
        self.user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
        self.user_embeddings['uid'] = self.users['uid'].values
        self.user_embeddings = self.user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
    
    def transform(self):
        X = pd.merge(self.user_embeddings, self.target, 'left')
        return X

In [51]:
class MLModel():
    def __init__(self, users, target, option) -> None:
        self.users = users
        self.target = target
        self.option = option
        self.logreg = None
        self.X_test = None
        self.y_test = None
    
    def fit(self):
        X = UserProcessor(self.users, self.target, self.option)
        X.fit()
        X = X.transform()
        X_train, self.X_test, y_train, self.y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)
        self.logreg = LogisticRegression()
        self.logreg.fit(X_train, y_train)

    def transform(self):
        preds = self.logreg.predict_proba(self.X_test)[:, 1]
        return preds


In [52]:
ML_mean = MLModel(users, target, 'mean')
ML_mean.fit()
ML_mean_preds = ML_mean.transform()

In [53]:
ML_median = MLModel(users, target, 'median')
ML_median.fit()
ML_median_preds = ML_median.transform()

In [54]:
ML_max = MLModel(users, target, 'max')
ML_max.fit()
ML_max_preds = ML_max.transform()

__Задание 5:__ Сформировать на выходе единую таблицу, сравнивающую качество 3 разных метода получения эмбедингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score

In [55]:
def accuracy_metrics(y_test, preds):
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    fscore = np.nan_to_num(fscore, nan=0)
    # locate the index of the largest f score
    ix = np.argmax(fscore)

    return [roc_auc_score(y_test, preds), fscore[ix], precision[ix], recall[ix]]
    
    # print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
    #                                                                     fscore[ix],
    #                                                                     precision[ix],
    #                                                                     recall[ix]))
    # print(f'ROC_AUC score = {roc_auc_score(y_test, preds)}')

In [56]:
table = {'mean': accuracy_metrics(ML_mean.y_test, ML_mean_preds),
        'median': accuracy_metrics(ML_median.y_test, ML_median_preds),
        'max': accuracy_metrics(ML_max.y_test, ML_max_preds)}

table = pd.DataFrame(data=table, index=['roc_auc score', 'F-score', 'Precision', 'Recall'])
table

Unnamed: 0,mean,median,max
roc_auc score,0.928784,0.9599,0.958958
F-score,0.628571,0.707865,0.720682
Precision,0.55873,0.653979,0.754464
Recall,0.718367,0.771429,0.689796


__Задание 6:__ Сделать самостоятельные выводы и предположения о том, почему тот или ной способ оказался эффективнее остальных

Вывод: наилучший результат в той модели, где в рассчете user_embedding используется median и max

__Задание 4:__ (опциональное, если очень хочется) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес

Считаем tf-idf значения для каждого слова в каждом документе.

In [57]:
tfidf_model = TfidfModel(common_corpus)
common_corpus_tfidf = [tfidf_model[doc] for doc in common_corpus]

Тренируем LDA на tf-idf корпусе.

In [58]:
lda_tfidf = LdaModel(common_corpus_tfidf, num_topics=25, id2word=common_dictionary)#, passes=10)

Получаем эмбеддинги документов по темам, с учетом подсчета для нового документа значений tf-idf.

In [59]:
def get_lda_vector_tfidf(text):
    unseen_doc = tfidf_model[common_dictionary.doc2bow(text)]
    lda_tuple = lda_tfidf[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [60]:
topic_matrix_tfidf = pd.DataFrame([get_lda_vector_tfidf(text) for text in news['title'].values])
topic_matrix_tfidf.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix_tfidf['doc_id'] = news['doc_id'].values
topic_matrix_tfidf = topic_matrix_tfidf[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
doc_dict = dict(zip(topic_matrix_tfidf['doc_id'].values, topic_matrix_tfidf[['topic_{}'.format(i) for i in range(25)]].values))

In [61]:
ML_tfidf_mean = MLModel(users, target, 'mean')
ML_tfidf_mean.fit()
ML_tfidf_mean_preds = ML_tfidf_mean.transform()

In [62]:
ML_tfidf_median = MLModel(users, target, 'median')
ML_tfidf_median.fit()
ML_tfidf_median_preds = ML_tfidf_median.transform()

In [63]:
ML_tfidf_max = MLModel(users, target, 'max')
ML_tfidf_max.fit()
ML_tfidf_max_preds = ML_tfidf_max.transform()

In [64]:
table = {'mean': accuracy_metrics(ML_mean.y_test, ML_mean_preds),
        'median': accuracy_metrics(ML_median.y_test, ML_median_preds),
        'max': accuracy_metrics(ML_max.y_test, ML_max_preds),
        'tfidf_mean': accuracy_metrics(ML_tfidf_mean.y_test, ML_tfidf_mean_preds),
        'tfidf_median': accuracy_metrics(ML_tfidf_median.y_test, ML_tfidf_median_preds),
        'tfidf_max': accuracy_metrics(ML_tfidf_max.y_test, ML_tfidf_max_preds)}

table = pd.DataFrame(data=table, index=['roc_auc score', 'F-score', 'Precision', 'Recall'])
table

  fscore = (2 * precision * recall) / (precision + recall)
  fscore = (2 * precision * recall) / (precision + recall)


Unnamed: 0,mean,median,max,tfidf_mean,tfidf_median,tfidf_max
roc_auc score,0.928784,0.9599,0.958958,0.901324,0.904613,0.873988
F-score,0.628571,0.707865,0.720682,0.55163,0.574871,0.556
Precision,0.55873,0.653979,0.754464,0.413442,0.497024,0.545098
Recall,0.718367,0.771429,0.689796,0.828571,0.681633,0.567347


Tfidf модель дала плохие результаты, попробуем немного другой алгоритм

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
users['articles']

0       [293672, 293328, 293001, 293622, 293126, 1852]
1               [3405, 1739, 2972, 1158, 1599, 322665]
2               [1845, 2009, 2356, 1424, 2939, 323389]
3               [5933, 6186, 5055, 6977, 5206, 488389]
4                [707, 1144, 2532, 2928, 3133, 324592]
                             ...                      
7995     [323918, 323362, 323704, 323452, 324291, 251]
7996            [5532, 5860, 7755, 7140, 5182, 488337]
7997    [322811, 323898, 321858, 323345, 323491, 2193]
7998            [5436, 6092, 6891, 7045, 5320, 487379]
7999    [294096, 293759, 294178, 293544, 293921, 2909]
Name: articles, Length: 8000, dtype: object

In [67]:
N_features=100
tf = TfidfVectorizer(stop_words=stopword_ru, max_features=N_features)

In [82]:
news['title_str'] = news['title'].apply(lambda x: ' '.join(x))
news['title_str']

0        заместитель председатель правительство рф серг...
1        матч финал кубок россия футбол приостановить с...
2        форвард авангард томаш заборский прокомментиро...
3        главный тренер кубань юрий красножанин прокомм...
4        решение попечительский совет владивостокский с...
                               ...                        
26995    учёный токийский университет морской наука тех...
26996    глава кафедра отечественный история xx век ист...
26997    американский учёный уточнить возраст расположи...
26998    последний год тропический углеродный цикл стат...
26999    жить примерно тыс год назад территория совреме...
Name: title_str, Length: 27000, dtype: object

In [83]:
tfidf_titles = tf.fit_transform(news['title_str'])
tfidf_titles

<27000x100 sparse matrix of type '<class 'numpy.float64'>'
	with 651766 stored elements in Compressed Sparse Row format>

In [84]:
tfidf_matrix = pd.DataFrame.sparse.from_spmatrix(tfidf_titles, columns=sorted(tf.vocabulary_))
tfidf_matrix.columns = ['word_{}'.format(i) for i in range(N_features)]  # changing names of columns from real words 
                                                                         # to word_0, word_1, word_2...
tfidf_matrix.head(2)

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,...,word_90,word_91,word_92,word_93,word_94,word_95,word_96,word_97,word_98,word_99
0,0.544075,0.0,0.0,0.246495,0.0,0.0,0.0,0.171238,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086973,0.0
1,0.0,0.686993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17153,0.0


In [85]:
tfidf_matrix['doc_id'] = news['doc_id'].values
tfidf_matrix.head(2)

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,...,word_91,word_92,word_93,word_94,word_95,word_96,word_97,word_98,word_99,doc_id
0,0.544075,0.0,0.0,0.246495,0.0,0.0,0.0,0.171238,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086973,0.0,6
1,0.0,0.686993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17153,0.0,4896


In [86]:
doc_dict = dict(zip(tfidf_matrix['doc_id'].values, tfidf_matrix[['word_{}'.format(i) for i in range(N_features)]].values))

In [88]:
class UserProcessor2(UserProcessor):
    def __init__(self, users, target, option) -> None:
        super().__init__(users, target, option)
    def fit(self):
        self.user_embeddings = pd.DataFrame([i for i in self.users['articles'].apply(lambda x: get_user_embedding(x, self.option), 1)])
        self.user_embeddings.columns = ['topic_{}'.format(i) for i in range(N_features)]
        self.user_embeddings['uid'] = self.users['uid'].values
        self.user_embeddings = self.user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]

In [89]:
class MLModel2(MLModel):
    def __init__(self, users, target, option) -> None:
        super().__init__(users, target, option)

    def fit(self):
        X = UserProcessor2(self.users, self.target, self.option)
        X.fit()
        X = X.transform()
        X_train, self.X_test, y_train, self.y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)
        self.logreg = LogisticRegression()
        self.logreg.fit(X_train, y_train)

In [91]:
ML_tfidf_median_version2 = MLModel2(users, target, 'median')
ML_tfidf_median_version2.fit()
ML_tfidf_median_preds_version2 = ML_tfidf_median_version2.transform()

In [92]:
ML_tfidf_mean_version2 = MLModel2(users, target, 'mean')
ML_tfidf_mean_version2.fit()
ML_tfidf_mean_preds_version2 = ML_tfidf_mean_version2.transform()

In [93]:
ML_tfidf_max_version2 = MLModel2(users, target, 'max')
ML_tfidf_max_version2.fit()
ML_tfidf_max_preds_version2 = ML_tfidf_max_version2.transform()

In [94]:
table = {'mean': accuracy_metrics(ML_mean.y_test, ML_mean_preds),
        'median': accuracy_metrics(ML_median.y_test, ML_median_preds),
        'max': accuracy_metrics(ML_max.y_test, ML_max_preds),
        'tfidf_mean': accuracy_metrics(ML_tfidf_mean.y_test, ML_tfidf_mean_preds),
        'tfidf_median': accuracy_metrics(ML_tfidf_median.y_test, ML_tfidf_median_preds),
        'tfidf_max': accuracy_metrics(ML_tfidf_max.y_test, ML_tfidf_max_preds),
        'tfidf_mean_version2': accuracy_metrics(ML_tfidf_mean_version2.y_test, ML_tfidf_mean_preds_version2),
        'tfidf_median_version2': accuracy_metrics(ML_tfidf_median_version2.y_test, ML_tfidf_median_preds_version2),
        'tfidf_max_version2': accuracy_metrics(ML_tfidf_max_version2.y_test, ML_tfidf_max_preds_version2)}

table = pd.DataFrame(data=table, index=['roc_auc score', 'F-score', 'Precision', 'Recall'])
table

  fscore = (2 * precision * recall) / (precision + recall)


Unnamed: 0,mean,median,max,tfidf_mean,tfidf_median,tfidf_max,tfidf_mean_version2,tfidf_median_version2,tfidf_max_version2
roc_auc score,0.928784,0.9599,0.958958,0.901324,0.992921,0.873988,0.977222,0.992921,0.819506
F-score,0.628571,0.707865,0.720682,0.55163,0.900196,0.556,0.81336,0.900196,0.454701
Precision,0.55873,0.653979,0.754464,0.413442,0.864662,0.545098,0.784091,0.864662,0.391176
Recall,0.718367,0.771429,0.689796,0.828571,0.938776,0.567347,0.844898,0.938776,0.542857


Самым лучшим алгоритмо оказался tfidf второй версии и использованием медианы в расчетах user_embedding