In [30]:
import pandas as pd
import numpy as np
import re
import itertools
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from razdel import tokenize
import pymorphy2
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

%matplotlib inline


__2 и 3 задание:__ Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке).

Повторить п.2, но используя уже не медиану, а max

Прежде, чем приступить к выполнению этого задания, выполним обработку новостей.

In [10]:
# Load the news articles; the text processed below is in the 'title' column,
# and 'doc_id' identifies each article.
news = pd.read_csv("materials.csv")
news.head(3)

Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
# Stop words excluded from processing: NLTK's Russian list plus a project file.
stopword_ru = stopwords.words('russian')
# Explicit encoding so the file is decoded the same way on every platform
# (assumes stopwords.txt is stored as UTF-8 — TODO confirm).
with open('stopwords.txt', encoding='utf-8') as f:
    # Filter on the *stripped* line: raw lines always contain '\n', so testing
    # the unstripped line never removed blank lines and '' ended up in the list.
    additional_stopwords = [w.strip() for w in f if w.strip()]
stopword_ru += additional_stopwords

In [4]:
# Shared morphological analyzer instance; lemmatization() calls morph.parse() on it.
morph = pymorphy2.MorphAnalyzer()

In [5]:
def clean_text(text):
    '''
    Normalize raw article text before tokenization.

    The input is coerced to ``str``, lower-cased and stripped of leading and
    trailing newlines, carriage returns and tabs. Digits and the listed
    punctuation characters are removed; line breaks, soft hyphens and other
    whitespace characters are each replaced with a space (runs of whitespace
    are not collapsed).

    Parameters
    ----------
    text : any
        Raw text; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    '''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')

    # All patterns are raw strings: the previous non-raw literal relied on the
    # invalid escapes "\s" and "\|", which newer Python versions warn about.
    # Drops CRLF pairs (and the "-<ws>CRLF|-<ws>CRLF" sequence, matched literally).
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)

    # Digits and punctuation to delete, as one character class. "[" and "]" are
    # escaped to avoid the "possible nested set" warning, and the trailing empty
    # alternative of the old pattern is gone; the matched characters are the same.
    text = re.sub(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]", '', text)

    # Real line breaks and the literal two-character sequences "\s" / "\n" become spaces.
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    # Soft hyphens and any remaining whitespace character become a plain space.
    text = re.sub(r'[\xad\s+]', ' ', text.strip())

    return text

# Memo of already lemmatized word forms: surface form -> normal form.
cache = {}

def lemmatization(text):
    '''
    Turn a text into a list of lemmas with stop words removed.

    Steps:
        [0] coerce a non-`str` input to `str`
        [1] tokenize with razdel
        [2] drop one leading '-' from a token
        [3] skip tokens of one character or less
        [4] reuse the lemma from `cache` when the word was seen before
        [5] otherwise lemmatize with `morph` and remember the result
        [6] filter out lemmas listed in `stopword_ru`

    Returns the list of remaining lemmas, in token order.
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)

    lemmas = []
    for token in tokenize(text):  # [1]
        word = token.text
        if word[0] == '-':  # [2]
            word = word[1:]
        if len(word) <= 1:  # [3]
            continue
        try:  # [4]
            lemma = cache[word]
        except KeyError:  # [5]
            lemma = morph.parse(word)[0].normal_form
            cache[word] = lemma
        lemmas.append(lemma)

    # [6]
    return [lemma for lemma in lemmas if lemma not in stopword_ru]

In [11]:
# Clean the raw article text. The function is passed directly: the former extra
# positional argument `1` was bound to Series.apply's deprecated `convert_dtype`.
news['title'] = news['title'].apply(clean_text)

In [12]:
# Replace each cleaned text with its list of lemmas. No extra positional argument:
# the former `1` was bound to Series.apply's deprecated `convert_dtype`.
news['title'] = news['title'].apply(lemmatization)

In [13]:
# Collect the documents; each entry of news['title'] is used as one token list.
texts = list(news['title'].values)

# Build the gensim vocabulary and the bag-of-words corpus from the documents.
common_dictionary = Dictionary(texts)
common_corpus = list(map(common_dictionary.doc2bow, texts))

In [14]:
%%time
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

CPU times: total: 1min 8s
Wall time: 1min 1s


Напишем функцию, которая будет нам возвращать векторное представление новости


In [15]:
def get_lda_vector(text):
    """
    Return a dense 25-element topic vector for one tokenized document.

    The document is converted to bag-of-words with `common_dictionary` and
    passed through `lda`; topics that the model does not report for the
    document get the value 0.
    """
    bow = common_dictionary.doc2bow(text)
    topic_weights = {pair[0]: pair[1] for pair in lda[bow]}
    return np.array([topic_weights.get(topic_id, 0) for topic_id in range(25)])

In [16]:
# One row per article: doc_id followed by its 25 topic weights.
topic_matrix = pd.DataFrame([get_lda_vector(doc) for doc in news['title'].values])
topic_matrix.columns = [f'topic_{i}' for i in range(25)]
# Put the article id in front of the topic columns.
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.0,0.0,0.089387,0.0,0.0,0.0,...,0.0,0.873849,0.0,0.0,0.0,0.010083,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.333155,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.107724,0.0,0.0,0.0,0.0,0.537106
2,4897,0.0,0.0,0.0,0.0,0.0,0.231215,0.0,0.0,0.0,...,0.077312,0.0,0.182922,0.0,0.0,0.0,0.453184,0.0,0.0,0.0
3,4898,0.307329,0.0,0.0,0.0,0.0,0.107512,0.15821,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.396103,0.0,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.02955,0.05985,0.0,0.0,0.0,...,0.0,0.778898,0.0,0.0,0.0,0.0,0.108302,0.0,0.0,0.0


Следующий шаг - векторные представления пользователей

In [17]:
# Load the users table: 'uid' plus 'articles', a string holding the list of
# article ids for that user (parsed later in get_user_embedding).
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [18]:
# Lookup table: doc_id -> array of that article's 25 topic weights (a row of topic_matrix).
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [19]:
def get_user_embedding(user_articles_list, option, doc_vectors=None):
    """
    Aggregate the topic vectors of a user's articles into one user vector.

    Parameters
    ----------
    user_articles_list : str or sequence
        Article ids, either as a Python-literal string such as "[1, 2, 3]"
        or as an already parsed sequence of ids.
    option : {'mean', 'median', 'max'}
        Element-wise aggregation applied across the user's articles.
    doc_vectors : mapping, optional
        Mapping ``doc_id -> topic vector``. Defaults to the module-level
        ``doc_dict``.

    Returns
    -------
    numpy.ndarray
        The aggregated vector (one value per topic).

    Raises
    ------
    ValueError
        If ``option`` is not one of the supported aggregations (previously
        this silently returned ``None``).
    KeyError
        If an article id is missing from the vector mapping.
    """
    import ast  # local import keeps this block self-contained

    aggregators = {'mean': np.mean, 'median': np.median, 'max': np.max}
    if option not in aggregators:
        raise ValueError(
            "Unknown option {!r}; expected one of {}".format(option, sorted(aggregators))
        )

    if isinstance(user_articles_list, str):
        # literal_eval only parses Python literals; unlike eval it cannot
        # execute arbitrary code that might be embedded in the data file.
        user_articles_list = ast.literal_eval(user_articles_list)

    if doc_vectors is None:
        doc_vectors = doc_dict

    article_vectors = np.array([doc_vectors[doc_id] for doc_id in user_articles_list])
    # Aggregate over axis 0 (across articles), keeping one value per topic.
    return aggregators[option](article_vectors, 0)
    

In [20]:
# Load the churn labels: one row per 'uid' with the 'churn' target used for training.
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [21]:
class UserProcessor():
    """
    Build per-user topic embeddings and join them with the churn target.

    Parameters
    ----------
    users : pandas.DataFrame
        Must contain the columns 'uid' and 'articles'.
    target : pandas.DataFrame
        Table joined to the embeddings on 'uid' in `transform`.
    option : str
        Aggregation name forwarded to `get_user_embedding`.
    """

    def __init__(self, users, target, option) -> None:
        self.users = users
        self.option = option
        self.target = target
        # Filled by fit(): DataFrame with 'uid' followed by topic_<i> columns.
        self.user_embeddings = None

    def fit(self):
        """Compute the embedding of every user and store it in `user_embeddings`."""
        # No extra positional argument to apply(): the former `1` was bound to
        # Series.apply's deprecated `convert_dtype` parameter.
        vectors = self.users['articles'].apply(
            lambda articles: get_user_embedding(articles, self.option)
        )
        embeddings = pd.DataFrame(list(vectors))
        # Name the columns after the actual vector width instead of a fixed 25.
        embeddings.columns = ['topic_{}'.format(i) for i in range(embeddings.shape[1])]
        embeddings.insert(0, 'uid', self.users['uid'].values)
        self.user_embeddings = embeddings

    def transform(self):
        """
        Return the embeddings left-joined with `target` on 'uid'.

        Raises
        ------
        RuntimeError
            If called before `fit`.
        """
        if self.user_embeddings is None:
            raise RuntimeError("UserProcessor.fit() must be called before transform()")
        # Spell out the join type and key rather than relying on argument position
        # and on whichever column names happen to be shared.
        return pd.merge(self.user_embeddings, self.target, how='left', on='uid')

In [31]:
class MLModel():
    """
    Logistic-regression churn model trained on user topic embeddings.

    `fit` builds the embeddings with `UserProcessor`, splits them into train
    and test parts and trains the classifier; `transform` returns the
    predicted probability of the positive class for the held-out test part.
    """

    def __init__(self, users, target, option) -> None:
        self.users, self.target, self.option = users, target, option
        # Populated by fit().
        self.logreg = None
        self.X_test = None
        self.y_test = None

    def fit(self):
        """Prepare the data, hold out a test split and train the classifier."""
        processor = UserProcessor(self.users, self.target, self.option)
        processor.fit()
        dataset = processor.transform()

        feature_columns = [f'topic_{i}' for i in range(25)]
        split = train_test_split(dataset[feature_columns], dataset['churn'], random_state=0)
        X_train, self.X_test, y_train, self.y_test = split

        self.logreg = LogisticRegression()
        self.logreg.fit(X_train, y_train)

    def transform(self):
        """Return the second column of predict_proba for the stored test features."""
        return self.logreg.predict_proba(self.X_test)[:, 1]


In [43]:
# Train the churn model on mean-aggregated user embeddings and keep its
# predicted probabilities for the held-out test split.
ML_mean = MLModel(users, target, 'mean')
ML_mean.fit()
ML_mean_preds = ML_mean.transform()

In [44]:
# Same pipeline with median-aggregated user embeddings.
ML_median = MLModel(users, target, 'median')
ML_median.fit()
ML_median_preds = ML_median.transform()

In [45]:
# Same pipeline with max-aggregated user embeddings.
ML_max = MLModel(users, target, 'max')
ML_max.fit()
ML_max_preds = ML_max.transform()

__Задание 5:__ Сформировать на выходе единую таблицу, сравнивающую качество разных методов получения эмбеддингов пользователей (mean, median, max, idf_mean) по метрикам roc_auc, precision, recall, f_score

In [46]:
def accuracy_metrics(y_test, preds):
    """
    Compute ROC AUC and the precision/recall/F-score at the best threshold.

    The threshold is the point of the precision-recall curve with the
    largest F-score.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    preds : array-like
        Predicted scores for the positive class.

    Returns
    -------
    list
        ``[roc_auc, f_score, precision, recall]``, the last three taken at
        the F-score-maximizing point of the curve.
    """
    precision, recall, _ = precision_recall_curve(y_test, preds)

    # F-score is defined as 0 where precision + recall == 0; dividing only where
    # the denominator is positive avoids the 0/0 RuntimeWarning (and the NaN
    # clean-up it required) while producing the same values.
    denominator = precision + recall
    fscore = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )

    # Index of the largest F-score on the curve.
    ix = np.argmax(fscore)

    return [roc_auc_score(y_test, preds), fscore[ix], precision[ix], recall[ix]]

In [47]:
# Metrics for every embedding variant, one column per variant; the row labels
# follow the order of the list returned by accuracy_metrics.
table = pd.DataFrame(
    data={
        variant: accuracy_metrics(model.y_test, variant_preds)
        for variant, model, variant_preds in (
            ('mean', ML_mean, ML_mean_preds),
            ('median', ML_median, ML_median_preds),
            ('max', ML_max, ML_max_preds),
        )
    },
    index=['roc_auc score', 'F-score', 'Precision', 'Recall'],
)
table

Unnamed: 0,mean,median,max
roc_auc score,0.934745,0.964975,0.970524
F-score,0.65411,0.756,0.772532
Precision,0.563422,0.741176,0.81448
Recall,0.779592,0.771429,0.734694


__Задание 6:__ Сделать самостоятельные выводы и предположения о том, почему тот или иной способ оказался эффективнее остальных

Вывод: наилучший результат показывает модель, в которой при расчёте user_embedding используется максимум (max): для каждой темы берётся наибольшая вероятность среди всех статей, прочитанных пользователем. Таким образом сохраняется наиболее выраженный сигнал по каждой теме, поэтому и качество модели повышается. А mean и median усредняют вероятности и размывают этот сигнал, что менее полезно при построении модели.