# 2th_homework

In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords

import re
from razdel import tokenize
import pymorphy2

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

## 2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)

### Исходный код из методички:

In [2]:
users = pd.read_csv("users_articles.csv")
news = pd.read_csv("articles.csv")

# nltk.download('stopwords')
stopword_ru = stopwords.words('russian')
with open(r'stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords

morph = pymorphy2.MorphAnalyzer()

In [3]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

In [4]:
cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [5]:
%%time

#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


Wall time: 19.1 s


In [6]:
%%time

#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 2min 15s


In [7]:
%%time

#сформируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

Wall time: 7.75 s


In [8]:
%%time

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

Wall time: 29.2 s


In [9]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [10]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.71027,0.0,0.0,0.0,0.0,0.0,0.019563,0.0,...,0.0,0.0,0.0,0.116719,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.523769,0.0,0.0,0.0,0.083562,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.091903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4898,0.0,0.028886,0.540329,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09729,0.0,0.0,0.0
4,4899,0.0,0.82235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [12]:
def get_user_embedding(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.mean(user_vector, 0)
    return user_vector

In [13]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.004518,0.06607,0.029519,0.003678,0.0,0.051755,0.03419,0.0,...,0.005672,0.133824,0.0,0.055275,0.013913,0.164821,0.08851,0.042236,0.0,0.023768
1,u108690,0.0,0.057849,0.061715,0.011212,0.007467,0.0,0.029935,0.032712,0.0,...,0.022146,0.145202,0.0,0.005692,0.080201,0.094988,0.071358,0.038795,0.033027,0.004759
2,u108339,0.0,0.043439,0.069624,0.0,0.004557,0.0,0.050667,0.030596,0.006149,...,0.024338,0.056349,0.0,0.010414,0.039812,0.158112,0.107339,0.019243,0.044504,0.081482


In [14]:
target = pd.read_csv("users_churn.csv")

X = pd.merge(user_embeddings, target, 'left')

In [15]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [16]:
logreg = LogisticRegression()
#обучим 
logreg.fit(X_train, y_train)

LogisticRegression()

In [17]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

In [18]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.297593, F-Score=0.744, Precision=0.729, Recall=0.759


##### Сохраним метрики исходной модели.

In [19]:
theshold_0 = thresholds[ix]
fscore_0 = fscore[ix]
precision_0 = precision[ix]
recall_0 = recall[ix]
roc_auc_score_0 = roc_auc_score(y_test, preds)

##### Внесём изменения в функцию "get_user_embedding" согласно заданию: вычислять не среднее значение, а медианное для вектора признаков пользователя.

In [20]:
def get_user_embedding(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
#     ИЗМЕНЕНИЯ
#     user_vector = np.mean(user_vector, 0)
    user_vector = np.median(user_vector, 0)
    return user_vector

##### Применим остальные преобразования и обучим новую модель.

In [21]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.0,0.031535,0.0,0.0,0.0,0.026533,0.0,0.0,...,0.0,0.021057,0.0,0.0,0.0,0.0,0.046076,0.018146,0.0,0.0
1,u108690,0.0,0.053319,0.073458,0.0,0.0,0.0,0.0,0.020248,0.0,...,0.014887,0.102062,0.0,0.0,0.055372,0.048481,0.063623,0.0,0.024006,0.0
2,u108339,0.0,0.040743,0.036114,0.0,0.0,0.0,0.038296,0.032316,0.0,...,0.019603,0.015462,0.0,0.0,0.031613,0.146312,0.122366,0.009974,0.0,0.081229


In [22]:
X = pd.merge(user_embeddings, target, 'left')

In [23]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [24]:
logreg = LogisticRegression()
#обучим 
logreg.fit(X_train, y_train)

LogisticRegression()

##### Выполним прогнозирование.

In [25]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

In [26]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.272451, F-Score=0.795, Precision=0.744, Recall=0.853


##### Сравним метрики новой и изначальной моделей.

In [27]:
theshold_1 = thresholds[ix]
fscore_1 = fscore[ix]
precision_1 = precision[ix]
recall_1 = recall[ix]
roc_auc_score_1 = roc_auc_score(y_test, preds)

In [28]:
print(f'Base Mean Model:' \
     f'\n\tBest Threshold = {np.round(theshold_0, 3)}' \
     f'\n\tF-Score = {np.round(fscore_0, 3)}' \
     f'\n\tPrecision = {np.round(precision_0, 3)}' \
     f'\n\tRecall = {np.round(recall_0, 3)}' \
     f'\n\tROC AUC = {np.round(roc_auc_score_0, 3)}' \
     f'\n'
     f'\nNew Median Model:' \
     f'\n\tBest Threshold = {np.round(theshold_1, 3)}' \
     f'\n\tF-Score = {np.round(fscore_1, 3)}' \
     f'\n\tPrecision = {np.round(precision_1, 3)}' \
     f'\n\tRecall = {np.round(recall_1, 3)}'
     f'\n\tROC AUC = {np.round(roc_auc_score_1, 3)}')

Base Mean Model:
	Best Threshold = 0.298
	F-Score = 0.744
	Precision = 0.729
	Recall = 0.759
	ROC AUC = 0.967

New Median Model:
	Best Threshold = 0.272
	F-Score = 0.795
	Precision = 0.744
	Recall = 0.853
	ROC AUC = 0.976


##### Порог выбора класса изменился. Точность прогнозирования улучшилась в ущерб Полноте. Метрика F-Score незначительно ухудшилась.

## 3. Повторить п.2, но используя уже не медиану, а max

##### Внесём изменения в функцию "get_user_embedding" согласно заданию: вычислять не среднее значение, а максимальное для вектора признаков пользователя.

In [29]:
def get_user_embedding(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
#     ИЗМЕНЕНИЯ
#     user_vector = np.mean(user_vector, 0)
    user_vector = np.max(user_vector, 0)
    return user_vector

##### Применим остальные преобразования и обучим новую модель.

In [30]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.027107,0.232136,0.098201,0.022071,0.0,0.196888,0.182661,0.0,...,0.034032,0.394648,0.0,0.33165,0.072692,0.758645,0.227219,0.110803,0.0,0.088531
1,u108690,0.0,0.120409,0.132234,0.050176,0.044802,0.0,0.103502,0.093545,0.0,...,0.055222,0.371946,0.0,0.034154,0.212331,0.330294,0.174794,0.150093,0.098181,0.01509
2,u108339,0.0,0.109939,0.194357,0.0,0.027341,0.0,0.142424,0.060171,0.025646,...,0.060767,0.242333,0.0,0.062487,0.101118,0.335134,0.214531,0.048433,0.189861,0.175773


In [31]:
X = pd.merge(user_embeddings, target, 'left')

In [32]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [33]:
logreg = LogisticRegression()
#обучим 
logreg.fit(X_train, y_train)

LogisticRegression()

##### Выполним прогнозирование.

In [34]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

In [35]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.366854, F-Score=0.820, Precision=0.812, Recall=0.829


##### Сравним метрики новой и изначальной моделей.

In [36]:
theshold_2 = thresholds[ix]
fscore_2 = fscore[ix]
precision_2 = precision[ix]
recall_2 = recall[ix]
roc_auc_score_2 = roc_auc_score(y_test, preds)

In [37]:
print(f'Base Mean Model:' \
     f'\n\tBest Threshold = {np.round(theshold_0, 3)}' \
     f'\n\tF-Score = {np.round(fscore_0, 3)}' \
     f'\n\tPrecision = {np.round(precision_0, 3)}' \
     f'\n\tRecall = {np.round(recall_0, 3)}' \
     f'\n\tROC AUC = {np.round(roc_auc_score_0, 3)}' \
     f'\n'
     f'\nNew Max Model:' \
     f'\n\tBest Threshold = {np.round(theshold_2, 3)}' \
     f'\n\tF-Score = {np.round(fscore_2, 3)}' \
     f'\n\tPrecision = {np.round(precision_2, 3)}' \
     f'\n\tRecall = {np.round(recall_2, 3)}'
     f'\n\tROC AUC = {np.round(roc_auc_score_2, 3)}')

Base Mean Model:
	Best Threshold = 0.298
	F-Score = 0.744
	Precision = 0.729
	Recall = 0.759
	ROC AUC = 0.967

New Max Model:
	Best Threshold = 0.367
	F-Score = 0.82
	Precision = 0.812
	Recall = 0.829
	ROC AUC = 0.98


##### Порог выбора класса изменился. Все метрики улучшились.

## 4. (опциональное, если очень хочется) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес.

##### Скопируем оригинальный набор данных, чтобы выполнять преобразования на копии.

In [38]:
news_2 = news.copy()
news_2.head()

Unnamed: 0,doc_id,title
0,6,"[заместитель, председатель, правительство, рф,..."
1,4896,"[матч, финал, кубок, россия, футбол, приостано..."
2,4897,"[форвард, авангард, томаш, заборский, прокомме..."
3,4898,"[главный, тренер, кубань, юрий, красножанин, п..."
4,4899,"[решение, попечительский, совет, владивостокск..."


##### Столбец "title" представим в виде строк.

In [39]:
news_2['title_str'] = [' '.join(map(str, word)) for word in news_2['title']]
news_2.head()

Unnamed: 0,doc_id,title,title_str
0,6,"[заместитель, председатель, правительство, рф,...",заместитель председатель правительство рф серг...
1,4896,"[матч, финал, кубок, россия, футбол, приостано...",матч финал кубок россия футбол приостановить с...
2,4897,"[форвард, авангард, томаш, заборский, прокомме...",форвард авангард томаш заборский прокомментиро...
3,4898,"[главный, тренер, кубань, юрий, красножанин, п...",главный тренер кубань юрий красножанин прокомм...
4,4899,"[решение, попечительский, совет, владивостокск...",решение попечительский совет владивостокский с...


##### Инициализируем и обучим векторизатор.

In [40]:
%%time

tf_idf = TfidfVectorizer()
news_vectorized = tf_idf.fit_transform(news_2['title_str'])

Wall time: 4.66 s


##### Теперь можно получить idf для каждого слова.
##### Однако в словаре присутствуют не все значения. Некоторые мусорные биграммы не попали, поэтому итерировать подряд не получается.

In [41]:
tf_idf.idf_[tf_idf.vocabulary_['заместитель']]

3.5402215164441695

##### Также можно получить tf-idf значения для каждого слова документа.

In [42]:
[x for x in news_vectorized.getrow(0).toarray()[0] if x != 0][0:5]

[0.13150991844482543,
 0.09432359963511662,
 0.055132338031055064,
 0.1135083846056379,
 0.10476209891237813]

##### Представим все слова всех документов в виде idf слов.

In [43]:
%%time

title = news['title']

dict_words_idf = dict(zip(tf_idf.get_feature_names_out(), tf_idf.idf_))

# title_idf = title.apply(lambda x: [tf_idf.idf_[tf_idf.vocabulary_[word]] if word in tf_idf.vocabulary_.keys() else 0 for word in x])
title_idf = title.apply(lambda x: [dict_words_idf[word] if word in dict_words_idf.keys() else 0 for word in x])

Wall time: 1.62 s


##### Получили idf всех слов для каждого документа.

In [44]:
title_idf.head()

0    [3.5402215164441695, 3.488505577705544, 2.8771...
1    [3.883764251528679, 4.882860887087066, 5.54763...
2    [5.635284677576553, 6.485130310042554, 7.59271...
3    [2.8873289323008016, 4.648272289526984, 6.3914...
4    [2.5373265673335705, 7.044746097977977, 3.2181...
Name: title, dtype: object

##### Далее необходимо вычислить idf вес для каждого документа.
##### Используем среднее всех весов слов для вычисления idf веса документа, чтобы в равной мере учитывалось влияние каждого слова в документе. 

In [45]:
%%time

title_idf_mean = title_idf.apply(lambda x: np.mean(x))

title_idf_mean.head()

Wall time: 461 ms


0    4.586159
1    4.765937
2    4.673049
3    4.526728
4    4.985130
Name: title, dtype: float64

##### Далее домножим значения LDA матрицу на вычисленные idf веса.

In [46]:
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.71027,0.0,0.0,0.0,0.0,0.0,0.019563,0.0,...,0.0,0.0,0.0,0.116719,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.523769,0.0,0.0,0.0,0.083562,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.091903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4898,0.0,0.028886,0.540329,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09729,0.0,0.0,0.0
4,4899,0.0,0.82235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
%%time

topic_matrix_idf = topic_matrix.copy()

topic_matrix_idf['title_idf_mean'] = title_idf_mean

topic_matrix_idf.head(5)

Wall time: 1.98 ms


Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,title_idf_mean
0,6,0.0,0.71027,0.0,0.0,0.0,0.0,0.0,0.019563,0.0,...,0.0,0.0,0.116719,0.0,0.0,0.0,0.0,0.0,0.0,4.586159
1,4896,0.0,0.0,0.523769,0.0,0.0,0.0,0.083562,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.765937
2,4897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.091903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.673049
3,4898,0.0,0.028886,0.540329,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.09729,0.0,0.0,0.0,4.526728
4,4899,0.0,0.82235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.98513


In [48]:
%%time

for i in range(0, 25):
    topic_matrix_idf[f'topic_{i}'] = topic_matrix_idf[f'topic_{i}'] * topic_matrix_idf['title_idf_mean']

topic_matrix_idf = topic_matrix_idf.drop(columns=(['title_idf_mean']))

topic_matrix_idf.head(5)

Wall time: 11 ms


Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,3.25741,0.0,0.0,0.0,0.0,0.0,0.089721,0.0,...,0.0,0.0,0.0,0.535293,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,2.496252,0.0,0.0,0.0,0.398251,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.429469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4898,0.0,0.130761,2.445922,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.440406,0.0,0.0,0.0
4,4899,0.0,4.099521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Теперь набор данных подготовлен с применением весов idf документов.
##### Далее аналогично выполняем оставшиеся действия как в п.2.

In [49]:
doc_dict = dict(zip(topic_matrix_idf['doc_id'].values, topic_matrix_idf[['topic_{}'.format(i) for i in range(25)]].values))

In [50]:
def get_user_embedding(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
#     ИЗМЕНЕНИЯ
#     user_vector = np.mean(user_vector, 0)
    user_vector = np.median(user_vector, 0)
    return user_vector

In [51]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.0,0.167749,0.0,0.0,0.0,0.13506,0.0,0.0,...,0.0,0.093899,0.0,0.0,0.0,0.0,0.205466,0.082371,0.0,0.0
1,u108690,0.0,0.231554,0.326605,0.0,0.0,0.0,0.0,0.092317,0.0,...,0.068814,0.445967,0.0,0.0,0.24759,0.211849,0.294684,0.0,0.109769,0.0
2,u108339,0.0,0.176696,0.15443,0.0,0.0,0.0,0.164454,0.144241,0.0,...,0.090894,0.067056,0.0,0.0,0.14379,0.646114,0.534345,0.04222,0.0,0.375828


In [52]:
X = pd.merge(user_embeddings, target, 'left')

In [53]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [54]:
logreg = LogisticRegression()
#обучим 
logreg.fit(X_train, y_train)

LogisticRegression()

##### Выполним прогнозирование.

In [55]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

In [56]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.325538, F-Score=0.835, Precision=0.796, Recall=0.878


##### Сравним метрики новой и изначальной моделей.

In [57]:
theshold_3 = thresholds[ix]
fscore_3 = fscore[ix]
precision_3 = precision[ix]
recall_3 = recall[ix]
roc_auc_score_3 = roc_auc_score(y_test, preds)

In [58]:
print(f'Base Mean Model:' \
     f'\n\tBest Threshold = {np.round(theshold_0, 3)}' \
     f'\n\tF-Score = {np.round(fscore_0, 3)}' \
     f'\n\tPrecision = {np.round(precision_0, 3)}' \
     f'\n\tRecall = {np.round(recall_0, 3)}' \
     f'\n\tROC AUC = {np.round(roc_auc_score_0, 3)}' \
     f'\n'
     f'\nNew Max Model:' \
     f'\n\tBest Threshold = {np.round(theshold_3, 3)}' \
     f'\n\tF-Score = {np.round(fscore_3, 3)}' \
     f'\n\tPrecision = {np.round(precision_3, 3)}' \
     f'\n\tRecall = {np.round(recall_3, 3)}'
     f'\n\tROC AUC = {np.round(roc_auc_score_3, 3)}')

Base Mean Model:
	Best Threshold = 0.298
	F-Score = 0.744
	Precision = 0.729
	Recall = 0.759
	ROC AUC = 0.967

New Max Model:
	Best Threshold = 0.326
	F-Score = 0.835
	Precision = 0.796
	Recall = 0.878
	ROC AUC = 0.985


##### Порог выбора класса изменился. Все метрики улучшились.

## 5. Сформировать на выходе единую таблицу, сравнивающую качество 3 разных метода получения эмбедингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score

In [59]:
columns = ['Base Mean Model',
          'Median Model',
          'Max Model',
          'IDF Median Model']

indices = ['Best Threshold',
          'F-Score',
          'Precision',
          'Recall',
          'ROC AUC']

data = [[np.round(theshold_0, 3), np.round(theshold_1, 3), np.round(theshold_2, 3), np.round(theshold_3, 3)],
       [np.round(fscore_0, 3), np.round(fscore_1, 3), np.round(fscore_2, 3), np.round(fscore_3, 3)],
       [np.round(precision_0, 3), np.round(precision_1, 3), np.round(precision_2, 3), np.round(precision_3, 3)],
       [np.round(recall_0, 3), np.round(recall_1, 3), np.round(recall_2, 3), np.round(recall_3, 3)],
       [np.round(roc_auc_score_0, 3), np.round(roc_auc_score_1, 3), np.round(roc_auc_score_2, 3), np.round(roc_auc_score_3, 3)]]

df_metrics = pd.DataFrame(data=data, index=indices, columns=columns)

df_metrics

Unnamed: 0,Base Mean Model,Median Model,Max Model,IDF Median Model
Best Threshold,0.298,0.272,0.367,0.326
F-Score,0.744,0.795,0.82,0.835
Precision,0.729,0.744,0.812,0.796
Recall,0.759,0.853,0.829,0.878
ROC AUC,0.967,0.976,0.98,0.985


## 6. Сделать самостоятельные выводы и предположения о том, почему тот или ной способ оказался эффективнее остальных

#### Base Mean Model - базовая модель, использует средние значения векторов вероятностей статей для формирования признаков читателей.
В сравнении с модифицированными моделями, данная базовая модель показала наихудшие метрики качества прогнозирования. Очевидно, что использование усреднения векторов не является подходящим подходом для решения данной задачи.

#### Median Model - модель, использующая медианные значения векторов вместо средних.
Согласно метрикам качества, данный подход продемонстрировал лучшую эффективность, чем базовая усредняющая модель. Медианные значения лучше отражают интересы читателя при большом количестве прочитанных статей. Так например, пользователь мог редко читать статьи опредеённых категорий. В данном случае медиана значений векторов статей будет более объективно отражать интересы читателя.

#### Max Model - данная модель использует максимальные значения векторов.
Данный подход позволяет зафиксировать интересы читателя, если он когда-либо интересовался определёнными темами. Эффективность оказалась лучше двух предыдущих методов.

#### IDF Median Model - модель является усовершенствованной версией медианной модели засчёт применения idf весов документов.
Наилучшие результаты показала модель, учитывающая качества статей. В данном подходе вектора статей домножаются на веса, которые отражают уникальность слов в статье. Таким образом, наибольшие веса получили статьи в которых используются наиболее редкие слова, что позволяет лучше определить тематику статьи и соответственно, интересы читателя в дальнейшем.