# 2th_homework

In [44]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords

import re
from razdel import tokenize
import pymorphy2

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

## 2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)

### Исходный код из методички:

In [18]:
users = pd.read_csv("users_articles.csv")
news = pd.read_csv("articles.csv")

# nltk.download('stopwords')
stopword_ru = stopwords.words('russian')
with open(r'E:\MyDocuments\GeekBrains\11. Business ML\2th lesson\2th homework\stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords

morph = pymorphy2.MorphAnalyzer()

In [19]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

In [20]:
cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [21]:
%%time

#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


Wall time: 18.4 s


In [22]:
%%time

#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 2min 12s


In [25]:
#сформируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [28]:
%%time

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

Wall time: 28.3 s


In [30]:
# Save model to disk.
temp_file = datapath("model.lda")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

In [31]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [32]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.182175,0.040575,0.0,0.0,0.0,0.0,0.0,...,0.684349,0.085612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.356071,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.505239,0.0,0.0,0.0,0.0
2,4897,0.0,0.191091,0.0,0.0,0.095974,0.0,0.0,0.0,0.104203,...,0.0,0.054423,0.0,0.0,0.0,0.0,0.0,0.536369,0.0,0.0
3,4898,0.145756,0.059826,0.020929,0.0,0.099752,0.249674,0.0,0.0,0.0,...,0.0,0.108402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4899,0.0,0.0,0.327729,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109968,0.0,0.0


In [37]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [86]:
def get_user_embedding(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.mean(user_vector, 0)
    return user_vector

In [87]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.005326,0.11037,0.006331,0.05685,0.0,0.019125,0.0,0.0,0.028442,...,0.009654,0.244002,0.0,0.0,0.0,0.008698,0.074788,0.076235,0.030413,0.01337
1,u108690,0.0,0.180969,0.020035,0.004105,0.0,0.027041,0.0,0.001708,0.007296,...,0.020371,0.209953,0.0,0.0,0.0,0.066436,0.03567,0.013271,0.026737,0.0
2,u108339,0.033509,0.081496,0.013163,0.015112,0.003059,0.055254,0.002305,0.0,0.008384,...,0.047054,0.240364,0.0,0.0,0.0,0.015002,0.115107,0.007395,0.01457,0.0


In [88]:
target = pd.read_csv("users_churn.csv")

X = pd.merge(user_embeddings, target, 'left')

In [89]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [90]:
logreg = LogisticRegression()
#обучим 
logreg.fit(X_train, y_train)

LogisticRegression()

In [91]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

In [92]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.303952, F-Score=0.737, Precision=0.683, Recall=0.800


##### Сохраним метрики исходного кода.

In [93]:
theshold_0 = thresholds[ix]
fscore_0 = fscore[ix]
precision_0 = precision[ix]
recall_0 = recall[ix]
roc_auc_score_0 = roc_auc_score(y_test, preds)

##### Внесём изменения в функцию "get_user_embedding" согласно заданию: вычислять не среднее значение, а медианное для вектора признаков пользователя.

In [94]:
def get_user_embedding(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
#     ИЗМЕНЕНИЯ
#     user_vector = np.mean(user_vector, 0)
    user_vector = np.median(user_vector, 0)
    return user_vector

##### Применим остальные преобразования и обучим новую модель.

In [95]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.086614,0.0,0.0,0.0,0.0,0.0,0.0,0.005922,...,0.0,0.20278,0.0,0.0,0.0,0.0,0.0,0.061957,0.0,0.0
1,u108690,0.0,0.141468,0.0,0.0,0.0,0.009167,0.0,0.0,0.0,...,0.019024,0.226258,0.0,0.0,0.0,0.0,0.015088,0.008927,0.0,0.0
2,u108339,0.047042,0.07349,0.0,0.015707,0.0,0.046214,0.0,0.0,0.0,...,0.044833,0.248609,0.0,0.0,0.0,0.006965,0.070304,0.0,0.005004,0.0


In [96]:
X = pd.merge(user_embeddings, target, 'left')

In [97]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [98]:
logreg = LogisticRegression()
#обучим 
logreg.fit(X_train, y_train)

LogisticRegression()

##### Выполним прогнозирование.

In [99]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

In [100]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.338405, F-Score=0.728, Precision=0.703, Recall=0.755


##### Сравним метрики новой и изначальной моделей.

In [101]:
theshold_1 = thresholds[ix]
fscore_1 = fscore[ix]
precision_1 = precision[ix]
recall_1 = recall[ix]
roc_auc_score_1 = roc_auc_score(y_test, preds)

In [116]:
print(f'Base Mean Model:' \
     f'\n\tBest Threshold = {np.round(theshold_0, 3)}' \
     f'\n\tF-Score = {np.round(fscore_0, 3)}' \
     f'\n\tPrecision = {np.round(precision_0, 3)}' \
     f'\n\tRecall = {np.round(recall_0, 3)}' \
     f'\n\tROC AUC = {np.round(roc_auc_score_0, 3)}' \
     f'\n'
     f'\nNew Median Model:' \
     f'\n\tBest Threshold = {np.round(theshold_1, 3)}' \
     f'\n\tF-Score = {np.round(fscore_1, 3)}' \
     f'\n\tPrecision = {np.round(precision_1, 3)}' \
     f'\n\tRecall = {np.round(recall_1, 3)}'
     f'\n\tROC AUC = {np.round(roc_auc_score_1, 3)}')

Base Mean Model:
	Best Threshold = 0.304
	F-Score = 0.737
	Precision = 0.683
	Recall = 0.8
	ROC AUC = 0.963

New Median Model:
	Best Threshold = 0.338
	F-Score = 0.728
	Precision = 0.703
	Recall = 0.755
	ROC AUC = 0.962


##### Порог выбора класса изменился. Точность прогнозирования улучшилась в ущерб Полноте. Метрика F-Score незначительно ухудшилась.

## 3. Повторить п.2, но используя уже не медиану, а max

##### Внесём изменения в функцию "get_user_embedding" согласно заданию: вычислять не среднее значение, а максимальное для вектора признаков пользователя.

In [105]:
def get_user_embedding(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
#     ИЗМЕНЕНИЯ
#     user_vector = np.mean(user_vector, 0)
    user_vector = np.max(user_vector, 0)
    return user_vector

##### Применим остальные преобразования и обучим новую модель.

In [106]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.031955,0.273128,0.037989,0.239571,0.0,0.071914,0.0,0.0,0.083086,...,0.035393,0.574095,0.0,0.0,0.0,0.052188,0.341677,0.195442,0.182476,0.080222
1,u108690,0.0,0.405275,0.094509,0.02463,0.0,0.078861,0.0,0.01025,0.024385,...,0.059606,0.339532,0.0,0.0,0.0,0.342276,0.148959,0.035374,0.091472,0.0
2,u108339,0.055714,0.136218,0.042072,0.042226,0.018352,0.129375,0.013831,0.0,0.033622,...,0.104441,0.339429,0.0,0.0,0.0,0.045963,0.328085,0.023155,0.0584,0.0


In [107]:
X = pd.merge(user_embeddings, target, 'left')

In [108]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [109]:
logreg = LogisticRegression()
#обучим 
logreg.fit(X_train, y_train)

LogisticRegression()

##### Выполним прогнозирование.

In [110]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

In [111]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.345687, F-Score=0.807, Precision=0.766, Recall=0.853


##### Сравним метрики новой и изначальной моделей.

In [112]:
theshold_2 = thresholds[ix]
fscore_2 = fscore[ix]
precision_2 = precision[ix]
recall_2 = recall[ix]
roc_auc_score_2 = roc_auc_score(y_test, preds)

In [115]:
print(f'Base Mean Model:' \
     f'\n\tBest Threshold = {np.round(theshold_0, 3)}' \
     f'\n\tF-Score = {np.round(fscore_0, 3)}' \
     f'\n\tPrecision = {np.round(precision_0, 3)}' \
     f'\n\tRecall = {np.round(recall_0, 3)}' \
     f'\n\tROC AUC = {np.round(roc_auc_score_0, 3)}' \
     f'\n'
     f'\nNew Max Model:' \
     f'\n\tBest Threshold = {np.round(theshold_2, 3)}' \
     f'\n\tF-Score = {np.round(fscore_2, 3)}' \
     f'\n\tPrecision = {np.round(precision_2, 3)}' \
     f'\n\tRecall = {np.round(recall_2, 3)}'
     f'\n\tROC AUC = {np.round(roc_auc_score_2, 3)}')

Base Mean Model:
	Best Threshold = 0.304
	F-Score = 0.737
	Precision = 0.683
	Recall = 0.8
	ROC AUC = 0.963

New Max Model:
	Best Threshold = 0.346
	F-Score = 0.807
	Precision = 0.766
	Recall = 0.853
	ROC AUC = 0.982


##### Порог выбора класса изменился. Все метрики улучшились.

## 4. (опциональное, если очень хочется) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес.

## 5. Сформировать на выходе единую таблицу, сравнивающую качество 3 разных метода получения эмбедингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score

In [123]:
columns = ['Base Mean Model',
          'Median Model',
          'Max Model']

indices = ['Best Threshold',
          'F-Score',
          'Precision',
          'Recall',
          'ROC AUC']

data = [[np.round(theshold_0, 3), np.round(theshold_1, 3), np.round(theshold_2, 3)],
       [np.round(fscore_0, 3), np.round(fscore_1, 3), np.round(fscore_2, 3)],
       [np.round(precision_0, 3), np.round(precision_1, 3), np.round(precision_2, 3)],
       [np.round(recall_0, 3), np.round(recall_1, 3), np.round(recall_2, 3)],
       [np.round(roc_auc_score_0, 3), np.round(roc_auc_score_1, 3), np.round(roc_auc_score_2, 3)]]

df_metrics = pd.DataFrame(data=data, index=indices, columns=columns)

df_metrics

Unnamed: 0,Base Mean Model,Median Model,Max Model
Best Threshold,0.304,0.338,0.346
F-Score,0.737,0.728,0.807
Precision,0.683,0.703,0.766
Recall,0.8,0.755,0.853
ROC AUC,0.963,0.962,0.982


## 6. Сделать самостоятельные выводы и предположения о том, почему тот или ной способ оказался эффективнее остальных