<a href="https://colab.research.google.com/github/VVdovichev/ML_in_Business/blob/main/HW_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install razdel
#!pip install pymorphy2
#!pip install nltk

In [None]:
import itertools
import json
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pymorphy2
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from nltk.corpus import stopwords
from razdel import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix)
from sklearn.model_selection import train_test_split

%matplotlib inline

__2.__ Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)

In [None]:
# Load the article corpus and show its size plus the first rows.
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [None]:
# Load the per-user reading history; 'articles' holds a string-encoded list of article ids.
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [None]:
# Russian stop-word list from NLTK (presumably requires the 'stopwords' corpus to be downloaded — TODO confirm on a fresh environment).
stopword_ru = stopwords.words('russian')
len(stopword_ru)  # NOTE(review): not the last expression of the cell, so this value is never displayed

# Morphological analyser; lemmatization() uses it to obtain normal forms.
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Read the project-specific stop-words, one per line.
with open('stopwords.txt', encoding='utf-8') as f:
    # Filter on the stripped value: raw lines still end with '\n', so testing the
    # unstripped line never drops blank lines and lets '' into the list.
    additional_stopwords = [w.strip() for w in f if w.strip()]
# Rebuild from the NLTK list instead of `+=` so that re-running this cell does not
# append the extra words a second time.
stopword_ru = stopwords.words('russian') + additional_stopwords
len(stopword_ru)

776

In [None]:
def clean_text(text):
    '''
    Normalise a raw text before tokenisation.

    The text is lower-cased and trimmed, Windows line breaks are removed,
    digits and punctuation are dropped, remaining line breaks (and the literal
    two-character sequences backslash-n / backslash-s) become spaces, and
    finally every run of whitespace / soft hyphens is collapsed into a single
    space.

    Parameters
    ----------
    text : any
        Raw text; non-string values are converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text.
    '''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    # Raw string: "\s" and "\|" are regex escapes, not (invalid) Python string escapes.
    # CRLF is removed outright, without inserting a space.
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)

    # Digits and punctuation in one character class (no trailing empty
    # alternative, no nested-set "[[]" that triggers a FutureWarning).
    text = re.sub(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    # "[\s+]" was a character class (whitespace OR a literal plus), so runs of
    # spaces were kept as-is; "+" must be a quantifier to collapse them.
    text = re.sub(r'[\xad\s]+', ' ', text.strip())

    return text.strip()

# word -> normal form; shared across calls so each distinct word is parsed only once
cache = {}

def lemmatization(text):
    '''
    Tokenise a text and return its lemmas with stop-words removed.

    Steps:
        [0] convert a non-`str` input to `str`
        [1] tokenise with razdel
        [2] drop one leading '-' from a token
        [3] skip tokens of a single character (or empty after step 2)
        [4] reuse the lemma from `cache` when the word was seen before
        [5] otherwise take the normal form of the first pymorphy2 parse
        [6] drop lemmas listed in `stopword_ru`

    Parameters
    ----------
    text : any
        Text to process.

    Returns
    -------
    list of str
        Lemmas in their original order, without stop-words.
    '''
    # [0]
    if not isinstance(text, str):
        text = str(text)

    # Set lookup is O(1); testing every lemma against the plain list scans the
    # whole stop-word list for each token.
    stop_set = set(stopword_ru)

    lemmas = []
    for token in tokenize(text):  # [1]
        word = token.text
        if word.startswith('-'):  # [2]
            word = word[1:]
        if len(word) <= 1:  # [3]
            continue
        lemma = cache.get(word)  # [4]
        if lemma is None:  # [5]
            lemma = cache[word] = morph.parse(word)[0].normal_form
        if lemma not in stop_set:  # [6]
            lemmas.append(lemma)

    return lemmas

In [None]:
%%time
#Запускаем очистку текста.-
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

CPU times: user 25.3 s, sys: 185 ms, total: 25.5 s
Wall time: 25.6 s


In [None]:
%%time
#Запускаем лемматизацию текста.
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 4min 46s, sys: 760 ms, total: 4min 47s
Wall time: 4min 47s


In [None]:
# Collect the per-article token lists (each 'title' value is already a list of lemmas).
texts = news['title'].tolist()

# Build the token -> id dictionary, then encode every article as a bag-of-words.
common_dictionary = Dictionary(texts)
common_corpus = list(map(common_dictionary.doc2bow, texts))

In [None]:
%%time
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

CPU times: user 52.9 s, sys: 25.5 s, total: 1min 18s
Wall time: 50.2 s


In [None]:
def get_lda_vector(text, num_topics=None):
    """
    Return the dense topic vector of one tokenised document.

    Parameters
    ----------
    text : list of str
        Document tokens, as accepted by ``common_dictionary.doc2bow``.
    num_topics : int, optional
        Length of the output vector. Defaults to ``lda.num_topics`` so the
        function follows the trained model instead of a hard-coded 25.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_topics``; topics absent from the model's
        answer for this document are filled with 0.
    """
    if num_topics is None:
        num_topics = lda.num_topics

    unseen_doc = common_dictionary.doc2bow(text)
    # lda[...] yields (topic_id, probability) pairs only for the topics it reports.
    topic_probs = dict(lda[unseen_doc])

    return np.array([topic_probs.get(i, 0) for i in range(num_topics)])

In [None]:
# One row per article: doc_id followed by its 25 topic weights.
topic_matrix = pd.DataFrame(
    [get_lda_vector(text) for text in news['title'].values],
    columns=['topic_{}'.format(i) for i in range(25)],
)
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.602959,0.0,0.088601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052257,0.0,0.0,0.086046,0.0,0.0,0.152218,0.0,0.011532,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.095165,0.0,0.093867,0.368376,0.0,0.0,0.030652,0.0,0.0,0.0,0.038913,0.0,0.0,0.354491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.032934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128164,0.334346,0.207981,0.0,0.0,0.0,0.277063,0.0,0.0,0.0,0.0
3,4898,0.0,0.0,0.0,0.0,0.011516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.835167,0.0,0.142841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4899,0.0,0.569846,0.0,0.0,0.0,0.250524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055162,0.0,0.0,0.0,0.0,0.0,0.101765,0.0,0.0,0.0


In [None]:
# Lookup table: doc_id -> array with that article's 25 topic weights.
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [None]:
# Sample raw input (one user's article list exactly as stored in the CSV), kept for ad-hoc inspection.
user_articles_list = users['articles'].iloc[33]

def get_user_embedding(user_articles_list):
    """
    Build a user vector as the element-wise mean of the topic vectors of the user's articles.

    Parameters
    ----------
    user_articles_list : str
        String-encoded list of article ids, e.g. "[293672, 293328]".

    Returns
    -------
    numpy.ndarray
        Mean over axis 0 of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    KeyError
        If an article id is not present in ``doc_dict``.
    """
    # json.loads instead of eval: the string comes from a data file and must
    # never be executed as Python code.
    article_ids = json.loads(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.mean(article_vectors, 0)

In [None]:
# One row per user: uid followed by the 25 aggregated topic weights.
# get_user_embedding is passed directly; the former extra positional `1` was
# bound to Series.apply's deprecated `convert_dtype` parameter.
user_embeddings = pd.DataFrame(
    users['articles'].apply(get_user_embedding).tolist(),
    columns=['topic_{}'.format(i) for i in range(25)],
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.008805,0.055096,0.025875,0.038961,0.071818,0.03517,0.060896,0.038711,0.195191,0.0,0.029105,0.069121,0.0,0.031009,0.033414,0.01977,0.006508,0.062074,0.0,0.015143,0.013629,0.011362,0.061413,0.093337,0.007607
1,u108690,0.031721,0.083997,0.078508,0.08625,0.008633,0.065211,0.010348,0.024731,0.12429,0.009998,0.01694,0.062906,0.008753,0.031104,0.003387,0.075332,0.001743,0.042083,0.016032,0.00291,0.060511,0.014445,0.102502,0.025496,0.002294
2,u108339,0.004524,0.074125,0.161537,0.039101,0.012498,0.050185,0.015641,0.123607,0.11029,0.0022,0.0,0.06256,0.0,0.044057,0.001871,0.011607,0.02503,0.039711,0.015737,0.012372,0.061257,0.003363,0.048285,0.04409,0.022283


In [None]:
# Load the churn labels for the users.
target = pd.read_csv("users_churn.csv")

In [None]:
# Left-join the labels onto the embeddings (the key is whatever columns both frames share).
X = user_embeddings.merge(target, how='left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.008805,0.055096,0.025875,0.038961,0.071818,0.03517,0.060896,0.038711,0.195191,0.0,0.029105,0.069121,0.0,0.031009,0.033414,0.01977,0.006508,0.062074,0.0,0.015143,0.013629,0.011362,0.061413,0.093337,0.007607,0
1,u108690,0.031721,0.083997,0.078508,0.08625,0.008633,0.065211,0.010348,0.024731,0.12429,0.009998,0.01694,0.062906,0.008753,0.031104,0.003387,0.075332,0.001743,0.042083,0.016032,0.00291,0.060511,0.014445,0.102502,0.025496,0.002294,1
2,u108339,0.004524,0.074125,0.161537,0.039101,0.012498,0.050185,0.015641,0.123607,0.11029,0.0022,0.0,0.06256,0.0,0.044057,0.001871,0.011607,0.02503,0.039711,0.015737,0.012372,0.061257,0.003363,0.048285,0.04409,0.022283,1


In [None]:
# split the data into train/test; features are the 25 topic columns, the label is 'churn'
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [None]:
logreg = LogisticRegression()
# fit the model on the training part
logreg.fit(X_train, y_train)

LogisticRegression()

In [None]:
# predictions for the test set: second column of predict_proba
preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]

array([0.14065467, 0.0413544 , 0.40839991, 0.25995009, 0.07390416,
       0.04734652, 0.10960944, 0.1448871 , 0.07955559, 0.14482044])

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# precision/recall carry one extra trailing point (precision=1, recall=0) that has
# no threshold; score only the points that do, so `ix` always indexes `thresholds`.
pr_sum = precision[:-1] + recall[:-1]
# Where precision + recall == 0 the plain formula is 0/0 = NaN and argmax would
# select that NaN; define the F-score as 0 there instead.
fscore = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.245343, F-Score=0.600, Precision=0.547, Recall=0.665


In [None]:
# Keep the metrics of the mean-embedding model under their own names; the
# following sections reuse thresholds/fscore/precision/recall/preds.
# NOTE(review): "theshold" is misspelled but the summary table refers to this exact name.
theshold_mean = thresholds[ix]
fscore_mean = fscore[ix]
precision_mean = precision[ix]
recall_mean = recall[ix]
roc_auc_score_mean = roc_auc_score(y_test, preds)

## Median

__3.__ Повторить п.2, но используя уже не медиану, а max

In [None]:
def get_user_embedding(user_articles_list):
    """
    Build a user vector as the element-wise median of the topic vectors of the user's articles.

    Parameters
    ----------
    user_articles_list : str
        String-encoded list of article ids, e.g. "[293672, 293328]".

    Returns
    -------
    numpy.ndarray
        Median over axis 0 of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    KeyError
        If an article id is not present in ``doc_dict``.
    """
    # json.loads instead of eval: the string comes from a data file and must
    # never be executed as Python code.
    article_ids = json.loads(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.median(article_vectors, 0)

In [None]:
# One row per user: uid followed by the 25 aggregated topic weights.
# get_user_embedding is passed directly; the former extra positional `1` was
# bound to Series.apply's deprecated `convert_dtype` parameter.
user_embeddings = pd.DataFrame(
    users['articles'].apply(get_user_embedding).tolist(),
    columns=['topic_{}'.format(i) for i in range(25)],
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.046767,0.0,0.023471,0.011961,0.015369,0.0,0.0,0.189118,0.0,0.0,0.0,0.0,0.010349,0.02661,0.009657,0.0,0.007883,0.0,0.0,0.0,0.0,0.010149,0.051904,0.0
1,u108690,0.005086,0.093359,0.045291,0.080832,0.0,0.060386,0.0,0.009011,0.140155,0.0,0.0,0.057789,0.0,0.028884,0.0,0.029983,0.0,0.050331,0.0,0.0,0.03139,0.0,0.095992,0.0,0.0
2,u108339,0.0,0.064452,0.132347,0.034302,0.005043,0.052746,0.009349,0.097821,0.056397,0.0,0.0,0.026845,0.0,0.041957,0.0,0.009684,0.0,0.024295,0.007282,0.0,0.0,0.0,0.058554,0.020269,0.0


In [None]:
# Left-join the labels onto the embeddings (the key is whatever columns both frames share).
X = user_embeddings.merge(target, how='left')

In [None]:
# split the data into train/test; features are the 25 topic columns, the label is 'churn'
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [None]:
logreg = LogisticRegression()
# fit the model on the training part
logreg.fit(X_train, y_train)

LogisticRegression()

In [None]:
# predictions for the test set: second column of predict_proba
preds = logreg.predict_proba(X_test)[:, 1]

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# precision/recall carry one extra trailing point (precision=1, recall=0) that has
# no threshold; score only the points that do, so `ix` always indexes `thresholds`.
pr_sum = precision[:-1] + recall[:-1]
# Where precision + recall == 0 the plain formula is 0/0 = NaN and argmax would
# select that NaN; define the F-score as 0 there instead.
fscore = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.259319, F-Score=0.710, Precision=0.674, Recall=0.751


In [None]:
# Keep the metrics of the median-embedding model under their own names; the
# following sections reuse thresholds/fscore/precision/recall/preds.
# NOTE(review): "theshold" is misspelled but the summary table refers to this exact name.
theshold_median = thresholds[ix]
fscore_median = fscore[ix]
precision_median = precision[ix]
recall_median = recall[ix]
roc_auc_score_median = roc_auc_score(y_test, preds)

## Max

In [None]:
def get_user_embedding(user_articles_list):
    """
    Build a user vector as the element-wise maximum of the topic vectors of the user's articles.

    Parameters
    ----------
    user_articles_list : str
        String-encoded list of article ids, e.g. "[293672, 293328]".

    Returns
    -------
    numpy.ndarray
        Maximum over axis 0 of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    KeyError
        If an article id is not present in ``doc_dict``.
    """
    # json.loads instead of eval: the string comes from a data file and must
    # never be executed as Python code.
    article_ids = json.loads(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.max(article_vectors, 0)

In [None]:
# One row per user: uid followed by the 25 aggregated topic weights.
# get_user_embedding is passed directly; the former extra positional `1` was
# bound to Series.apply's deprecated `convert_dtype` parameter.
user_embeddings = pd.DataFrame(
    users['articles'].apply(get_user_embedding).tolist(),
    columns=['topic_{}'.format(i) for i in range(25)],
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.052831,0.164334,0.155247,0.100042,0.244238,0.095801,0.300809,0.232269,0.488599,0.0,0.155522,0.414724,0.0,0.084013,0.076826,0.063074,0.03905,0.200853,0.0,0.090858,0.067335,0.038726,0.224447,0.321864,0.04564
1,u108690,0.160163,0.135898,0.206528,0.164047,0.051801,0.143033,0.041629,0.075,0.225649,0.059989,0.101639,0.160349,0.039736,0.072406,0.020325,0.271671,0.01046,0.097689,0.08056,0.017458,0.22935,0.086668,0.221631,0.104499,0.013763
2,u108339,0.027144,0.14701,0.29868,0.088454,0.039393,0.090054,0.048208,0.346087,0.353753,0.013202,0.0,0.185812,0.0,0.074793,0.011227,0.026116,0.132029,0.150679,0.059041,0.07423,0.286213,0.020181,0.074835,0.116159,0.071561


In [None]:
# Left-join the labels onto the embeddings (the key is whatever columns both frames share).
X = user_embeddings.merge(target, how='left')

In [None]:
# split the data into train/test; features are the 25 topic columns, the label is 'churn'
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [None]:
logreg = LogisticRegression()
# fit the model on the training part
logreg.fit(X_train, y_train)

LogisticRegression()

In [None]:
# predictions for the test set: second column of predict_proba
preds = logreg.predict_proba(X_test)[:, 1]

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# precision/recall carry one extra trailing point (precision=1, recall=0) that has
# no threshold; score only the points that do, so `ix` always indexes `thresholds`.
pr_sum = precision[:-1] + recall[:-1]
# Where precision + recall == 0 the plain formula is 0/0 = NaN and argmax would
# select that NaN; define the F-score as 0 there instead.
fscore = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.298789, F-Score=0.751, Precision=0.708, Recall=0.800


In [None]:
# Keep the metrics of the max-embedding model under their own names; the
# following section reuses thresholds/fscore/precision/recall/preds.
# NOTE(review): "theshold" is misspelled but the summary table refers to this exact name.
theshold_max = thresholds[ix]
fscore_max = fscore[ix]
precision_max = precision[ix]
recall_max = recall[ix]
roc_auc_score_max = roc_auc_score(y_test, preds)

__4__ (опциональное, если очень хочется) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес.

In [None]:
# One "document" per user: the ids of the articles read, joined with spaces.
list_docs = [' '.join(str(doc_id) for doc_id in json.loads(articles)) for articles in users['articles']]

In [None]:
# Each document is a space-separated string of article ids, so a plain whitespace split is the tokenizer.
vectorizer = TfidfVectorizer(tokenizer=str.split)
# The fit populates vocabulary_ and idf_; the returned matrix is only displayed, not stored.
vectorizer.fit_transform(list_docs)

<8000x14787 sparse matrix of type '<class 'numpy.float64'>'
	with 47979 stored elements in Compressed Sparse Row format>

In [None]:
# article id (as a string token) -> its idf weight.
# vocabulary_ maps each term to its column index and idf_ is ordered by that
# index, so the weight must be looked up by index. Zipping vocabulary_.keys()
# with idf_ pairs terms in dict insertion order with weights in column order,
# which assigns most terms the idf of a different term.
dict_idf = {term: vectorizer.idf_[index] for term, index in vectorizer.vocabulary_.items()}

In [None]:
def get_user_embedding(user_articles_list):
    """
    Build a user vector as the element-wise median of idf-weighted article topic vectors.

    Each article's topic vector is multiplied by the weight stored in
    ``dict_idf`` for that article id (1 when the id has no entry) before the
    median is taken.

    Parameters
    ----------
    user_articles_list : str
        String-encoded list of article ids, e.g. "[293672, 293328]".

    Returns
    -------
    numpy.ndarray
        Median over axis 0 of the weighted ``doc_dict`` vectors.

    Raises
    ------
    KeyError
        If an article id is not present in ``doc_dict``.
    """
    # json.loads instead of eval: the string comes from a data file and must
    # never be executed as Python code.
    article_ids = json.loads(user_articles_list)
    weighted_vectors = np.array(
        [doc_dict[doc_id] * dict_idf.get(str(doc_id), 1) for doc_id in article_ids]
    )
    return np.median(weighted_vectors, 0)

In [None]:
# One row per user: uid followed by the 25 aggregated topic weights.
# get_user_embedding is passed directly; the former extra positional `1` was
# bound to Series.apply's deprecated `convert_dtype` parameter.
user_embeddings = pd.DataFrame(
    users['articles'].apply(get_user_embedding).tolist(),
    columns=['topic_{}'.format(i) for i in range(25)],
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.415699,0.0,0.188738,0.10632,0.123587,0.0,0.0,1.611868,0.0,0.0,0.0,0.0,0.083223,0.210425,0.077653,0.0,0.063393,0.0,0.0,0.0,0.0,0.081616,0.417384,0.0
1,u108690,0.043745,0.750682,0.416807,0.677088,0.0,0.547286,0.0,0.083754,1.302629,0.0,0.0,0.509819,0.0,0.268456,0.0,0.271944,0.0,0.456902,0.0,0.0,0.286107,0.0,0.87631,0.0,0.0
2,u108339,0.0,0.57685,1.195421,0.299074,0.044825,0.46164,0.086891,0.869501,0.506572,0.0,0.0,0.249499,0.0,0.374389,0.0,0.081131,0.0,0.222371,0.062637,0.0,0.0,0.0,0.496296,0.188387,0.0


In [None]:
# Left-join the labels onto the embeddings (the key is whatever columns both frames share).
X = user_embeddings.merge(target, how='left')

In [None]:
# split the data into train/test; features are the 25 topic columns, the label is 'churn'
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [None]:
logreg = LogisticRegression()
# fit the model on the training part
logreg.fit(X_train, y_train)

LogisticRegression()

In [None]:
# predictions for the test set: second column of predict_proba
preds = logreg.predict_proba(X_test)[:, 1]

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# precision/recall carry one extra trailing point (precision=1, recall=0) that has
# no threshold; score only the points that do, so `ix` always indexes `thresholds`.
pr_sum = precision[:-1] + recall[:-1]
# Where precision + recall == 0 the plain formula is 0/0 = NaN and argmax would
# select that NaN; define the F-score as 0 there instead.
fscore = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.480034, F-Score=0.784, Precision=0.792, Recall=0.776


In [None]:
# Keep the metrics of the idf-weighted model under their own names for the summary table.
# NOTE(review): "theshold" is misspelled but the summary table refers to this exact name.
theshold_idf = thresholds[ix]
fscore_idf = fscore[ix]
precision_idf = precision[ix]
recall_idf = recall[ix]
roc_auc_score_idf = roc_auc_score(y_test, preds)

__5.__ Сформировать на выходе единую таблицу, сравнивающую качество 4 разных методов получения эмбеддингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score

In [None]:
# Summary table: one column per embedding method, one row per metric.
columns = ['Base Mean Model', 'Median Model', 'Max Model', 'Idf Model']
indices = ['Best Threshold', 'F-Score', 'Precision', 'Recall', 'ROC AUC']

# Each inner tuple is one metric across the four models, in the order of `columns`;
# every value is rounded to three decimals.
data = [
    [np.round(value, 3) for value in metric_row]
    for metric_row in (
        (theshold_mean, theshold_median, theshold_max, theshold_idf),
        (fscore_mean, fscore_median, fscore_max, fscore_idf),
        (precision_mean, precision_median, precision_max, precision_idf),
        (recall_mean, recall_median, recall_max, recall_idf),
        (roc_auc_score_mean, roc_auc_score_median, roc_auc_score_max, roc_auc_score_idf),
    )
]

df_metrics = pd.DataFrame(data=data, index=indices, columns=columns)

df_metrics

Unnamed: 0,Base Mean Model,Median Model,Max Model,Idf Model
Best Threshold,0.245,0.259,0.299,0.48
F-Score,0.6,0.71,0.751,0.784
Precision,0.547,0.674,0.708,0.792
Recall,0.665,0.751,0.8,0.776
ROC AUC,0.917,0.957,0.963,0.974


__6.__ Сделать самостоятельные выводы и предположения о том, почему тот или иной способ оказался эффективнее остальных

Лучшей моделью становится Idf_Model, взятая по медиане. Она лучше отражает тематику статей и интерес пользователей за счёт весов статей, умноженных на вес уникальных слов, содержащихся в документе.