# Финальный проект

Пример пайплайна, который можно реализовать (простыми словами):

1. Загрузка данных из БД в Jupyter Hub, обзор данных

2. Создание признаков и обучающей выборки. Например, могут быть использованы признаки о пользователе, тексты постов и прочие статистики

3. Тренировка модели на Jupyter Hub и оценка ее качества на валидационной выборке 

4. Сохранение модели 

5. Написание сервиса: загрузка модели -> получение признаков для модели по user_id -> предсказание постов, которые лайкнут -> возвращение ответа. Важно: для того, чтобы чекер отработал, необходимо загрузить и сервис, и модель одновременно.

6. Загрузка решения в чекер LMS

### Загрузка данных

In [1]:
%pip install psycopg2-binary
#%pip install sqlalchemy
import psycopg2
import pandas as pd

# Load the complete user table from the course PostgreSQL database.
# NOTE(review): the connection URL carries credentials in plain text;
# consider moving it to configuration.
df_user = pd.read_sql(
    sql="""SELECT * FROM public.user_data""",
    con=(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    ),
)

df_user.head()

In [2]:
# Load every post (with its text) from the course PostgreSQL database.
df_post = pd.read_sql(
    sql="""SELECT * FROM public.post_text_df""",
    con=(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    ),
)

df_post.head()


In [3]:
# Disabled variant of the feed loading step: the code below is wrapped in a
# string literal, so it is never executed. It selected 3,000,000 'view' rows
# without balancing by target and dropped the constant "action" column.
'''feed_data = pd.read_sql(
    """SELECT * FROM public.feed_data WHERE action = 'view' limit 3000000""",
    con="postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
)

feed_data.head()
feed_data=feed_data.drop(["action"],axis=1)'''

In [4]:
#feed_data.count()

In [5]:
#Доля записей с target=0 в исходной БД:
#68686455/76892800=0.8932755082400433
#Следовательно, если всего берем 100000 записей, то с таргетом 0 - примерно 89000
#SELECT * FROM public.feed_data WHERE target=1 order by random() limit 50000
# Negative class: 'view' events with target=0, capped at 2,670,000 rows.
# NOTE(review): LIMIT without ORDER BY does not give a random sample.
feed_data_1 = pd.read_sql(
    sql="""SELECT * FROM public.feed_data WHERE action = 'view' and target=0 limit 2670000""",
    con=(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    ),
)

feed_data_1.head()

In [6]:
#Доля записей с target=1 в исходной БД:
#8206345/76892800=0.10672449175995671
# Positive class: 'view' events with target=1, capped at 330,000 rows.
# NOTE(review): LIMIT without ORDER BY does not give a random sample.
feed_data_2 = pd.read_sql(
    sql="""SELECT * FROM public.feed_data WHERE action = 'view' and target=1 limit 330000""",
    con=(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    ),
)

feed_data_2.head()

In [7]:
# Stack the target=0 and target=1 samples into one interaction log.
# ignore_index=True rebuilds a clean 0..N-1 index; without it both halves
# keep their own labels and the result has duplicated row labels.
feed_data = pd.concat([feed_data_1, feed_data_2], ignore_index=True)
# Both source queries filter on action = 'view', so the column is constant.
feed_data = feed_data.drop(columns=["action"])
feed_data.head()


### Создание новых фичей

### TF-IDF

In [8]:
#%pip install nltk

In [9]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('omw-1.4')

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

import re
import string

from nltk.stem import WordNetLemmatizer 

wnl = WordNetLemmatizer()

def preprocessing(line, token=wnl):
    """Normalise one raw text before TF-IDF tokenisation.

    Steps: lower-case the text, replace every ASCII punctuation character
    with a space, flatten newlines into spaces and lemmatize each
    space-separated token.

    Args:
        line: the raw text.
        token: object exposing ``lemmatize(word)``; defaults to the
            module-level ``wnl`` lemmatizer.

    Returns:
        The cleaned text as a single string.
    """
    line = line.lower()
    # string.punctuation contains "\" and "]". Dropped unescaped into a
    # character class, the pair "\]" is parsed as an escaped bracket and a
    # literal backslash is never matched; re.escape keeps every character
    # literal so all punctuation is replaced.
    line = re.sub(r"[{}]".format(re.escape(string.punctuation)), " ", line)
    line = line.replace('\n\n', ' ').replace('\n', ' ')
    return ' '.join(token.lemmatize(word) for word in line.split(' '))

# Fit TF-IDF over the post texts, using the custom preprocessing function
# and dropping very common / very rare terms.
vectorizer_tfidf = TfidfVectorizer(
    max_df=0.9,
    min_df=0.003,
    stop_words='english',
    preprocessor=preprocessing,
)
tfidf_matrix = vectorizer_tfidf.fit_transform(df_post['text'])

# One scalar feature per post: the sum of its TF-IDF weights.
# The sum is taken on the sparse matrix directly; converting it to a dense
# (posts x vocabulary) array only to add up rows wastes memory.
# Assignment is positional, i.e. row i of the matrix goes to row i of df_post.
df_post['tfidf_score'] = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
df_post

### Дополнительные признаки от нейронных сетей

In [11]:
#%pip install datasets transformers

In [12]:
#%pip install torch

In [13]:
import torch
import torch.nn as nn
import numpy as np

In [14]:
#!g1.1
from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    """Return a ``(tokenizer, model)`` pair for a supported encoder family.

    Args:
        model_name: one of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        Tuple of the tokenizer and the pretrained model, both loaded from
        the checkpoint associated with ``model_name``.

    Raises:
        AssertionError: if ``model_name`` is not one of the supported names.
    """
    assert model_name in ['bert', 'roberta', 'distilbert']

    # family -> (checkpoint name, model class)
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_cls = registry[model_name]

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint)
    return tokenizer, model

In [15]:
from datasets import Dataset

# Column-oriented view of the posts: raw text plus the topic encoded as an
# integer category code (used as "label").
my_dict = dict(
    text=list(df_post['text']),
    label=list(df_post.topic.astype('category').cat.codes),
)

dataset = Dataset.from_dict(my_dict)

In [16]:
dataset

In [17]:
dataset[0]

In [18]:
dataset[0].keys()

In [19]:
#!g1.1
tokenizer, model = get_model('bert')

In [20]:
#!g1.1
# Prefer the first CUDA device and fall back to CPU when no GPU is visible.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# torch.cuda.get_device_name() raises on a machine without CUDA, which would
# defeat the CPU fallback above, so the GPU name is only queried when a CUDA
# device was actually selected.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(device))

In [21]:
#!g1.1
model = model.to(device)

In [22]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
#!g1.1
def tokenization(example):
    """Tokenize the ``'text'`` entries of a batch with the global ``tokenizer``.

    Special tokens are added, token type ids are not returned and inputs
    are truncated. Returns whatever ``tokenizer.batch_encode_plus`` returns.
    """
    texts = example['text']
    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
    )
    return encoded

dataset = dataset.map(tokenization, batched=True)

In [24]:
dataset[0].keys()

In [25]:
# Expose only the columns needed downstream, as torch tensors.
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

from torch.utils.data import DataLoader
# shuffle=False keeps batches in dataset order, so the resulting embeddings
# line up row-by-row with df_post (the dataset was built from its columns).
loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=data_collator,
    pin_memory=True,
)

In [26]:
#!g1.1
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """Run ``model`` over ``loader`` and collect one embedding per example.

    For every batch the vector at sequence position 0 of
    ``last_hidden_state`` is kept (presumably the [CLS] token for the
    BERT-style tokenizer used here - confirm if another model is plugged in).
    Inputs are moved to the global ``device``; results are gathered on CPU.

    Args:
        model: encoder whose output exposes ``'last_hidden_state'``.
        loader: iterable of batches with ``'labels'``, ``'attention_mask'``
            and ``'input_ids'`` tensors.

    Returns:
        Tuple ``(embeddings, labels)``: all per-batch embeddings concatenated
        along dim 0, and all labels (each batch unsqueezed at dim 1)
        concatenated along dim 0 and cast to float32.
    """
    model.eval()

    cls_chunks, label_chunks = [], []

    for batch in tqdm(loader):
        label_chunks.append(batch['labels'].unsqueeze(1))

        # Only the tensors the model consumes are moved to the device.
        inputs = {
            'attention_mask': batch['attention_mask'].to(device),
            'input_ids': batch['input_ids'].to(device),
        }

        hidden = model(**inputs)['last_hidden_state']
        cls_chunks.append(hidden[:, 0, :].cpu())

    stacked_embeddings = torch.cat(cls_chunks, dim=0)
    stacked_labels = torch.cat(label_chunks, dim=0).to(torch.float32)
    return stacked_embeddings, stacked_labels

In [27]:
#!g1.1
embeddings, labels = get_embeddings_labels(model, loader)

In [28]:
#!g1.1
embeddings.shape, labels.shape

### PCA Анализ

In [29]:
#!g1.1
embed=pd.DataFrame(embeddings)

In [30]:
#!g1.1
### Center the data: subtract each column's mean from that column.

column_means = embed.mean()
embed = embed - column_means

embed.head()

In [31]:
### Разложим матрицу Х на главные компоненты

from sklearn.decomposition import PCA

# Project the centered embeddings onto their first 30 principal components.
# random_state is fixed because, depending on the scikit-learn version and
# data size, PCA may pick the randomized SVD solver; without a seed the
# components (and the clusters built on them) could change between runs.
pca = PCA(n_components=30, random_state=0)

PCA_dat = pca.fit_transform(embed)

# Same data as a DataFrame (default integer column names) for inspection.
PCA_dataset = pd.DataFrame(PCA_dat)

PCA_dataset.head()

In [32]:
#Конкатим полученную матрицу с изначальной
#df_meta = pd.concat([df_post, PCA_dataset], axis=1)

## Кластеризация

In [33]:
### Применим к объектам K-means 
### с 16 кластерами 
from sklearn.cluster import KMeans
   
# Cluster the PCA-reduced embeddings into 16 groups with a fixed seed.
kmeans = KMeans(n_clusters=16, random_state=0)
kmeans = kmeans.fit(PCA_dat)


In [34]:
# NOTE: df_meta is an alias of df_post (no copy is made), so the cluster
# column added below also appears on df_post.
df_meta = df_post
df_meta['Clasters'] = kmeans.labels_

# One distance column per fitted centroid. The names are generated from the
# model's cluster count instead of a hand-written list, so they cannot drift
# out of sync with n_clusters; for 16 clusters they are identical to the
# previous literal list ('DistanceTo1thCluster' ... 'DistanceTo16thCluster').
dists_columns = [
    f'DistanceTo{i}thCluster' for i in range(1, kmeans.n_clusters + 1)
]

# kmeans.transform yields one column per cluster centre for every row.
dists_df = pd.DataFrame(
    data=kmeans.transform(PCA_dat),
    columns=dists_columns,
)

dists_df.head()

In [35]:
# Append the cluster-distance columns to the post metadata, side by side.
# Rows are matched on the index of the two frames.
df_meta = pd.concat([df_meta, dists_df], axis=1)

df_meta.head()

In [36]:
#df_meta=df_post

In [37]:
#!g1.1 embed
#df_meta = pd.concat([df_post, pd.DataFrame(embeddings)], axis=1)

In [38]:
#Конкатим полученную матрицу с изначальной
#df_meta = pd.concat([df_post, pd.DataFrame(X)], axis=1)

In [39]:
#Для обучения используем df_meta_without_text (без колонки 'text'), а на сервер загружаем df_meta
# Training copy of the post features without the raw text column.
df_meta_without_text = df_meta.drop(columns=['text'])

In [40]:
df_meta_without_text.head()

In [41]:
# Attach post features to every feed row; the left join keeps all feed rows.
df = feed_data.merge(df_meta_without_text, on='post_id', how='left')
df.head()

In [42]:
# Attach user features (left join on user_id) and index the result by the
# (user_id, post_id) pair.
df_new = (
    df.merge(df_user, on='user_id', how='left')
      .set_index(['user_id', 'post_id'])
)
df_new.head()

In [43]:
# Add time-based features: hour of day and month of the event.
# The timestamp column is parsed once and read through the vectorised .dt
# accessor instead of calling a Python lambda on every row, which is slow on
# a frame with millions of rows.
_timestamps = pd.to_datetime(df_new['timestamp'])
df_new['hour'] = _timestamps.dt.hour
df_new['month'] = _timestamps.dt.month
df_new.head()

In [44]:
# Correlation matrix over numeric/boolean columns only. df_new also holds
# string columns (e.g. 'topic', 'country'); recent pandas versions raise on
# those instead of silently skipping them, so they are filtered out first.
df_new.select_dtypes(include=['number', 'bool']).corr()

In [45]:
# Time span covered by the sample. Series.max/min run vectorised, whereas
# the builtin max()/min() iterate the column element by element in Python.
df_new.timestamp.max(), df_new.timestamp.min()

In [46]:
### Time-based split with the cutoff at 2021-12-15:
### earlier rows go to train, the rest to test.

df_train = df_new.loc[df_new.timestamp < '2021-12-15'].drop(columns='timestamp')
df_test = df_new.loc[df_new.timestamp >= '2021-12-15'].drop(columns='timestamp')

# Separate the target column from the feature matrix.
y_train, y_test = df_train['target'], df_test['target']
X_train = df_train.drop(columns='target')
X_test = df_test.drop(columns='target')

y_train.shape, y_test.shape

## Catboost

In [47]:
#%pip install catboost

In [62]:
# Columns CatBoost should treat as categorical features.
categorical_columns = [
    'topic', 'Clasters', 'gender', 'country',
    'city', 'exp_group',
    'os', 'source', 'hour', 'month',
]

from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(
    iterations=100,
    learning_rate=1,
    depth=2,
)

# The categorical column list is passed by keyword rather than as the third
# positional argument, so the intent is explicit.
catboost_model.fit(X_train, y_train, cat_features=categorical_columns)

In [67]:
# Persist the trained model in CatBoost's native binary format.
catboost_model.save_model(
    'catboost_model_with_16_clasters_3+hour_month.cbm',
    format="cbm",
)

In [66]:
### Measure model quality with ROC-AUC on both splits.
from sklearn.metrics import roc_auc_score

# Probability of the positive class (column 1 of predict_proba).
_train_scores = catboost_model.predict_proba(X_train)[:, 1]
_test_scores = catboost_model.predict_proba(X_test)[:, 1]

print(f"Качество на трейне: {roc_auc_score(y_train, _train_scores)}")
print(f"Качество на тесте: {roc_auc_score(y_test, _test_scores)}")

In [68]:

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: sequence of importance values, one per feature.
        names: sequence of feature names, aligned with ``importance``.
        model_type: label of the model, used as the title prefix.

    Returns:
        None. The chart is drawn on a new matplotlib figure.
    """
    # Pair names with importances and order them from most to least important.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(20, 16))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # A space separates the model label from the caption; previously the two
    # were glued together (e.g. "CatboostFEATURE IMPORTANCE").
    plt.title(f'{model_type} FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    
plot_feature_importance(catboost_model.feature_importances_,X_train.columns,'Catboost')

### Сохраняем полученные фичи в БД

In [69]:
#!c1.32
import psycopg2
from sqlalchemy import create_engine

# предполагаем, что DataFrame df_meta с признаками постов уже сформирован выше

# Connect to the PostgreSQL database.
# NOTE(review): the user name ends in "-ro"; confirm this account is allowed
# to create/replace tables before relying on this cell.
engine = create_engine(
    "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
    "postgres.lab.karpov.courses:6432/startml"
)

# Write the post features to table 'meta_ve', replacing it if it exists.
# NOTE(review): the DataFrame index is written too, since index= is left at
# its default - confirm the consuming service expects that extra column.
df_meta.to_sql(name='meta_ve', con=engine, if_exists='replace')


In [71]:
df_meta.shape

In [60]:
df.shape

In [61]:
df.head()