In [1]:
import re
import string
import os
import gc
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from datetime import datetime as dt

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sqlalchemy.engine.base import Engine
from yaml import safe_load

from database.database import engine
from config.config import PG_USER, PG_PASS, PG_HOST, PG_PORT, PG_DATABASE


# Let wide frames and long text cells render without truncation:
# up to 100 columns and up to 100 characters per cell.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 100

In [2]:
def select(query: str, conn: Engine=engine) -> pd.DataFrame:
    """Run a SQL query and return its result set as a DataFrame.

    Args:
        query: SQL text handed to ``pd.read_sql``.
        conn: Connection the query is executed on; defaults to the
            imported ``engine``.

    Returns:
        The rows produced by ``query``.
    """
    result = pd.read_sql(sql=query, con=conn)
    return result
    
# Each source table is loaded right after its query is defined.
users_query = "SELECT * FROM public.user_data"
users_df = select(query=users_query)

posts_query = "SELECT * FROM public.post_text_df"
posts_df = select(query=posts_query)

# NOTE(review): LIMIT without ORDER BY — which 5,000,000 'view' rows come back
# is up to the database; confirm an arbitrary sample is acceptable.
feeds_query = "SELECT * FROM public.feed_data WHERE action = 'view' LIMIT 5000000"
feeds_df = select(query=feeds_query)

In [3]:
# (rows, columns) of the users, posts and feed tables, in that order.
tuple(frame.shape for frame in (users_df, posts_df, feeds_df))

((163205, 8), (7023, 3), (5000000, 5))

In [4]:
# Peek at the first two posts.
posts_df.head(n=2)

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufacturing sector will continue to face serious chall...,business
1,2,Aids and climate top Davos agenda\n\nClimate change and the fight against Aids are leading the l...,business


In [5]:
# Fetch the NLTK resources used by the text preprocessing below.
# 'punkt_tab' is requested in addition to 'punkt': recent NLTK releases load the
# tokenizer model used by word_tokenize from that resource, so without it a
# fresh-kernel run can fail with LookupError.
for resource in ('wordnet', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


# ASCII punctuation characters; tokens made of these are discarded during preprocessing.
punctuation = string.punctuation
# One lemmatizer instance is shared by every preprocessing call.
lemmatizer = WordNetLemmatizer()
# English stop words, held in a set so membership checks are cheap.
stop_words = {*stopwords.words('english')}


def preprocessing(text):
    """Normalise one raw post into a space-joined string of lemmas.

    The text is lower-cased and split with ``word_tokenize``. A token is
    dropped when it consists solely of punctuation characters, is listed in
    ``stop_words`` or contains a digit. The surviving tokens are lemmatized
    with the module-level ``lemmatizer``.

    Args:
        text: Raw document text (a ``str``).

    Returns:
        The kept, lemmatized tokens joined by single spaces.
    """
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        # `punctuation` is a string, so `token in punctuation` would be a
        # substring test and let multi-character tokens such as "``" or "..."
        # through; checking character by character drops every
        # pure-punctuation token.
        if not all(char in punctuation for char in token)
        and token not in stop_words
        and not re.search(r'\d', token)
    ]
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)


# Fit TF-IDF on the raw post texts; `preprocessing` is plugged in as the
# vectorizer's preprocessor. toarray() materialises the full dense
# posts x vocabulary matrix.
vectorizer = TfidfVectorizer(preprocessor=preprocessing)
transformed_output = vectorizer.fit_transform(posts_df['text']).toarray()
tfidf_df = pd.DataFrame(
    data=transformed_output,
    index=posts_df['post_id'],
    columns=vectorizer.get_feature_names_out(),
)

# Per-post summary statistics over all vocabulary weights. The post_id index is
# dropped first so the values line up with posts_df's own row index.
posts_df = posts_df.assign(
    TotalTfIdf=tfidf_df.sum(axis=1).reset_index(drop=True),
    MaxTfIdf=tfidf_df.max(axis=1).reset_index(drop=True),
    MeanTfIdf=tfidf_df.mean(axis=1).reset_index(drop=True),
)


# Mean-centre every vocabulary column, then project the posts onto
# 30 principal components.
centered = tfidf_df - tfidf_df.mean(axis=0)
# A fixed seed keeps the components (and everything derived from them)
# reproducible when scikit-learn picks a randomized SVD solver for this matrix.
pca = PCA(n_components=30, random_state=0)
pca_decomp = pca.fit_transform(centered)


# Cluster the PCA-reduced post vectors into 20 groups (seeded) and store each
# post's cluster label.
kmeans = KMeans(n_clusters=20, random_state=0)
kmeans.fit(pca_decomp)
posts_df['TextCluster'] = kmeans.labels_
# Column labels for the 20 per-cluster distance features, numbered from 1:
# 'DistanceTo1thCluster' ... 'DistanceTo20thCluster'.
dists_columns = [f'DistanceTo{cluster_no}thCluster' for cluster_no in range(1, 21)]
# KMeans.transform output for every post, one named column per cluster.
kmeans_df = pd.DataFrame(kmeans.transform(pca_decomp), columns=dists_columns)
kmeans_df.head(n=5)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/radionnazmiev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/radionnazmiev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/radionnazmiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster,DistanceTo16thCluster,DistanceTo17thCluster,DistanceTo18thCluster,DistanceTo19thCluster,DistanceTo20thCluster
0,0.493781,0.715633,0.460887,0.489805,0.57801,0.388478,0.453619,0.471042,0.458475,0.510455,0.47485,0.554643,0.578019,0.5583,0.127352,0.495808,0.544718,0.523475,0.445957,0.46609
1,0.378274,0.623627,0.341158,0.372749,0.370449,0.240121,0.327436,0.373174,0.335957,0.399326,0.341067,0.461655,0.493461,0.466838,0.360706,0.38007,0.418891,0.436329,0.229788,0.334372
2,0.422594,0.624769,0.373712,0.403963,0.552234,0.22165,0.35757,0.395152,0.382942,0.426297,0.403015,0.490348,0.512511,0.507645,0.263905,0.412574,0.453416,0.465161,0.377739,0.406806
3,0.425389,0.598581,0.374433,0.417938,0.548786,0.259246,0.373911,0.408509,0.388838,0.43994,0.406383,0.47929,0.478455,0.516361,0.406711,0.410089,0.4731,0.489005,0.371121,0.417855
4,0.274426,0.559385,0.194327,0.264999,0.450796,0.098348,0.180611,0.241464,0.248593,0.294536,0.236393,0.364906,0.404367,0.403453,0.344585,0.267411,0.366204,0.362183,0.225888,0.277925


In [6]:
# Append the cluster-distance columns to the post table; rows are matched on the index.
posts_df = pd.concat([posts_df, kmeans_df], axis='columns')
posts_df.head(n=2)

Unnamed: 0,post_id,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster,DistanceTo16thCluster,DistanceTo17thCluster,DistanceTo18thCluster,DistanceTo19thCluster,DistanceTo20thCluster
0,1,UK economy facing major risks\n\nThe UK manufacturing sector will continue to face serious chall...,business,9.015487,0.489283,0.000208,14,0.493781,0.715633,0.460887,0.489805,0.57801,0.388478,0.453619,0.471042,0.458475,0.510455,0.47485,0.554643,0.578019,0.5583,0.127352,0.495808,0.544718,0.523475,0.445957,0.46609
1,2,Aids and climate top Davos agenda\n\nClimate change and the fight against Aids are leading the l...,business,12.08276,0.3137,0.000279,18,0.378274,0.623627,0.341158,0.372749,0.370449,0.240121,0.327436,0.373174,0.335957,0.399326,0.341067,0.461655,0.493461,0.466838,0.360706,0.38007,0.418891,0.436329,0.229788,0.334372


In [7]:
# Store the enriched post table through `engine`, replacing any existing table
# with the same name.
# NOTE(review): `index` is not set, so pandas' default also writes the frame
# index as a column — confirm downstream readers expect it.
posts_df.to_sql(
    name="posts_info_by_radion_nazmiev",
    con=engine,
    schema="public",
    if_exists='replace',
)

23

In [8]:
# Drop the intermediate objects of the text-feature stage before the large merge.
# Only data objects are deleted. Imported names (nltk, PCA, KMeans) stay bound:
# unbinding an import releases no meaningful memory and makes the earlier cells
# raise NameError if they are run again.
del users_query, posts_query, feeds_query
del punctuation, stop_words, lemmatizer
del vectorizer, transformed_output, tfidf_df
del centered, pca, pca_decomp
del kmeans, dists_columns, kmeans_df

gc.collect()

321

In [9]:
# Join post features onto every feed event, then put the user attributes in
# front. Both joins are inner, so only events with a matching post and user
# remain. The inner merge is nested so no extra full-size frame stays bound
# to a name.
df = users_df.merge(
    feeds_df.merge(posts_df, on='post_id', how='inner'),
    on='user_id',
    how='inner',
)

df.shape

(5000000, 38)

In [10]:
# Parse the event time and derive the hour and month features from it.
df['timestamp'] = pd.to_datetime(df['timestamp'])
for part in ('hour', 'month'):
    df[part] = getattr(df['timestamp'].dt, part)

# The source tables are no longer needed once they are merged into `df`.
del feeds_df, posts_df, users_df

# Key every row by (user, post).
df = df.set_index(keys=['user_id', 'post_id'])

df.head(n=2)

Unnamed: 0_level_0,Unnamed: 1_level_0,gender,age,country,city,exp_group,os,source,timestamp,action,target,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster,DistanceTo16thCluster,DistanceTo17thCluster,DistanceTo18thCluster,DistanceTo19thCluster,DistanceTo20thCluster,hour,month
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
4458,5879,1,29,Kazakhstan,Almaty,0,iOS,ads,2021-11-01 07:11:23,view,1,"This movie and several other movies from the 1950s with a religious overtone, such as The Robe, ...",movie,6.809196,0.396121,0.000157,9,0.301202,0.629608,0.234056,0.262895,0.496206,0.246154,0.173163,0.30202,0.327792,0.138855,0.317836,0.38976,0.436557,0.426288,0.408458,0.291057,0.416121,0.39187,0.300563,0.304662,7,11
4458,5114,1,29,Kazakhstan,Almaty,0,iOS,ads,2021-12-08 10:55:41,view,0,"I seriously love this film so much, I never get sick of watching it. The only line I really cant...",movie,7.267828,0.44657,0.000168,3,0.245933,0.652321,0.272871,0.131867,0.509239,0.293383,0.149638,0.296914,0.339495,0.202056,0.318013,0.417948,0.465748,0.398326,0.435038,0.326735,0.425509,0.408675,0.337306,0.325758,10,12


In [11]:
# Column labels of the merged frame (user_id and post_id now form the index).
df.columns

Index(['gender', 'age', 'country', 'city', 'exp_group', 'os', 'source',
       'timestamp', 'action', 'target', 'text', 'topic', 'TotalTfIdf',
       'MaxTfIdf', 'MeanTfIdf', 'TextCluster', 'DistanceTo1thCluster',
       'DistanceTo2thCluster', 'DistanceTo3thCluster', 'DistanceTo4thCluster',
       'DistanceTo5thCluster', 'DistanceTo6thCluster', 'DistanceTo7thCluster',
       'DistanceTo8thCluster', 'DistanceTo9thCluster', 'DistanceTo10thCluster',
       'DistanceTo11thCluster', 'DistanceTo12thCluster',
       'DistanceTo13thCluster', 'DistanceTo14thCluster',
       'DistanceTo15thCluster', 'DistanceTo16thCluster',
       'DistanceTo17thCluster', 'DistanceTo18thCluster',
       'DistanceTo19thCluster', 'DistanceTo20thCluster', 'hour', 'month'],
      dtype='object')

In [12]:
# Latest and earliest event time. The vectorised Series reductions avoid
# iterating over every row in Python, which the built-in max()/min() do.
df.timestamp.max(), df.timestamp.min()

(Timestamp('2021-12-29 23:43:27'), Timestamp('2021-10-01 06:05:25'))

In [13]:
# Time-based split: events before the cut-off date train the model, events on
# or after it are held out for testing.
split_date = '2021-12-15'
dropped_cols = ['timestamp', 'action', 'text']

df_train = df.loc[df.timestamp < split_date].drop(columns=dropped_cols)
df_test = df.loc[df.timestamp >= split_date].drop(columns=dropped_cols)

# Separate the features from the label column.
X_train, y_train = df_train.drop(columns='target'), df_train['target']
X_test, y_test = df_test.drop(columns='target'), df_test['target']
del df, df_train, df_test

y_train.shape, y_test.shape

((4173595,), (826405,))

In [14]:
# Columns treated as categorical features.
object_cols = [
    'topic', 'TextCluster', 'gender', 'country',
    'city', 'exp_group', 'hour', 'month',
    'os', 'source'
]

# Count distinct values once per column: nunique() scans every training row, so
# the result is reused for both sides of the split below.
n_unique = {col: X_train[col].nunique() for col in object_cols}
# Fewer than 5 distinct values -> one-hot encoding; otherwise target encoding.
cols_for_ohe = [col for col in object_cols if n_unique[col] < 5]
cols_for_mte = [col for col in object_cols if n_unique[col] >= 5]

# The transformer is addressed by column position; the column list is built
# once instead of once per lookup.
column_order = list(X_train.columns)
cols_for_ohe_idx = [column_order.index(col) for col in cols_for_ohe]
cols_for_mte_idx = [column_order.index(col) for col in cols_for_mte]

t = [
    ('OneHotEncoder', OneHotEncoder(), cols_for_ohe_idx),
    ('MeanTargetEncoder', TargetEncoder(), cols_for_mte_idx)
]

# NOTE(review): `remainder` is left at scikit-learn's default ('drop'), so every
# column outside object_cols (age, TF-IDF statistics, cluster distances) never
# reaches the tree — confirm this is intended for the baseline.
col_transform = ColumnTransformer(transformers=t)

pipe_dt = Pipeline([
    ("column_transformer", col_transform),
    # Seeded so the baseline's scores are reproducible between runs.
    ("decision_tree", DecisionTreeClassifier(random_state=0)),
])

pipe_dt.fit(X_train, y_train)

  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():


In [15]:
# ROC AUC of the decision-tree pipeline, computed from the positive-class
# probability (column 1 of predict_proba): training split first, then test.
dt_train_auc = roc_auc_score(y_train, pipe_dt.predict_proba(X_train)[:, 1])
print(f"Качество на трейне: {dt_train_auc}")
dt_test_auc = roc_auc_score(y_test, pipe_dt.predict_proba(X_test)[:, 1])
print(f"Качество на тесте: {dt_test_auc}")

Качество на трейне: 0.9227653337254538
Качество на тесте: 0.5335871757173084


In [16]:
# Gradient-boosted model: 100 trees of depth 2 with a fixed seed.
# NOTE(review): learning_rate=1 is an unusually large step size — confirm it is deliberate.
catboost = CatBoostClassifier(
    iterations=100,
    learning_rate=1,
    depth=2,
    random_seed=100,
)

# object_cols is passed as the categorical feature list, so CatBoost handles
# these columns itself.
catboost.fit(X_train, y_train, cat_features=object_cols, logging_level='Verbose')

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0:	learn: 0.3563631	total: 588ms	remaining: 58.2s
1:	learn: 0.3496658	total: 1.11s	remaining: 54.4s
2:	learn: 0.3484201	total: 1.59s	remaining: 51.5s
3:	learn: 0.3476475	total: 2.01s	remaining: 48.2s
4:	learn: 0.3473756	total: 2.39s	remaining: 45.5s
5:	learn: 0.3472236	total: 2.79s	remaining: 43.8s
6:	learn: 0.3471294	total: 3.17s	remaining: 42.1s
7:	learn: 0.3464463	total: 3.58s	remaining: 41.2s
8:	learn: 0.3461976	total: 3.97s	remaining: 40.1s
9:	learn: 0.3460548	total: 4.35s	remaining: 39.2s
10:	learn: 0.3460050	total: 4.75s	remaining: 38.4s
11:	learn: 0.3459813	total: 5.07s	remaining: 37.2s
12:	learn: 0.3459321	total: 5.45s	remaining: 36.5s
13:	learn: 0.3459005	total: 5.8s	remaining: 35.6s
14:	learn: 0.3458872	total: 6.15s	remaining: 34.8s
15:	learn: 0.3457022	total: 6.55s	remaining: 34.4s
16:	learn: 0.3448128	total: 6.94s	remaining: 33.9s
17:	learn: 0.3446198	total: 7.32s	remaining: 33.4s
18:	learn: 0.3445028	total: 7.7s	remaining: 32.8s
19:	learn: 0.3444763	total: 8.08s	remaining

<catboost.core.CatBoostClassifier at 0x7f38aacf66d0>

In [17]:
# ROC AUC of the CatBoost model, computed from the positive-class probability
# (column 1 of predict_proba): training split first, then test.
cb_train_auc = roc_auc_score(y_train, catboost.predict_proba(X_train)[:, 1])
print(f"Качество на трейне: {cb_train_auc}")
cb_test_auc = roc_auc_score(y_test, catboost.predict_proba(X_test)[:, 1])
print(f"Качество на тесте: {cb_test_auc}")

Качество на трейне: 0.666342553710827


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Качество на тесте: 0.6482136997326485


In [18]:
# Write the fitted model to 'catboost_model'; the path is relative, so the file
# lands in the kernel's current working directory.
catboost.save_model('catboost_model')

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of feature importances, sorted descending.

    Args:
        importance: Sequence of importance scores, one per feature.
        names: Sequence of feature names, in the same order as ``importance``.
        model_type: Model label shown as the first part of the plot title.

    Returns:
        None. The chart is drawn on a newly created matplotlib figure.
    """
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })
    # Non-inplace sort: the ordered frame is a new object bound to the same name.
    fi_df = fi_df.sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10, 8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # The label and the fixed suffix used to be concatenated with no separator
    # ("CatboostFEATURE IMPORTANCE"). rstrip() keeps callers that already pass
    # a trailing space from getting a double space.
    plt.title(model_type.rstrip() + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

# Chart the CatBoost importances against the training feature names.
plot_feature_importance(
    importance=catboost.feature_importances_,
    names=X_train.columns,
    model_type='Catboost',
)