In [3]:
import re
import string
import os
import gc
import pandas as pd
import numpy as np
from datetime import datetime as dt

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

from config.config import PG_USER, PG_PASS, PG_HOST, PG_PORT, PG_DATABASE

# Widen pandas' display limits so wide feature tables and long text cells
# are shown in full when a cell renders a DataFrame.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 100

In [4]:
# Connection URI assembled from the credentials imported from config.config.
URL = "postgresql://{}:{}@{}:{}/{}".format(
    PG_USER, PG_PASS, PG_HOST, PG_PORT, PG_DATABASE
)


def select(query: str, conn: str = URL) -> pd.DataFrame:
    """Run a SQL query and return its result set as a DataFrame.

    Args:
        query: SQL text handed to ``pd.read_sql``.
        conn: connection passed to ``pd.read_sql``; defaults to the
            module-level ``URL``.

    Returns:
        Whatever ``pd.read_sql`` returns for the query.
    """
    result = pd.read_sql(query, conn)
    return result
    
# Source queries. The feed query keeps only 'view' rows and caps the result
# at 5,000,000 rows.
# NOTE(review): the LIMIT has no ORDER BY, so which rows come back is up to
# the database - confirm whether a reproducible sample is required.
users_query = "SELECT * FROM public.user_data"
posts_query = "SELECT * FROM public.post_text_df"
feeds_query = "SELECT * FROM public.feed_data WHERE action = 'view' LIMIT 5000000"


users_df = select(users_query)
posts_df = select(posts_query)
feeds_df = select(feeds_query)


# Cache the raw tables locally so later cells can reload them without the
# database. Create the target directory first: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs("tables", exist_ok=True)
users_df.to_csv("tables/users_df.csv", sep=",")
posts_df.to_csv("tables/posts_df.csv", sep=",")
feeds_df.to_csv("tables/feeds_df.csv", sep=",")

In [22]:
# Reload the three cached tables; the first CSV column holds the index that
# to_csv wrote out, hence index_col=0.
users_df, posts_df, feeds_df = (
    pd.read_csv(csv_path, sep=",", index_col=0)
    for csv_path in (
        "tables/users_df.csv",
        "tables/posts_df.csv",
        "tables/feeds_df.csv",
    )
)

In [23]:
# Sanity check: (rows, columns) of each table reloaded from the CSV cache.
users_df.shape, posts_df.shape, feeds_df.shape

((163205, 8), (7023, 3), (5000000, 5))

In [24]:
# Preview the first two posts to see the available columns.
posts_df.head(2)

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufacturing sector will continue to face serious chall...,business
1,2,Aids and climate top Davos agenda\n\nClimate change and the fight against Aids are leading the l...,business


In [8]:
# Fetch the NLTK data packages used below (lemmatizer dictionary, tokenizer
# model, stop-word lists), in the same order as before.
# NOTE(review): newer NLTK releases may look up tokenizer data under
# 'punkt_tab' instead of 'punkt' - confirm against the installed version.
for _package in ('wordnet', 'punkt', 'stopwords'):
    nltk.download(_package)


# ASCII punctuation characters; string.punctuation is exactly the 32-character
# string that used to be spelled out by hand here.
punctuation = string.punctuation
# English stop words as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocessing(text):
    """Normalize one document into a space-separated string of lemmas.

    The text is lower-cased and split with ``word_tokenize``. A token is
    discarded when it consists only of punctuation characters, is an English
    stop word, or contains a digit. Surviving tokens are lemmatized with the
    WordNet lemmatizer and joined with single spaces.

    Args:
        text: raw document text (a ``str``).

    Returns:
        The cleaned document as one string.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # `punctuation` is a single string, so `token in punctuation` would be
        # a substring test and let tokens such as "..." or "``" through.
        # Check character by character instead.
        if all(char in punctuation for char in token):
            continue
        if token in stop_words:
            continue
        if re.search(r'\d', token):
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)


# Fit TF-IDF on the post texts; `preprocessing` is applied to every document
# by the vectorizer.
vectorizer = TfidfVectorizer(preprocessor=preprocessing)
# Dense copy of the document-term matrix (one row per post).
transformed_output = vectorizer.fit_transform(posts_df['text']).toarray()
tfidf_df = pd.DataFrame(
    transformed_output,
    index=posts_df.post_id,
    columns=vectorizer.get_feature_names_out())

# Per-post summary statistics of the TF-IDF weights. tfidf_df rows are in the
# same order as posts_df rows, so the values are assigned positionally. This
# avoids index alignment, which only worked while posts_df happened to have a
# default 0..n-1 index.
posts_df['TotalTfIdf'] = tfidf_df.sum(axis=1).to_numpy()
posts_df['MaxTfIdf'] = tfidf_df.max(axis=1).to_numpy()
posts_df['MeanTfIdf'] = tfidf_df.mean(axis=1).to_numpy()


# Subtract each term's mean weight before the decomposition. `centered` stays
# a named variable because a later cleanup cell deletes it by name.
centered = tfidf_df - tfidf_df.mean(axis=0)
# Reduce the TF-IDF space to 30 components. random_state pins the result when
# scikit-learn selects a randomized SVD solver, so the cluster features are
# reproducible between runs.
pca = PCA(n_components=30, random_state=0)
pca_decomp = pca.fit_transform(centered)


# Cluster posts in the reduced space and store each post's cluster id.
kmeans = KMeans(n_clusters=20, random_state=0).fit(pca_decomp)
posts_df['TextCluster'] = kmeans.labels_
# Column names for the distances to each of the 20 cluster centres:
# 'DistanceTo1thCluster' ... 'DistanceTo20thCluster' (1-based, fixed "th").
dists_columns = [
    f'DistanceTo{cluster_number}thCluster'
    for cluster_number in range(1, 21)
]
# Distance from every post to every cluster centre in the PCA space.
# The frame reuses posts_df's index so that the later column-wise concat with
# posts_df lines rows up by label, whatever index posts_df carries.
kmeans_df = pd.DataFrame(
    data=kmeans.transform(pca_decomp),
    index=posts_df.index,
    columns=dists_columns
)
kmeans_df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\radio\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\radio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\radio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster,DistanceTo16thCluster,DistanceTo17thCluster,DistanceTo18thCluster,DistanceTo19thCluster,DistanceTo20thCluster
0,0.499123,0.572318,0.439299,0.477111,0.468081,0.581346,0.493616,0.572465,0.525216,0.487099,0.52695,0.453816,0.556518,0.627826,0.553593,0.401064,0.477897,0.144484,0.474009,0.447141
1,0.384693,0.354698,0.239457,0.357875,0.349026,0.495703,0.377247,0.484945,0.41792,0.357443,0.437131,0.321102,0.464352,0.563421,0.428841,0.258446,0.350378,0.357392,0.379693,0.31888
2,0.430013,0.545488,0.378393,0.389374,0.386335,0.517714,0.413296,0.526245,0.448865,0.420366,0.469637,0.378794,0.499178,0.586233,0.46909,0.224614,0.419217,0.273811,0.404879,0.358095
3,0.416464,0.53161,0.356834,0.391913,0.368782,0.471079,0.410321,0.522755,0.445604,0.410844,0.48443,0.369819,0.475704,0.576737,0.465193,0.236594,0.415661,0.387788,0.39987,0.353357
4,0.278447,0.435224,0.211019,0.228737,0.202481,0.404987,0.265577,0.41942,0.313946,0.255501,0.361903,0.224552,0.364431,0.515798,0.371155,0.116215,0.289131,0.336901,0.245687,0.154328


In [9]:
# Append the cluster-distance columns to the post features and discard the raw
# text, which is not needed once the text features exist.
posts_with_distances = pd.concat([posts_df, kmeans_df], axis=1)
posts_df = posts_with_distances.drop(columns='text')
del posts_with_distances
posts_df.head(2)

Unnamed: 0,post_id,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster,DistanceTo16thCluster,DistanceTo17thCluster,DistanceTo18thCluster,DistanceTo19thCluster,DistanceTo20thCluster
0,1,business,9.015487,0.489283,0.000208,17,0.499123,0.572318,0.439299,0.477111,0.468081,0.581346,0.493616,0.572465,0.525216,0.487099,0.52695,0.453816,0.556518,0.627826,0.553593,0.401064,0.477897,0.144484,0.474009,0.447141
1,2,business,12.08276,0.3137,0.000279,2,0.384693,0.354698,0.239457,0.357875,0.349026,0.495703,0.377247,0.484945,0.41792,0.357443,0.437131,0.321102,0.464352,0.563421,0.428841,0.258446,0.350378,0.357392,0.379693,0.31888


In [10]:
# Publish the engineered post features to the database, replacing any table
# of the same name. The `index` argument is left at its pandas default.
posts_df.to_sql(
    name="posts_info_by_radion_nazmiev",
    con=URL,
    schema="public",
    if_exists="replace",
)

23

In [11]:
# Drop everything the text-feature stage no longer needs so the large dense
# matrices can be reclaimed. Names are removed in the same order as before.
del users_query, posts_query, feeds_query
del nltk, punctuation, stop_words, lemmatizer
del vectorizer, transformed_output, tfidf_df
del centered, pca, pca_decomp
del kmeans, dists_columns, kmeans_df
del PCA, KMeans

# Force a collection pass; the returned count is displayed as the cell output.
gc.collect()

1786

In [12]:
# Cache the engineered post features on disk; the next cell reloads this file.
posts_df.to_csv("tables/posts_processed_df.csv", sep=",")

In [25]:
# Reload the cached post features; the first CSV column is the saved index.
posts_processed_df = pd.read_csv("tables/posts_processed_df.csv", sep=",", index_col=0)
# Preview two rows to confirm the columns survived the round trip.
posts_processed_df.head(2)

Unnamed: 0,post_id,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster,DistanceTo16thCluster,DistanceTo17thCluster,DistanceTo18thCluster,DistanceTo19thCluster,DistanceTo20thCluster
0,1,business,9.015487,0.489283,0.000208,17,0.499123,0.572318,0.439299,0.477111,0.468081,0.581346,0.493616,0.572465,0.525216,0.487099,0.52695,0.453816,0.556518,0.627826,0.553593,0.401064,0.477897,0.144484,0.474009,0.447141
1,2,business,12.08276,0.3137,0.000279,2,0.384693,0.354698,0.239457,0.357875,0.349026,0.495703,0.377247,0.484945,0.41792,0.357443,0.437131,0.321102,0.464352,0.563421,0.428841,0.258446,0.350378,0.357392,0.379693,0.31888


In [14]:
# Build the modelling table: each feed event is enriched with the features of
# its post, then with the attributes of its user. Both joins are inner joins,
# and users_df stays on the left so its columns come first.
df = users_df.merge(
    feeds_df.merge(posts_processed_df, on='post_id', how='inner'),
    on='user_id',
    how='inner',
)

df.head(2)

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,timestamp,post_id,action,target,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster,DistanceTo16thCluster,DistanceTo17thCluster,DistanceTo18thCluster,DistanceTo19thCluster,DistanceTo20thCluster
0,1174,1,20,Russia,Moscow,1,iOS,ads,2021-11-12 14:56:47,3094,view,0,covid,3.23245,0.496057,7.5e-05,4,0.284703,0.468592,0.266475,0.239078,0.065994,0.356956,0.274291,0.430965,0.321743,0.314387,0.376797,0.286842,0.310627,0.539451,0.406603,0.217205,0.294086,0.378565,0.271402,0.165141
1,1174,1,20,Russia,Moscow,1,iOS,ads,2021-11-22 07:37:43,6666,view,0,movie,7.881283,0.183492,0.000182,19,0.248211,0.464223,0.267716,0.104272,0.201745,0.418985,0.16754,0.417547,0.185012,0.2922,0.372226,0.276859,0.369206,0.518546,0.382527,0.241454,0.288909,0.380252,0.267934,0.097485


In [15]:
# (rows, columns) of the merged modelling table.
df.shape

(5000000, 37)

In [16]:
# The source tables are fully merged into df; release them before the index
# rebuild below.
del feeds_df, posts_df, users_df

# Parse the event time once, then derive hour-of-day and month from it.
df['timestamp'] = pd.to_datetime(df['timestamp'])
event_time = df['timestamp'].dt
df['hour'] = event_time.hour
df['month'] = event_time.month
del event_time

# Key every row by (user, post) so the ids are not used as model features.
df = df.set_index(keys=['user_id', 'post_id'])

df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,gender,age,country,city,exp_group,os,source,timestamp,action,target,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster,DistanceTo16thCluster,DistanceTo17thCluster,DistanceTo18thCluster,DistanceTo19thCluster,DistanceTo20thCluster,hour,month
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
1174,3094,1,20,Russia,Moscow,1,iOS,ads,2021-11-12 14:56:47,view,0,covid,3.23245,0.496057,7.5e-05,4,0.284703,0.468592,0.266475,0.239078,0.065994,0.356956,0.274291,0.430965,0.321743,0.314387,0.376797,0.286842,0.310627,0.539451,0.406603,0.217205,0.294086,0.378565,0.271402,0.165141,14,11
1174,6666,1,20,Russia,Moscow,1,iOS,ads,2021-11-22 07:37:43,view,0,movie,7.881283,0.183492,0.000182,19,0.248211,0.464223,0.267716,0.104272,0.201745,0.418985,0.16754,0.417547,0.185012,0.2922,0.372226,0.276859,0.369206,0.518546,0.382527,0.241454,0.288909,0.380252,0.267934,0.097485,7,11


In [17]:
# Time span covered by the events, shown as (latest, earliest). The Series
# methods are vectorized and skip missing timestamps, whereas the builtin
# max()/min() walk all rows one Python object at a time.
df.timestamp.max(), df.timestamp.min()

(Timestamp('2021-12-29 23:43:15'), Timestamp('2021-10-01 06:01:40'))

In [18]:
# Time-based hold-out: events before 2021-12-15 are used for training, events
# on or after that date for testing.
cutoff = '2021-12-15'
train_rows = df.timestamp < cutoff
test_rows = df.timestamp >= cutoff

# 'timestamp' and 'action' are not model inputs and 'target' is the label, so
# none of them belongs in the feature matrices.
features = df.drop(['timestamp', 'action', 'target'], axis=1)
X_train = features[train_rows]
X_test = features[test_rows]

y_train = df.loc[train_rows, 'target']
y_test = df.loc[test_rows, 'target']

# Release the merged frame and the temporaries; only the four splits remain.
del df, features, train_rows, test_rows, cutoff

y_train.shape, y_test.shape

((4181420,), (818580,))

In [19]:
# Columns treated as categorical. This list is also handed to CatBoost later
# in the notebook.
object_cols = [
    'topic', 'TextCluster', 'gender', 'country',
    'city', 'exp_group', 'hour', 'month',
    'os', 'source'
]

# Count distinct values once per column, on the training split only.
n_unique = {col: X_train[col].nunique() for col in object_cols}

# Fewer than 5 distinct values -> one-hot encoding; otherwise mean target
# encoding.
cols_for_ohe = [col for col in object_cols if n_unique[col] < 5]
cols_for_mte = [col for col in object_cols if n_unique[col] >= 5]


# ColumnTransformer receives positional column indices; build the name list
# once instead of once per lookup.
train_columns = list(X_train.columns)
cols_for_ohe_idx = [train_columns.index(col) for col in cols_for_ohe]
cols_for_mte_idx = [train_columns.index(col) for col in cols_for_mte]

t = [
    ('OneHotEncoder', OneHotEncoder(), cols_for_ohe_idx),
    ('MeanTargetEncoder', TargetEncoder(), cols_for_mte_idx)
]

# NOTE(review): `remainder` is left at its default, which per the
# scikit-learn documentation drops every column not listed above, so the tree
# would only see the encoded categorical columns - confirm this is intended.
# NOTE(review): the encoders are created without `cols`; check in the
# category_encoders documentation how integer-coded columns such as 'hour' or
# 'TextCluster' are treated in that case.
col_transform = ColumnTransformer(transformers=t)


# Baseline model. random_state makes the fitted tree, and therefore the
# reported metrics, reproducible between runs.
pipe_dt = Pipeline([("column_transformer",
                     col_transform),

                    ("decision_tree",
                     DecisionTreeClassifier(random_state=0))])

pipe_dt.fit(X_train, y_train)



In [20]:
# ROC AUC of the baseline on both splits, using the predicted probability of
# class 1. The printed labels read "Quality on train" / "Quality on test".
train_auc = roc_auc_score(y_train, pipe_dt.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, pipe_dt.predict_proba(X_test)[:, 1])
print(f"Качество на трейне: {train_auc}")
print(f"Качество на тесте: {test_auc}")

Качество на трейне: 0.9287476161934307
Качество на тесте: 0.5337510216085503


In [28]:
# Gradient-boosted model: 100 trees of depth 2, learning rate 1, fixed seed.
catboost = CatBoostClassifier(
    iterations=100,
    learning_rate=1,
    depth=2,
    random_seed=100,
)

# object_cols is passed as the categorical feature list, now by keyword
# instead of by position.
catboost.fit(
    X_train,
    y_train,
    cat_features=object_cols,
    logging_level='Verbose',
)

0:	learn: 0.3562603	total: 675ms	remaining: 1m 6s
1:	learn: 0.3493174	total: 1.18s	remaining: 57.8s
2:	learn: 0.3479849	total: 1.74s	remaining: 56.2s
3:	learn: 0.3472809	total: 2.21s	remaining: 53.2s
4:	learn: 0.3471388	total: 2.62s	remaining: 49.8s
5:	learn: 0.3470045	total: 3.14s	remaining: 49.2s
6:	learn: 0.3467825	total: 3.57s	remaining: 47.5s
7:	learn: 0.3467252	total: 3.98s	remaining: 45.8s
8:	learn: 0.3466824	total: 4.38s	remaining: 44.2s
9:	learn: 0.3465005	total: 4.77s	remaining: 42.9s
10:	learn: 0.3464035	total: 5.21s	remaining: 42.1s
11:	learn: 0.3457141	total: 5.63s	remaining: 41.3s
12:	learn: 0.3456858	total: 6.02s	remaining: 40.3s
13:	learn: 0.3453915	total: 6.42s	remaining: 39.5s
14:	learn: 0.3453535	total: 6.82s	remaining: 38.7s
15:	learn: 0.3452820	total: 7.26s	remaining: 38.1s
16:	learn: 0.3452147	total: 7.68s	remaining: 37.5s
17:	learn: 0.3451889	total: 8.1s	remaining: 36.9s
18:	learn: 0.3451322	total: 8.54s	remaining: 36.4s
19:	learn: 0.3449184	total: 8.96s	remainin

<catboost.core.CatBoostClassifier at 0x22781dd8b50>

In [29]:
# Persist the fitted model as 'catboost_model' in the "cbm" format.
catboost.save_model('catboost_model', format="cbm")

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importance(importance,names,model_type):
    """Plot feature importances as a bar chart, most important first.

    Args:
        importance: sequence of importance scores, one per feature.
        names: sequence of feature names, in the same order as ``importance``.
        model_type: model label, used as the prefix of the chart title.

    Returns:
        None. The chart is drawn on a new 10x8 matplotlib figure.
    """
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    # Sort into a new frame instead of mutating in place.
    fi_df = pd.DataFrame(
        {'feature_names': feature_names, 'feature_importance': feature_importance}
    ).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10,8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # Separate the model label from the caption; plain concatenation used to
    # render titles such as "CatboostFEATURE IMPORTANCE".
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

# Chart the fitted CatBoost model's importances against the training columns.
plot_feature_importance(
    importance=catboost.feature_importances_,
    names=X_train.columns,
    model_type='Catboost',
)