In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# Fix the global NumPy and TensorFlow seeds so repeated runs start from the same random state.
# NOTE(review): the later `df.sample(frac=1)` passes no random_state, so it presumably
# relies on the NumPy global seed set here — confirm.
np.random.seed(17)
tf.random.set_seed(17)

In [2]:
# Load the dataset from a pickled DataFrame in the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle('df_pikabu_spam_posts.pd')

In [3]:
# Shuffle the rows (frac=1 keeps every row). The train/val/test split further down
# slices by position, so it depends on this shuffle having been applied.
# NOTE(review): rebinding `df` makes this cell non-idempotent — re-running it reshuffles again.
df = df.sample(frac=1)

In [4]:
# Build the vocabulary: every distinct token seen in any title or text.
words_set = set()

for row in df.itertuples():
    words_set.update(row.title)
    words_set.update(row.text)

print(f'All words: {len(words_set)}')

All words: 59073


In [5]:
# Count how many times each vocabulary token occurs across all titles and texts.
words_counter = dict.fromkeys(words_set, 0)

for row in df.itertuples():
    for word in row.title:
        words_counter[word] += 1
    for word in row.text:
        words_counter[word] += 1

# Rank tokens by descending frequency. Ties are broken alphabetically: the counter's
# key order comes from iterating a set of strings, which changes between interpreter
# sessions (hash randomisation), so sorting on the count alone would make the ranking
# -- and therefore any top-N vocabulary cut -- differ from run to run.
# Assumes tokens are strings (as in the displayed output) so they are mutually comparable.
word_list = sorted(words_counter.items(), key=lambda item: (-item[1], item[0]))

In [6]:
# Show the ten most frequent (token, count) pairs.
word_list[:10]

[('эт', 11379),
 ('котор', 6957),
 ('сво', 4743),
 ('год', 4251),
 ('так', 4122),
 ('сам', 3757),
 ('одн', 3251),
 ('работ', 3207),
 ('очен', 3167),
 ('прост', 3116)]

In [8]:
# Keep only the 5000 highest-ranked entries and drop the counts, leaving bare tokens.
word_list = [entry[0] for entry in word_list[:5000]]

# Map each kept token to its column index in the feature vectors.
words_ohe_position = {token: column for column, token in enumerate(word_list)}

In [9]:
def _count_tokens(tokens, position, size):
    """Return a list of length `size` with per-column occurrence counts.

    Each token found in `position` increments the column it maps to; tokens that
    are not in `position` (out-of-vocabulary) are skipped.
    """
    counts = [0] * size
    for token in tokens:
        column = position.get(token)
        # Explicit membership check instead of a bare `except:`, so that only
        # out-of-vocabulary tokens are ignored and real errors still surface.
        if column is not None:
            counts[column] += 1
    return counts


# Despite the "ohe" naming these are count vectors: a token that appears several
# times in a post increments its column several times.
titles = []
texts = []

for row in df.itertuples():
    titles.append(_count_tokens(row.title, words_ohe_position, len(word_list)))
    texts.append(_count_tokens(row.text, words_ohe_position, len(word_list)))

titles = np.array(titles)
texts = np.array(texts)

In [10]:
# Sanity check: both feature matrices should have one row per post and one column per kept token.
titles.shape,  texts.shape

((7443, 5000), (7443, 5000))

In [11]:
# Target vector taken from the 'bad' column, in the same (shuffled) row order as the features.
# NOTE(review): presumably a binary flag marking spam/bad posts — confirm against the dataset.
y = np.array(df['bad'])

In [13]:
def train_val_test_split(x, val_frac=0.15, test_frac=0.15):
    """Split a sequence into contiguous train / validation / test parts by position.

    The first ``1 - val_frac - test_frac`` share of ``x`` becomes the training part,
    the next ``val_frac`` share the validation part, and the remainder the test part.
    No shuffling is done here; the caller must shuffle beforehand if needed.

    Args:
        x: Any object supporting ``len()`` and slicing (list, NumPy array, ...).
        val_frac: Fraction of the items assigned to the validation part.
        test_frac: Fraction of the items assigned to the test part.

    Returns:
        A tuple ``(x_train, x_val, x_test)`` of slices of ``x``.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    # Without this check, out-of-range fractions produce negative slice bounds and
    # silently return overlapping or wrongly sized parts.
    if val_frac < 0 or test_frac < 0 or val_frac + test_frac > 1:
        raise ValueError(
            'val_frac and test_frac must be non-negative and sum to at most 1, '
            f'got val_frac={val_frac}, test_frac={test_frac}'
        )

    n_items = len(x)
    # Compute each boundary once so adjacent parts share exactly the same index.
    train_end = round((1 - val_frac - test_frac) * n_items)
    val_end = round((1 - test_frac) * n_items)

    x_train = x[:train_end]
    x_val = x[train_end:val_end]
    x_test = x[val_end:]
    return x_train, x_val, x_test

# Apply the same positional split to both feature matrices and the labels
# so that row i of every part refers to the same post.
(
    (titles_train, titles_val, titles_test),
    (texts_train, texts_val, texts_test),
    (y_train, y_val, y_test),
) = (train_val_test_split(part) for part in (titles, texts, y))

In [17]:
# Model 1
# Two-branch functional model: a text branch and a title branch are processed
# separately, concatenated, and passed through a shared head ending in one sigmoid unit.
# NOTE(review): the input width 5000 is hard-coded and has to match the vocabulary
# cut-off used when building the feature matrices (`word_list[:5000]`).

# --- Text branch: two Dense(500, relu) layers, each followed by batch normalisation.
text_input = tf.keras.layers.Input(shape=(5000, ))

text_dense_1 = tf.keras.layers.Dense(500, activation='relu', use_bias=True)(text_input)

text_bn_1 = tf.keras.layers.BatchNormalization(center=True, scale=True)(text_dense_1)

text_dense_2 = tf.keras.layers.Dense(500, activation='relu', use_bias=True)(text_bn_1)

text_bn_2 = tf.keras.layers.BatchNormalization(center=True, scale=True)(text_dense_2)

# --- Title branch: a single Dense(500, relu) layer followed by batch normalisation.
title_input = tf.keras.layers.Input(shape=(5000, ))

title_dense_1 = tf.keras.layers.Dense(500, activation='relu', use_bias=True)(title_input)

title_bn_1 = tf.keras.layers.BatchNormalization(center=True, scale=True)(title_dense_1)

# --- Shared head: join both branches (title features first, then text features).
concat = tf.keras.layers.Concatenate()([title_bn_1, text_bn_2])

main_dense_1 = tf.keras.layers.Dense(300, activation='relu', use_bias=True)(concat)

main_bn_1 = tf.keras.layers.BatchNormalization(center=True, scale=True)(main_dense_1) 

# NOTE(review): a dropout rate of 0.8 is unusually high — confirm it is intentional.
drop_main_1 = tf.keras.layers.Dropout(0.8)(main_bn_1)

main_dense_2 = tf.keras.layers.Dense(100, activation='relu', use_bias=True)(drop_main_1)

main_bn_2 = tf.keras.layers.BatchNormalization(center=True, scale=True)(main_dense_2)

drop_main_2 = tf.keras.layers.Dropout(0.8)(main_bn_2)

# Single sigmoid output unit for the binary target.
output = tf.keras.layers.Dense(1, activation='sigmoid')(drop_main_2)


# Input order is [title, text]; data passed to fit/predict must follow the same order.
model = tf.keras.Model(inputs=[title_input, text_input], outputs=output)

In [18]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 5000)]       0           []                               
                                                                                                  
 dense_6 (Dense)                (None, 500)          2500500     ['input_3[0][0]']                
                                                                                                  
 input_4 (InputLayer)           [(None, 5000)]       0           []                               
                                                                                                  
 batch_normalization_5 (BatchNo  (None, 500)         2000        ['dense_6[0][0]']                
 rmalization)                                                                               

In [19]:
# TensorBoard logging callback; event files are written under logs/second/.
tb_callback = tf.keras.callbacks.TensorBoard(histogram_freq=1, log_dir='logs/second/')

# Binary classification setup: default Adam, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=[tf.keras.metrics.binary_accuracy],
)

# Inputs are passed as [titles, texts] to match the order of the model's inputs.
model.fit(
    x=[titles_train, texts_train],
    y=y_train,
    epochs=10,
    batch_size=256,
    validation_data=([titles_val, texts_val], y_val),
    callbacks=[tb_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a442211c00>