In [21]:
import math
import os

# TF_CPP_MIN_LOG_LEVEL is placed in the environment BEFORE TensorFlow is imported,
# so the value is already there when the library loads; setting it after the import
# (as this cell did before) may be too late to affect TensorFlow's native log output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import tensorflow as tf
from scikeras.wrappers import KerasClassifier

import classification_utils

# Only report Python-side TensorFlow log messages at ERROR level.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [2]:
# Read the users dataset CSV into a DataFrame.
DATASET_PATH = "./dataset/users_df_dataset_cleaned_with_indicators.csv"
df = pd.read_csv(DATASET_PATH)

We use only the numerical attributes as features; the `bot` column is kept as well because it is the classification target.

In [3]:
# Label column the classifier has to predict.
TARGET_COLUMN = "bot"

# Columns used as model inputs.
# Left out on purpose: user_id, name, lang, created_at, day_with_most_tweets.
FEATURE_COLUMNS = [
    "statuses_count",
    "account_age_in_days",
    "number_of_tweets",
    "account_average_tweets_per_day",
    "avg_tweets_per_actual_day",
    "max_number_of_tweets_in_a_day",
    "entropy_for_day",
    "entropy_for_hour",
    "entropy_for_minute",
    "avg_hashtags",
    "avg_text_length",
    "avg_mentions",
    "avg_special_char_in_text",
    "total_likes",
    "avg_favorite_count",
    "total_replies",
    "avg_reply_count",
    "total_retweet_count",
    "account_discussion_creation_ratio",
    "tweet_num_likes_ratio",
    "tweet_num_replies_ratio",
    "entropy_original_text",
    "entropy_text",
    "mean_inactive_period_length_in_seconds",
    "median_inactive_period_length_in_seconds",
    "mode_inactive_period_length_in_seconds",
    "mode_count",
]

# Keep the target first, followed by the features, in the order listed above.
df = df[[TARGET_COLUMN, *FEATURE_COLUMNS]]

In the next cell we shuffle the rows so that the positional split performed later is not biased by the original ordering (for example, if all the bots were at the bottom of the file, they would end up only in the test set). Then we separate the `bot` column from the other ones and convert both parts to NumPy arrays.

In [4]:
# Shuffle every row with a fixed seed so the positional split below is repeatable.
df = df.sample(frac=1, random_state=1)

# pop() removes the label column from df, so df holds only the features afterwards.
target_array = df.pop("bot").to_numpy()
feature_matrix = df.to_numpy()


In [5]:
# Preview the shuffled frame; the "bot" column is no longer present because it was popped above.
df.head()

Unnamed: 0,statuses_count,account_age_in_days,number_of_tweets,account_average_tweets_per_day,avg_tweets_per_actual_day,max_number_of_tweets_in_a_day,entropy_for_day,entropy_for_hour,entropy_for_minute,avg_hashtags,...,total_retweet_count,account_discussion_creation_ratio,tweet_num_likes_ratio,tweet_num_replies_ratio,entropy_original_text,entropy_text,mean_inactive_period_length_in_seconds,median_inactive_period_length_in_seconds,mode_inactive_period_length_in_seconds,mode_count
2987,68,2163,2074,0.031438,10.690722,43,4.577648,2.202349,0.822681,0.248795,...,1334112,0.001555,3.405583,0.0,10.853126,2.747561e-07,53500.641273,845.5,0.0,153
7999,68,1005,3457,0.067662,86.425,224,5.103056,4.15818,1.425546,0.014753,...,377448,0.009159,1.328593,0.0,11.556204,2.491898e-07,3151.758461,56.0,0.0,282
8447,1785,2521,1238,0.708052,3.134177,21,2.742274,1.435677,0.466968,0.371567,...,133842,0.00925,77.375,0.0,10.126784,2.269985e-07,114544.543619,5181.0,0.0,91
7109,68,3078,2254,0.022092,7.772414,59,4.183229,2.123818,0.763846,0.526619,...,167190,0.013482,2.782716,0.0,10.998077,2.118565e-07,84270.544809,919.5,0.0,171
2519,68,2017,3010,0.033713,25.083333,86,5.3599,3.144455,0.907828,0.018272,...,553115,0.005442,1.718037,0.0,11.390271,3.56386e-07,32652.958472,197.0,0.0,227


We split the data into training, validation and test sets.
The training data are the only ones used to fit the neural network; the expectation is that the classification error on this set will only decrease during training.

The validation data are used to iteratively evaluate the network and select the best architecture; if the classification error on the validation set starts to increase while the training error keeps decreasing, the network is overfitting.

The test set is used only after the choice of the final model, to get an estimate of the error of the network on completely new data.

In [6]:
# Fractions of the (already shuffled) rows assigned to each split.
tr_size = 0.7
vl_size = 0.2
ts_size = 0.1
assert math.isclose(tr_size + vl_size + ts_size, 1.0), "split fractions must sum to 1"

n_samples = len(feature_matrix)

# Positional boundaries: [0, tr_index) train, [tr_index, vl_index) validation,
# [vl_index, ts_index) test.
tr_index = round(n_samples * tr_size)
vl_index = tr_index + round(n_samples * vl_size)
# The test split takes everything that is left. Rounding the three fractions
# independently can make the sizes add up to n_samples - 1 (e.g. 12 rows -> 8 + 2 + 1),
# which would silently drop the last row.
ts_index = n_samples

feature_matrix_tr = feature_matrix[:tr_index]
target_array_tr = target_array[:tr_index]

feature_matrix_vl = feature_matrix[tr_index:vl_index]
target_array_vl = target_array[tr_index:vl_index]

feature_matrix_ts = feature_matrix[vl_index:ts_index]
target_array_ts = target_array[vl_index:ts_index]

# Every row must land in exactly one split.
assert len(feature_matrix_tr) + len(feature_matrix_vl) + len(feature_matrix_ts) == n_samples

In [7]:
# Shape of the training split: (number of training rows, number of feature columns).
feature_matrix_tr.shape

(7776, 27)

The choice of the neural network architecture is guided by theory: since we already have processed features (and not raw data), and there is no sign of a hierarchy of features to exploit with the inductive bias of a deep neural network (as in image classification), we choose to adopt a shallow model, with only two hidden layers.

https://www.deeplearningbook.org/

https://www.nature.com/articles/nature14539

In the input layer we normalize the data

#TODO improve with grid search

In [25]:
# Use scikit-learn to grid search the learning rate, momentum, activation, layer width, epochs and batch size
import numpy as np
from sklearn.model_selection import GridSearchCV


def create_model(epochs=100, batch_size=4096, activation="relu", num_neurons=32, learning_rate=0.1, momentum=0.1):
    """Build and compile the shallow binary classifier used by the grid search.

    The network is: input normalization -> two Dense hidden layers of equal
    width -> one sigmoid output unit. It is compiled with binary cross-entropy
    and an SGD optimizer.

    Args:
        epochs: Not used inside this function.
        batch_size: Not used inside this function.
        activation: Activation of the two hidden layers.
        num_neurons: Number of units in each hidden layer.
        learning_rate: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        The compiled ``tf.keras`` Sequential model.

    NOTE(review): the normalization layer is adapted on the whole
    ``feature_matrix_tr`` (a notebook-level variable), so during cross-validation
    its statistics also include the rows of the held-out fold.
    """
    normalizer = tf.keras.layers.Normalization(axis=-1)
    normalizer.adapt(feature_matrix_tr)

    network = tf.keras.models.Sequential([
        normalizer,
        tf.keras.layers.Dense(num_neurons, activation=activation),
        tf.keras.layers.Dense(num_neurons, activation=activation),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])

    sgd = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=momentum)
    network.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=sgd)
    return network

# Wrap the Keras builder so scikit-learn can drive it.
# - random_state fixes the seed used for each fit, so the search can be reproduced.
# - verbose=0 suppresses the per-epoch Keras logs, which would otherwise be printed
#   for every one of the many fits launched by the grid search.
model = KerasClassifier(model=create_model, random_state=1, verbose=0)

# Candidate values for each hyper-parameter.
learning_rate = [0.01, 0.1, 0.2]
momentum = [0.2, 0.4, 0.6]
epochs = [400, 500, 600]
batch_size = [2048, 4096]
activation = ["sigmoid", "relu"]
num_neurons = [32, 64]

param_grid = dict(
    # with the keyword "model__" the parameter is used in the "create_model" function
    model__learning_rate=learning_rate,
    model__momentum=momentum,
    model__activation=activation,
    model__num_neurons=num_neurons,
    # without the keyword the parameter is used in the "fit" function
    epochs=epochs,
    batch_size=batch_size,
)

# Exhaustive search over the grid with 3-fold cross-validation, run on 10 parallel workers.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=10, cv=3)
grid_result = grid.fit(feature_matrix_tr, target_array_tr)

# Summarize results: best configuration first, then every configuration tried.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

Epoch 1/400
Epoch 1/400
Epoch 1/400


In [24]:
# Keep the estimator refitted with the best configuration found by the search,
# then show that configuration and its mean cross-validated score.
model = grid.best_estimator_
print(grid.best_params_, grid.best_score_, sep="\n")

{'batch_size': 4096, 'epochs': 400, 'model__activation': 'sigmoid', 'model__learning_rate': 0.1, 'model__momentum': 0.4, 'model__num_neurons': 64}
0.8299897119341564


In [None]:
# Input normalization, with statistics computed on the training split only.
in_layer = tf.keras.layers.Normalization(axis=-1)
in_layer.adapt(feature_matrix_tr)

# Shallow network: two sigmoid hidden layers of 64 units and one sigmoid output unit.
model = tf.keras.models.Sequential([
  in_layer,
  tf.keras.layers.Dense(64, activation='sigmoid'),
  tf.keras.layers.Dense(64, activation='sigmoid'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
  optimizer="adam",
  # The output layer already applies a sigmoid, so the model emits probabilities.
  # from_logits must therefore be False; with True the loss would apply a second
  # sigmoid on top of the network output.
  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
  metrics=["accuracy"]
)

history = model.fit(
  feature_matrix_tr,
  target_array_tr,
  batch_size=2048,
  epochs=600,
  validation_data=(feature_matrix_vl, target_array_vl),
  shuffle=True,
  verbose=0,
  # Slightly larger weight on class 0 than on class 1.
  class_weight={
    0 : 1.1,
    1 : 1
  }
)

# Plot/print the training curves using the project helper.
classification_utils.print_training_stats(history=history)

In [None]:
from sklearn import metrics

# Turn the sigmoid outputs into class decisions with the same rule used for the
# test set (strictly greater than 0.5), and flatten the single-output column so the
# predictions have the same 1-D layout as the target arrays.
train_predict = (model.predict(feature_matrix_tr) > 0.5).ravel()
validation_predict = (model.predict(feature_matrix_vl) > 0.5).ravel()

print('Accuracy train set ', metrics.accuracy_score(target_array_tr, train_predict))
print('Accuracy validation set ', metrics.accuracy_score(target_array_vl, validation_predict))

As you can see, we are able to achieve a really good accuracy (near 85%) without overfitting.

As a counterexample, in the following picture we want to demonstrate that a larger network is able to achieve near 100% accuracy on the training set, but its generalization capability is really low because of overfitting; in fact, the classification error on the validation set begins to increase.

We moved the code to another file for clarity. The network has 5 hidden layers with 512 neurons each.

In [None]:
# Run the oversized-network counter example from the helper module on the
# training and validation splits (arguments are passed positionally, in this order).
overfit_demo_inputs = (
    feature_matrix_tr,
    target_array_tr,
    feature_matrix_vl,
    target_array_vl,
)
classification_utils.large_model_example(*overfit_demo_inputs)

Finally, we evaluate the model on completely unseen data. This is like predicting whether a new user is a bot or not, i.e. the final objective of the task.

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Class decisions on the held-out test split: True where the predicted
# probability is strictly above 0.5.
test_probabilities = model.predict(feature_matrix_ts)
test_predict = (test_probabilities > 0.5).astype(bool)

# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(target_array_ts, test_predict)
disp = ConfusionMatrixDisplay(cm)
disp.plot()

The model seems very good at identifying true bots, while it has a slight bias towards classifying real users as bots too (false positives). This can be a consequence of the class imbalance of the dataset (5000 humans, 6000 bots), but this is already partially corrected by the class weights used while fitting the model. A greater weight toward the "human" class proved to be worse during the validation phase.

In [None]:
from sklearn import metrics

# Metric label -> scoring function, reported in this order on the test split.
test_scorers = (
    ("Accuracy", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("F1 score", metrics.f1_score),
)
for label, scorer in test_scorers:
    print(f"{label} test set ", scorer(target_array_ts, test_predict))

## Exporting model

In [None]:
# Persist the trained model under the project's models/ directory.
saved_filepath = "models/nn_model"
model.save(filepath=saved_filepath)