In [95]:
import features_process
import random_usernames
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score,roc_auc_score
import numpy as np
from sklearn.svm import SVC, LinearSVC
from pathlib import Path
import sys
import os
import pandas as pd

sys.path.append('/home1/ep298479/Documents/Projet_Tuteure/twibench/src')

from config import Config

In [96]:
# Get formatted data

formatted_datasets_path = Config().getFormattedDatasetsPath()
# get the nameclass directory in formatted_datasets
formatted_datasets_path = os.path.join(formatted_datasets_path, "nameclass")

# Print the name of all the files in the directory
print("All the files in the directory are: ")
for file in os.listdir(formatted_datasets_path):
    print(file)

# Load cresci-2015.csv in a dataframe. This file contains the screen names and the label
# The label is HUMAN or BOT, convert it to 0 or 1

csv_path = os.path.join(formatted_datasets_path, "cresci-2015.csv")

df = pd.read_csv(csv_path, header=None, names=['screen_name', 'label'])
df['label'] = df['label'].map({'HUMAN': 0, 'BOT': 1})

# Keep only humans, we are gonna add bots later
df = df[df['label'] == 0]



    

All the files in the directory are: 
cresci-2015.csv


In [97]:
# add as many bots as humans using generate_random_name() from random_usernames.py
# we need to add the bots to the dataframe

# Get the number of humans
nb_humans = df.shape[0]
list_bot_names = [random_usernames.generate_random_name(15) for i in range(nb_humans)]

# Create a dataframe with the bot names, with one column named 'screen_name' and one column named 'label' with the value 1
df_bot = pd.DataFrame(list_bot_names, columns=['screen_name'])
df_bot['label'] = 1

# merge the two dataframes
df = pd.concat([df, df_bot], ignore_index=True)

# Count the amount of bots, then humans
print("Number of bots: ", df[df['label'] == 1].shape[0])
print("Number of humans: ", df[df['label'] == 0].shape[0])


Number of bots:  1950
Number of humans:  1950


In [98]:
# On récupère la liste des screen_name pour appliquer tf-idf sur les noms
screen_names = df['screen_name'].tolist()

tfidf = features_process.tfidf(screen_names)
features = tfidf
print(len(features[0]))


taille de la liste tfidf :  3900
1357


In [99]:
import random

# Tirer 20 noms aléatoires de la liste screen_names et les afficher pour vérifier que ça a l'air aléatoire
print("20 noms aléatoires: ")
for i in range(20):
    print(screen_names[random.randint(0, len(screen_names))])

20 noms aléatoires: 
FofoCasual
l6pU1yASWhUlGh0
1jnTjArTHj18kJK
JpEzPs6slAu3zb5
giagar69
YfsS17OVTto0YKx
BorelliRoberta
2uQDRSzbce20dWt
Rossoweb
lachiaramartini
bettabianchini
veracinico
y6MzaJKR39bTTig
FranKsBigBand
TR3Ouy5GdOsNhF7
DuHSS6wHkEp7N2D
dariodusio
luxluciaaa
pietrodinoia
sonoauroraricci


In [100]:
for i in range(len(screen_names)):
    features[i].append(float(features_process.shannon_entropy(screen_names[i])))
    features[i].append(float(features_process.upper_count(screen_names[i])))
    features[i].append(float(features_process.lower_count(screen_names[i])))


3.906890595608518


In [101]:
# standardiser les données
for i in range(len(features[0])):
    mean = np.mean([features[j][i] for j in range(len(features))])
    std = np.std([features[j][i] for j in range(len(features))])
    for j in range(len(features)):
        features[j][i] = (features[j][i] - mean) / std

In [102]:
# Séparer les données en données d'entraînement et données de test en 80-20, de façon stratifiée.
# Le dataframe df contient les labels, on peut donc utiliser train_test_split de sklearn
# Les features sont dans features

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, df['label'], test_size=0.2, stratify=df['label'])



In [103]:
# afficher les shapes
print("X_train shape: ", np.shape(X_train))
print("X_test shape: ", np.shape(X_test))
print("y_train shape: ", np.shape(y_train))
print("y_test shape: ", np.shape(y_test))

X_train shape:  (3120, 1360)
X_test shape:  (780, 1360)
y_train shape:  (3120,)
y_test shape:  (780,)


In [104]:
# On va faire de la regression logistique avec solver = 'saga', max_iter = 2000, tol = 1e-7, C = 50, verbose = 2

classifier = LogisticRegression(solver='saga', max_iter=2000, tol=1e-7, C=50, verbose=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)


Epoch 1, change: 1.00000000
Epoch 2, change: 0.19428585
Epoch 3, change: 0.09683186
Epoch 4, change: 0.07210960
Epoch 5, change: 0.05612059
Epoch 6, change: 0.04573544
Epoch 7, change: 0.03828077
Epoch 8, change: 0.03400504
Epoch 9, change: 0.03042268
Epoch 10, change: 0.02618273
Epoch 11, change: 0.02380367
Epoch 12, change: 0.02174203
Epoch 13, change: 0.02013073
Epoch 14, change: 0.01832893
Epoch 15, change: 0.01709466
Epoch 16, change: 0.01595284
Epoch 17, change: 0.01499599
Epoch 18, change: 0.01397824
Epoch 19, change: 0.01305173
Epoch 20, change: 0.01236198
Epoch 21, change: 0.01174411
Epoch 22, change: 0.01105964
Epoch 23, change: 0.01058336
Epoch 24, change: 0.01011326
Epoch 25, change: 0.00965973
Epoch 26, change: 0.00914672
Epoch 27, change: 0.00878167
Epoch 28, change: 0.00843624
Epoch 29, change: 0.00813471
Epoch 30, change: 0.00782283
Epoch 31, change: 0.00748739
Epoch 32, change: 0.00724931
Epoch 33, change: 0.00702290
Epoch 34, change: 0.00676080
Epoch 35, change: 0.006



In [105]:
# afficher les métriques
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("ROC AUC: ", roc_auc_score(y_test, y_pred))


Accuracy:  0.9948717948717949
F1 score:  0.9948849104859335
Recall:  0.9974358974358974
Precision:  0.9923469387755102
ROC AUC:  0.9948717948717949


Ancienne métrique : 

Accuracy:  0.7728557964184731
F1 score:  0.8224023581429624
Recall:  0.8315946348733234
Precision:  0.8134110787172012
ROC AUC:  0.7516947533340976

In [109]:
# Montrer quelques exemples de prédictions avec les nom d'utilisateur
for i in range(200,210):
    print("Username: ", screen_names[i])
    print("True label: ", y_test.iloc[i])
    print("Predicted label: ", y_pred[i])
    print("\n")

Username:  Emiko84
True label:  0
Predicted label:  0


Username:  9lla_ae
True label:  1
Predicted label:  1


Username:  salcam83
True label:  1
Predicted label:  1


Username:  SaraCaulfield
True label:  1
Predicted label:  1


Username:  Brainwashed91
True label:  0
Predicted label:  1


Username:  AlbertoCipolla
True label:  1
Predicted label:  1


Username:  lalladelve
True label:  0
Predicted label:  0


Username:  Blizzard1001
True label:  1
Predicted label:  1


Username:  woodoow
True label:  0
Predicted label:  0


Username:  FedeSolo
True label:  0
Predicted label:  0


