# Detecção de Bots no Dataset do Telegram

Assumimos que bots apresentam comportamento coordenado com outros usuários ou bots. Utilizando hipóteses de comportamento humano, serão criados grafos com candidatos a bots que serão filtrados usando algoritmos de comunidade. As hipóteses definidas foram:

1. Pessoas dificilmente compartilham mensagens poucos segundos após elas serem enviadas.
2. Pessoas dificilmente compartilham mensagens, com frequência, sempre no mesmo horário.
3. Pessoas apresentam intervalos de tempo caóticos entre mensagens; bots tendem a enviar mensagens de forma mais sistemática.

In [1]:
# dependencies
import networkx as nx
from networkx.algorithms import community
import concurrent.futures
import os

import warnings
import os
# NOTE(review): moves the process working directory up one level. The path is
# relative, so re-running this cell moves it up one more level each time.
# Presumably needed so that the `util` package imported below resolves — confirm.
os.chdir("..")
from util.load_graph import load_graph_by_edge, get_driver
from util.bot_filtering import rank_bot_suspicion

# Suppress warnings of category FutureWarning.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Handle passed as the first argument to every load_graph_by_edge call below.
driver = get_driver()

## Hipótese 1: Usuários que compartilham mensagens rápido demais

![rapid share](../../images/rapid_share.png)

In [3]:
# Graph for hypothesis 1; presumably built from the 'RAPID_SHARE' edges — confirm
# against util.load_graph.
G_rapid = load_graph_by_edge(driver, 'RAPID_SHARE')

# Rank the graph's users by suspicion. The displayed frame carries user_id,
# bot_suspicion_score, centrality_rank, clique_score, volume_score and labels.
rapid_bots = rank_bot_suspicion(G_rapid) 
rapid_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
44,c743967449a387ad2c1c7e03b2c45b36,0.7086,0.1674,1.0000,1.0000,[]
43,1665e22b0f564cd46d343f7677014821,0.6011,0.1259,0.9709,0.7241,[]
45,b4ce87d44421ef3d2c4710051df5fe40,0.5034,0.0772,1.0000,0.4212,[]
21,1ac091b8ed5c4e42383f1b4ff4cc9b2d,0.3724,1.0000,0.0002,0.0743,[]
3,abe534d581ec6d552243d6955d3c3cd8,0.2142,0.5513,0.0000,0.0709,[]
...,...,...,...,...,...,...
175,761cf2c6592fbabbf0dae751e5c36a05,0.0082,0.0224,0.0000,0.0011,[]
218,942d61531313972a81171f1d87d80ac0,0.0080,0.0218,0.0000,0.0011,[]
459,e04f82940a5de7979fffadb350488924,0.0076,0.0207,0.0000,0.0011,[]
230,584b05446c0afec7298ccd5c0a11fa0a,0.0071,0.0193,0.0000,0.0011,[]


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the score columns.
rapid_bots.describe()

Unnamed: 0,bot_suspicion_score,centrality_rank,clique_score,volume_score
count,474.0,474.0,474.0,474.0
mean,0.032808,0.078912,0.007692,0.008271
std,0.051529,0.060814,0.078742,0.060019
min,0.0045,0.012,0.0,0.0011
25%,0.01855,0.049025,0.0,0.0011
50%,0.028,0.0789,0.0,0.0023
75%,0.0309,0.0789,0.0,0.0034
max,0.7086,1.0,1.0,1.0


In [5]:
# Keep only the users whose suspicion score lies strictly above the
# 80th percentile of all RAPID_SHARE scores.
rapid_scores = rapid_bots['bot_suspicion_score']
rapid_cutoff = rapid_scores.quantile(0.8)
filtered_rapid_bots = rapid_bots[rapid_scores > rapid_cutoff]
filtered_rapid_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
44,c743967449a387ad2c1c7e03b2c45b36,0.7086,0.1674,1.0000,1.0000,[]
43,1665e22b0f564cd46d343f7677014821,0.6011,0.1259,0.9709,0.7241,[]
45,b4ce87d44421ef3d2c4710051df5fe40,0.5034,0.0772,1.0000,0.4212,[]
21,1ac091b8ed5c4e42383f1b4ff4cc9b2d,0.3724,1.0000,0.0002,0.0743,[]
3,abe534d581ec6d552243d6955d3c3cd8,0.2142,0.5513,0.0000,0.0709,[]
...,...,...,...,...,...,...
30,50cc2b20a3c2c24ce68bb2c0554ddae1,0.0358,0.0874,0.0081,0.0079,[]
260,e3afed0047b08059d0fada10f400c1e5,0.0357,0.0947,0.0043,0.0034,[]
324,96c8dffb29c65cd1905239dc64196f03,0.0356,0.0958,0.0000,0.0068,[]
51,d49f81df0c75d1d72bee6c5b2d707da0,0.0355,0.0122,0.0874,0.0023,[]


## Hipótese 2: Usuários que compartilham mensagens no mesmo horário frequentemente

![hourly](../../images/hourly.png)

In [6]:
# Graph for hypothesis 2; presumably built from the 'HOURLY_SHARED' edges — confirm
# against util.load_graph.
G_hourly = load_graph_by_edge(driver, 'HOURLY_SHARED')

# Rank the graph's users by suspicion. The displayed frame carries user_id,
# bot_suspicion_score, centrality_rank, clique_score, volume_score and labels.
hourly_bots = rank_bot_suspicion(G_hourly) 
hourly_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
50,1ac091b8ed5c4e42383f1b4ff4cc9b2d,0.6516,1.0000,0.0045,1.0000,[]
151,3f64a652deccf6273d7d769252bdfb46,0.4224,0.5838,0.0146,0.7099,[]
129,c743967449a387ad2c1c7e03b2c45b36,0.4143,0.0442,1.0000,0.1626,[]
20,56b8359fd127312651b80b8ed8030085,0.2521,0.3702,0.0111,0.3956,[]
308,9c05a2bb3b6011d43ae7982ae41b468f,0.2444,0.0305,0.5980,0.0813,[]
...,...,...,...,...,...,...
354,d077070bc8d907b095cadc92f7f5142a,0.0044,0.0088,0.0000,0.0044,[]
173,9020c6528611102b62f2a07ceea563f1,0.0044,0.0089,0.0000,0.0044,[]
402,a806cc776751cdeae372f175de3cc7b8,0.0044,0.0089,0.0000,0.0044,[]
406,2768de406ef6e866239acd8540fa4a29,0.0044,0.0089,0.0000,0.0044,[]


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the score columns.
hourly_bots.describe()

Unnamed: 0,bot_suspicion_score,centrality_rank,clique_score,volume_score
count,483.0,483.0,483.0,483.0
mean,0.038574,0.03874,0.039798,0.036948
std,0.055365,0.066878,0.080979,0.076318
min,0.0044,0.0088,0.0,0.0044
25%,0.00705,0.0128,0.0,0.0044
50%,0.0262,0.0202,0.0092,0.011
75%,0.0449,0.0387,0.05635,0.033
max,0.6516,1.0,1.0,1.0


In [8]:
# Keep only the users whose suspicion score lies strictly above the
# 80th percentile of all HOURLY_SHARED scores.
hourly_scores = hourly_bots['bot_suspicion_score']
hourly_cutoff = hourly_scores.quantile(0.8)
filtered_hourly_bots = hourly_bots[hourly_scores > hourly_cutoff]
filtered_hourly_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
50,1ac091b8ed5c4e42383f1b4ff4cc9b2d,0.6516,1.0000,0.0045,1.0000,[]
151,3f64a652deccf6273d7d769252bdfb46,0.4224,0.5838,0.0146,0.7099,[]
129,c743967449a387ad2c1c7e03b2c45b36,0.4143,0.0442,1.0000,0.1626,[]
20,56b8359fd127312651b80b8ed8030085,0.2521,0.3702,0.0111,0.3956,[]
308,9c05a2bb3b6011d43ae7982ae41b468f,0.2444,0.0305,0.5980,0.0813,[]
...,...,...,...,...,...,...
251,a99a69fa769bfb03e2bf9aabe1187a77,0.0571,0.0196,0.1247,0.0220,[]
154,b92c54b51da92292919f5ad475b00116,0.0554,0.0201,0.1212,0.0198,[]
126,a2f306d00f22b310c2de89232f3b253e,0.0551,0.0816,0.0194,0.0659,[]
388,1967c4356e3314d52bcbf5af8ee95170,0.0547,0.0266,0.1071,0.0264,[]


## Hipótese 3: Usuários que compartilham mensagens com pouca variação no intervalo entre envios

![metronome](../../images/metronome.png)

In [9]:
# Graph for hypothesis 3; presumably built from the 'METRONOME_SIMILAR' edges —
# confirm against util.load_graph.
G_metronome = load_graph_by_edge(driver, 'METRONOME_SIMILAR')

# Rank the graph's users by suspicion. The displayed frame carries user_id,
# bot_suspicion_score, centrality_rank, clique_score, volume_score and labels.
metronome_bots = rank_bot_suspicion(G_metronome) 
metronome_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
20,a7bece6bf8c926b88caa8c0e130335ca,0.7593,0.9114,0.4709,0.9182,[]
44,3b262fc9f742d702f0ad2d861cb9874c,0.7592,0.9114,0.4708,0.9181,[]
40,42e08609134349429a28a5d9c0a766c8,0.7592,0.9114,0.4708,0.9181,[]
41,4b13a0eb93b709f1a8e605b6829450d0,0.7592,0.9114,0.4708,0.9181,[]
43,439baa00eef3d49b71e93c1df50149b7,0.7592,0.9114,0.4708,0.9181,[]
...,...,...,...,...,...,...
3,ef1e2b17508448d73c602751720be38f,0.4607,0.6690,0.3005,0.4043,[]
2,d8cca81e3540ae4de37e27d192bfd245,0.4606,0.6690,0.3005,0.4043,[]
5,cf8c3d5dcd1eade7426b6ea5622abd88,0.4606,0.6690,0.3005,0.4043,[]
0,abe534d581ec6d552243d6955d3c3cd8,0.4340,0.6225,0.3022,0.3679,[]


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the score columns.
metronome_bots.describe()

Unnamed: 0,bot_suspicion_score,centrality_rank,clique_score,volume_score
count,147.0,147.0,147.0,147.0
mean,0.607016,0.785167,0.417654,0.62009
std,0.079357,0.09701,0.166567,0.174392
min,0.3204,0.4125,0.1966,0.207
25%,0.553,0.72035,0.3097,0.5178
50%,0.5993,0.7934,0.3744,0.6033
75%,0.66245,0.83235,0.4708,0.7523
max,0.7593,1.0,1.0,1.0


In [11]:
# Keep only the users whose suspicion score lies strictly above the
# 80th percentile of all METRONOME_SIMILAR scores.
metronome_scores = metronome_bots['bot_suspicion_score']
metronome_cutoff = metronome_scores.quantile(0.8)
filtered_metronome_bots = metronome_bots[metronome_scores > metronome_cutoff]
filtered_metronome_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
20,a7bece6bf8c926b88caa8c0e130335ca,0.7593,0.9114,0.4709,0.9182,[]
44,3b262fc9f742d702f0ad2d861cb9874c,0.7592,0.9114,0.4708,0.9181,[]
40,42e08609134349429a28a5d9c0a766c8,0.7592,0.9114,0.4708,0.9181,[]
41,4b13a0eb93b709f1a8e605b6829450d0,0.7592,0.9114,0.4708,0.9181,[]
43,439baa00eef3d49b71e93c1df50149b7,0.7592,0.9114,0.4708,0.9181,[]
111,221906f61123cb21c6653c1797e33ec4,0.7592,0.9113,0.4708,0.9181,[]
83,89d045e252c372ae68d291921a919635,0.7425,1.0,0.2644,1.0,[]
57,cae3a9898fe95e7603e7061c8a2738de,0.7424,0.9999,0.2643,0.9998,[]
145,10c2fb00b3e64fa51c0e5cdb246a1ed3,0.7424,0.8945,0.7033,0.6107,[]
132,50337553ce46b1eb0bd6798c75cfe5b7,0.7419,0.8938,0.703,0.6102,[]


## Avaliando Semelhança entre Técnicas de Detecção

In [12]:
# Collapse each filtered frame into the set of its user ids so the three
# heuristics can be compared with plain set algebra.
metronome_susp, hourly_susp, rapid_susp = (
    set(frame['user_id'])
    for frame in (
        filtered_metronome_bots,
        filtered_hourly_bots,
        filtered_rapid_bots,
    )
)

In [13]:
# Users flagged by two heuristics at once (m = metronome, h = hourly, r = rapid).
overlap_mh = metronome_susp.intersection(hourly_susp)
overlap_mr = metronome_susp.intersection(rapid_susp)
overlap_hr = hourly_susp.intersection(rapid_susp)

# Sizes of the three pairwise intersections, in the order mh, mr, hr.
tuple(map(len, (overlap_mh, overlap_mr, overlap_hr)))

(2, 2, 37)

In [14]:
def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float in [0.0, 1.0]; 0.0 when both sets are empty, where the
        ratio would otherwise be 0/0.
    """
    union = a | b  # built once and reused for both the guard and the divisor
    if not union:
        return 0.0
    return len(a & b) / len(union)

# Pairwise Jaccard similarity of the suspect sets.
# Suffix digits: 1 = metronome, 2 = hourly, 3 = rapid.
j12, j13, j23 = (
    jaccard(metronome_susp, hourly_susp),
    jaccard(metronome_susp, rapid_susp),
    jaccard(hourly_susp, rapid_susp),
)

j12, j13, j23

(0.016, 0.016260162601626018, 0.23870967741935484)

In [15]:
def overlap_ratio(a: set, b: set) -> float:
    """Return the overlap coefficient |a & b| / min(|a|, |b|) of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float in [0.0, 1.0]; 0.0 when either set is empty, where the
        ratio would otherwise divide by zero.
    """
    smaller = min(len(a), len(b))  # computed once for the guard and the divisor
    if smaller == 0:
        return 0.0
    return len(a & b) / smaller

# Pairwise overlap ratio of the suspect sets.
# Suffix digits: 1 = metronome, 2 = hourly, 3 = rapid.
# The last call passes (rapid, hourly) rather than (hourly, rapid).
o12, o13, o23 = (
    overlap_ratio(metronome_susp, hourly_susp),
    overlap_ratio(metronome_susp, rapid_susp),
    overlap_ratio(rapid_susp, hourly_susp),
)

o12, o13, o23

(0.06666666666666667, 0.06666666666666667, 0.3894736842105263)

In [16]:
# Users flagged by all three heuristics at once.
highest_confident = metronome_susp.intersection(hourly_susp, rapid_susp)
len(highest_confident)

1

In [17]:
# Users flagged by at least one of the three heuristics.
concatenated = metronome_susp.union(hourly_susp, rapid_susp)
len(concatenated)

182

In [18]:
import pandas as pd

# Sets have no stable iteration order (string hashing is randomised per
# process), so the ids are sorted to make the exported files reproducible
# from one run to the next. Assumes user ids are all strings, as in the
# tables displayed above.
# NOTE(review): both paths are relative to the current working directory,
# which the setup cell changes with os.chdir("..") — confirm that '../data'
# resolves to the intended folder.

# Users flagged by all three heuristics.
pd.DataFrame({'user_id': sorted(highest_confident)}).to_csv(
    '../data/highest_confident_bots.csv', index=False
)

# Users flagged by at least one heuristic.
pd.DataFrame({'user_id': sorted(concatenated)}).to_csv(
    '../data/all_suspected_bots.csv', index=False
)