# Detecção de Bots no Dataset do Telegram

Assumimos que bots apresentam comportamento coordenado com outros usuários ou bots. Utilizando hipóteses de comportamento humano, serão criados grafos com candidatos a bots que serão filtrados usando algoritmos de comunidade. As hipóteses definidas foram:

1. Pessoas dificilmente compartilham mensagens poucos segundos após elas serem enviadas.
2. Pessoas dificilmente compartilham mensagens com frequência no mesmo horário.
3. Pessoas apresentam intervalos de tempo caóticos entre mensagens. Bots tendem a enviar mensagens de forma mais sistemática.

In [1]:
# Dependencies
import networkx as nx
from networkx.algorithms import community
import concurrent.futures
import os

import warnings
import os  # NOTE(review): duplicate of the `import os` above; redundant but harmless
# Move the working directory one level up before importing the project helpers.
# NOTE(review): presumably needed so the `util` package resolves from the
# parent directory -- confirm. Relative paths used later in the notebook are
# resolved against this new working directory.
os.chdir("..")
from util.load_graph import load_graph_by_edge, get_driver
from util.bot_filtering import rank_bot_suspicion

# Suppress FutureWarning messages from this point on.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Driver handle that is passed to every load_graph_by_edge call in this notebook.
driver = get_driver()

## Hipótese 1: Usuários que compartilham mensagens rápido demais

![rapid share](../../images/rapid_share.png)

In [3]:
# Load the graph selected by the 'RAPID_SHARE' edge name through the shared driver.
# NOTE(review): presumably users linked by rapid-share edges -- confirm in util.load_graph.
G_rapid = load_graph_by_edge(driver, 'RAPID_SHARE')

# Rank the graph's users by bot suspicion. The displayed table lists user_id
# alongside bot_suspicion_score, centrality_rank, clique_score, volume_score
# and labels.
rapid_bots = rank_bot_suspicion(G_rapid) 
rapid_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
11,c743967449a387ad2c1c7e03b2c45b36,0.6670,0.1674,1.0000,1.0000,[]
25,1665e22b0f564cd46d343f7677014821,0.5835,0.1259,0.9709,0.7241,[]
26,b4ce87d44421ef3d2c4710051df5fe40,0.5151,0.0772,1.0000,0.4212,[]
99,1ac091b8ed5c4e42383f1b4ff4cc9b2d,0.4150,1.0000,0.0002,0.0743,[]
5,abe534d581ec6d552243d6955d3c3cd8,0.2347,0.5513,0.0000,0.0709,[]
...,...,...,...,...,...,...
85,761cf2c6592fbabbf0dae751e5c36a05,0.0092,0.0224,0.0000,0.0011,[]
156,942d61531313972a81171f1d87d80ac0,0.0089,0.0218,0.0000,0.0011,[]
343,e04f82940a5de7979fffadb350488924,0.0085,0.0207,0.0000,0.0011,[]
121,584b05446c0afec7298ccd5c0a11fa0a,0.0079,0.0193,0.0000,0.0011,[]


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the score columns.
rapid_bots.describe()

Unnamed: 0,bot_suspicion_score,centrality_rank,clique_score,volume_score
count,474.0,474.0,474.0,474.0
mean,0.036305,0.078912,0.007692,0.008271
std,0.051158,0.060814,0.078742,0.060019
min,0.005,0.012,0.0,0.0011
25%,0.0208,0.049025,0.0,0.0011
50%,0.0318,0.0789,0.0,0.0023
75%,0.0348,0.0789,0.0,0.0034
max,0.667,1.0,1.0,1.0


In [5]:
# Keep only the rows whose suspicion score lies strictly above the 70th
# percentile of the rapid-share ranking.
filtered_rapid_bots = rapid_bots.loc[
    lambda ranking: ranking['bot_suspicion_score']
    > ranking['bot_suspicion_score'].quantile(0.7)
]
filtered_rapid_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
11,c743967449a387ad2c1c7e03b2c45b36,0.6670,0.1674,1.0000,1.0000,[]
25,1665e22b0f564cd46d343f7677014821,0.5835,0.1259,0.9709,0.7241,[]
26,b4ce87d44421ef3d2c4710051df5fe40,0.5151,0.0772,1.0000,0.4212,[]
99,1ac091b8ed5c4e42383f1b4ff4cc9b2d,0.4150,1.0000,0.0002,0.0743,[]
5,abe534d581ec6d552243d6955d3c3cd8,0.2347,0.5513,0.0000,0.0709,[]
...,...,...,...,...,...,...
206,699b8378361d65a1932168264633c843,0.0327,0.0800,0.0000,0.0034,[]
138,cf061f911f3df59fca0b3f084abe88cc,0.0327,0.0788,0.0000,0.0056,[]
225,7f6e9c7ce55b111539996f50a57e4787,0.0325,0.0673,0.0129,0.0023,[]
306,5180a70680aa7648bd1cd9652811f892,0.0322,0.0771,0.0000,0.0068,[]


## Hipótese 2: Usuários que compartilham mensagens no mesmo horário frequentemente

![hourly](../../images/hourly.png)

In [6]:
# Load the graph selected by the 'HOURLY_SHARED' edge name through the shared driver.
# NOTE(review): presumably users linked by same-hour sharing edges -- confirm in util.load_graph.
G_hourly = load_graph_by_edge(driver, 'HOURLY_SHARED')

# Rank the graph's users by bot suspicion. The displayed table lists user_id
# alongside bot_suspicion_score, centrality_rank, clique_score, volume_score
# and labels.
hourly_bots = rank_bot_suspicion(G_hourly) 
hourly_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
76,c743967449a387ad2c1c7e03b2c45b36,0.6131,0.0327,1.0000,1.0000,[]
253,b4ce87d44421ef3d2c4710051df5fe40,0.5278,0.0201,0.9909,0.6171,[]
440,3f64a652deccf6273d7d769252bdfb46,0.4119,1.0000,0.0007,0.0579,[]
13,1ac091b8ed5c4e42383f1b4ff4cc9b2d,0.3799,0.9319,0.0000,0.0357,[]
252,1665e22b0f564cd46d343f7677014821,0.3300,0.0226,0.4993,0.6058,[]
...,...,...,...,...,...,...
1859,b417646992e5d8eaf52aab1425b316ac,0.0010,0.0025,0.0000,0.0000,[]
2072,119ea5dcad1f560dbdcb008c3c7427c4,0.0010,0.0025,0.0000,0.0000,[]
826,2c7efac5a30d6354ecc5f86d6c739ba1,0.0010,0.0024,0.0000,0.0000,[]
2089,09dafa0ebeb8462a878322b896d83448,0.0010,0.0026,0.0000,0.0000,[]


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the score columns.
hourly_bots.describe()

Unnamed: 0,bot_suspicion_score,centrality_rank,clique_score,volume_score
count,2413.0,2413.0,2413.0,2413.0
mean,0.007069,0.015684,0.001237,0.001485
std,0.023523,0.03781,0.0304,0.027115
min,0.0009,0.0024,0.0,0.0
25%,0.0021,0.0049,0.0,0.0
50%,0.0036,0.0087,0.0001,0.0001
75%,0.0063,0.0157,0.0002,0.0003
max,0.6131,1.0,1.0,1.0


In [8]:
# Keep only the rows whose suspicion score lies strictly above the 70th
# percentile of the hourly ranking.
filtered_hourly_bots = hourly_bots.loc[
    lambda ranking: ranking['bot_suspicion_score']
    > ranking['bot_suspicion_score'].quantile(0.7)
]
filtered_hourly_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
76,c743967449a387ad2c1c7e03b2c45b36,0.6131,0.0327,1.0000,1.0000,[]
253,b4ce87d44421ef3d2c4710051df5fe40,0.5278,0.0201,0.9909,0.6171,[]
440,3f64a652deccf6273d7d769252bdfb46,0.4119,1.0000,0.0007,0.0579,[]
13,1ac091b8ed5c4e42383f1b4ff4cc9b2d,0.3799,0.9319,0.0000,0.0357,[]
252,1665e22b0f564cd46d343f7677014821,0.3300,0.0226,0.4993,0.6058,[]
...,...,...,...,...,...,...
1642,8ad5417bac03ec5c86365a2b85b3a470,0.0062,0.0153,0.0002,0.0002,[]
641,46c0ca3f9777deb8dc2e63289f1f2616,0.0062,0.0154,0.0001,0.0003,[]
1476,932db1be76939c5af01e75b196495312,0.0062,0.0154,0.0001,0.0000,[]
2270,2420792d5f8528bb060cb3e53d5cc430,0.0062,0.0154,0.0001,0.0000,[]


## Hipótese 3: Usuários que compartilham mensagens com pouca variação no intervalo entre envios

![metronome](../../images/metronome.png)

In [9]:
# Load the graph selected by the 'METRONOME_SIMILAR' edge name through the shared driver.
# NOTE(review): presumably users linked by similar message-interval patterns -- confirm in util.load_graph.
G_metronome = load_graph_by_edge(driver, 'METRONOME_SIMILAR')

# Rank the graph's users by bot suspicion. The displayed table lists user_id
# alongside bot_suspicion_score, centrality_rank, clique_score, volume_score
# and labels.
metronome_bots = rank_bot_suspicion(G_metronome) 
metronome_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
153,96dd9aa3ec0b34b3960f2e1e939a8d64,0.8308,0.8351,0.7418,1.0000,[]
26,03a05508f9a2899dcd3e689fb47ed900,0.8289,0.8298,0.7459,0.9930,[]
167,07916b5f9d395b4ba287939d7662b94d,0.8289,0.8296,0.7461,0.9930,[]
49,6106f6a23e4332847cbafa9f5bcccdbd,0.8289,0.8297,0.7459,0.9929,[]
68,1da1ba1081749ce54622f0590bfde704,0.8288,0.8295,0.7461,0.9930,[]
...,...,...,...,...,...,...
2413,c0daf1c78478267109aaf5422608db05,0.3092,0.7712,0.0000,0.0037,[]
2481,d488b1da51fcff2bf3eb3b5077804cb6,0.3092,0.7712,0.0000,0.0037,[]
2480,65a7098260880ec30c1da27059f8f95f,0.3092,0.7712,0.0000,0.0037,[]
2339,6151c496a834491ebf776043463768e0,0.2594,0.6449,0.0000,0.0072,[]


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the score columns.
metronome_bots.describe()

Unnamed: 0,bot_suspicion_score,centrality_rank,clique_score,volume_score
count,2485.0,2485.0,2485.0,2485.0
mean,0.729997,0.77116,0.743526,0.62061
std,0.064036,0.033537,0.055701,0.26652
min,0.1765,0.4348,0.0,0.0036
25%,0.6978,0.7571,0.7271,0.4427
50%,0.7369,0.7707,0.7474,0.6658
75%,0.7801,0.7893,0.7641,0.8588
max,0.8308,1.0,1.0,1.0


In [11]:
# Keep only the rows whose suspicion score lies strictly above the 70th
# percentile of the metronome ranking.
filtered_metronome_bots = metronome_bots.loc[
    lambda ranking: ranking['bot_suspicion_score']
    > ranking['bot_suspicion_score'].quantile(0.7)
]
filtered_metronome_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
153,96dd9aa3ec0b34b3960f2e1e939a8d64,0.8308,0.8351,0.7418,1.0000,[]
26,03a05508f9a2899dcd3e689fb47ed900,0.8289,0.8298,0.7459,0.9930,[]
167,07916b5f9d395b4ba287939d7662b94d,0.8289,0.8296,0.7461,0.9930,[]
49,6106f6a23e4332847cbafa9f5bcccdbd,0.8289,0.8297,0.7459,0.9929,[]
68,1da1ba1081749ce54622f0590bfde704,0.8288,0.8295,0.7461,0.9930,[]
...,...,...,...,...,...,...
1188,fd2406a7ad5ef8a8cbfa78071f991337,0.7756,0.7729,0.7103,0.9117,[]
1168,39c61e36b694c54c90168f78664d10b4,0.7756,0.7659,0.7224,0.9013,[]
2038,21f0aae1f244c946ddc353f1a0ae29b2,0.7756,0.7619,0.7442,0.8656,[]
1928,4b425585eb4570a153579555af4a1a47,0.7755,0.7926,0.7564,0.7793,[]


## Avaliando Semelhança entre Técnicas de Detecção

In [12]:
# Collect the user ids kept by each hypothesis filter into sets so the three
# techniques can be compared with set operations.
metronome_susp, hourly_susp, rapid_susp = (
    set(flagged['user_id'])
    for flagged in (
        filtered_metronome_bots,
        filtered_hourly_bots,
        filtered_rapid_bots,
    )
)

In [13]:
# Pairwise intersections of the suspect sets
# (m = metronome, h = hourly, r = rapid share).
overlap_mh = metronome_susp.intersection(hourly_susp)
overlap_mr = metronome_susp.intersection(rapid_susp)
overlap_hr = hourly_susp.intersection(rapid_susp)

# Displayed as the (mh, mr, hr) tuple of intersection sizes.
tuple(map(len, (overlap_mh, overlap_mr, overlap_hr)))

(153, 32, 122)

In [14]:
def jaccard(a, b):
    """Return the Jaccard similarity of two sets.

    The similarity is ``len(a & b) / len(a | b)``.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float between 0.0 and 1.0. When both sets are empty the ratio is
        undefined, so 0.0 is returned instead of raising ZeroDivisionError.
    """
    union = a | b
    if not union:
        # Both inputs are empty: return a float so the result type does not
        # depend on the input.
        return 0.0
    return len(a & b) / len(union)

# Jaccard similarity for each pair of techniques
# (1 = metronome, 2 = hourly, 3 = rapid share).
j12, j13, j23 = (
    jaccard(metronome_susp, hourly_susp),
    jaccard(metronome_susp, rapid_susp),
    jaccard(hourly_susp, rapid_susp),
)

j12, j13, j23

(0.11634980988593156, 0.03764705882352941, 0.16531165311653118)

In [15]:
def overlap_ratio(a, b):
    """Return the overlap coefficient of two sets.

    The coefficient is ``len(a & b) / min(len(a), len(b))``, i.e. the share
    of the smaller set that is also present in the other one.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float between 0.0 and 1.0. When either set is empty the ratio is
        undefined, so 0.0 is returned instead of raising ZeroDivisionError.
    """
    smaller_size = min(len(a), len(b))
    if not smaller_size:
        # At least one input is empty: return a float so the result type does
        # not depend on the input.
        return 0.0
    return len(a & b) / smaller_size

# Overlap ratio for each pair of techniques
# (1 = metronome, 2 = hourly, 3 = rapid share).
o12, o13, o23 = (
    overlap_ratio(left, right)
    for left, right in (
        (metronome_susp, hourly_susp),
        (metronome_susp, rapid_susp),
        (rapid_susp, hourly_susp),
    )
)

o12, o13, o23

(0.21161825726141079, 0.23357664233576642, 0.8905109489051095)

In [16]:
# Users flagged by all three hypotheses at once.
highest_confident = set.intersection(metronome_susp, hourly_susp, rapid_susp)
len(highest_confident)

29

In [17]:
# Users flagged by at least one of the three hypotheses.
concatenated = set().union(metronome_susp, hourly_susp, rapid_susp)
len(concatenated)

1327

In [18]:
import pandas as pd

# Persist both suspect lists as single-column ('user_id') CSV files. The
# paths are relative to the current working directory.
#
# The ids are sorted before writing because iterating a set yields an
# arbitrary order (which changes between interpreter runs for strings), so
# the exported files would otherwise differ from run to run. `key=str` keeps
# the sort well-defined even if an id is not a string.
pd.DataFrame({'user_id': sorted(highest_confident, key=str)}).to_csv(
    '../data/highest_confident_bots.csv', index=False
)

pd.DataFrame({'user_id': sorted(concatenated, key=str)}).to_csv(
    '../data/all_suspected_bots.csv', index=False
)