# Detecção de Bots no Dataset do Telegram

Assumimos que bots apresentam comportamento coordenado com outros usuários ou bots. Utilizando hipóteses de comportamento humano, serão criados grafos com candidatos a bots que serão filtrados usando algoritmos de comunidade. As hipóteses definidas foram:

1. Pessoas dificilmente compartilham mensagens poucos segundos após elas serem enviadas
2. Pessoas dificilmente compartilham mensagens no mesmo horário frequentemente
3. Pessoas apresentam intervalos de tempo caóticos entre mensagens. Bots tendem a enviar mensagens de forma mais sistematizada.

In [1]:
# dependencies
import networkx as nx
from networkx.algorithms import community
import concurrent.futures
import os

import warnings
import os  # NOTE(review): duplicate of the `import os` above; redundant but harmless
# Move the working directory one level up before the project imports below.
# NOTE(review): presumably so the `util` package resolves from the parent
# directory — confirm. This is relative, so re-running the cell moves up again.
os.chdir("..")
from util.load_graph import load_graph_by_edge, get_driver
from util.bot_filtering import rank_bot_suspicion

# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Handle passed to the load_graph_by_edge calls below.
# NOTE(review): presumably a graph-database driver — confirm in util.load_graph.
# No explicit close of this handle is visible in the notebook cells shown here.
driver = get_driver()

## Hipótese 1: Usuários que compartilham mensagens rápido demais

![rapid share](../../images/rapid_share.png)

In [10]:
# Load the graph for the 'RAPID_SHARE' edge type (hypothesis 1 of this notebook).
# NOTE(review): graph type and contents are defined in util.load_graph — confirm.
G_rapid = load_graph_by_edge(driver, 'RAPID_SHARE')

# Score the users of that graph. The rendered output shows one row per user_id
# with bot_suspicion_score, centrality_rank, clique_score, volume_score and labels.
rapid_bots = rank_bot_suspicion(G_rapid) 
rapid_bots



Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
11,49273,0.6670,0.1674,1.0000,1.0000,[]
25,49291,0.5835,0.1259,0.9709,0.7241,[]
26,49351,0.5151,0.0772,1.0000,0.4212,[]
99,49575,0.4150,1.0000,0.0002,0.0743,[]
5,49272,0.2347,0.5513,0.0000,0.0709,[]
...,...,...,...,...,...,...
85,53929,0.0092,0.0224,0.0000,0.0011,[]
156,55855,0.0089,0.0218,0.0000,0.0011,[]
343,52561,0.0085,0.0207,0.0000,0.0011,[]
121,49685,0.0079,0.0193,0.0000,0.0011,[]


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
rapid_bots.describe()

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score
count,474.0,474.0,474.0,474.0,474.0
mean,52253.940928,0.036305,0.078912,0.007692,0.008271
std,3079.043486,0.051158,0.060814,0.078742,0.060019
min,49271.0,0.005,0.012,0.0,0.0011
25%,49873.5,0.0208,0.049025,0.0,0.0011
50%,51186.5,0.0318,0.0789,0.0,0.0023
75%,53096.5,0.0348,0.0789,0.0,0.0034
max,59798.0,0.667,1.0,1.0,1.0


In [None]:
# Keep only the rows whose suspicion score is strictly above the 70th
# percentile of all scores obtained for the rapid-share hypothesis.
filtered_rapid_bots = rapid_bots.loc[
    lambda ranked: ranked['bot_suspicion_score']
    > ranked['bot_suspicion_score'].quantile(0.7)
]
filtered_rapid_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
11,49273,0.6670,0.1674,1.0000,1.0000,[]
25,49291,0.5835,0.1259,0.9709,0.7241,[]
26,49351,0.5151,0.0772,1.0000,0.4212,[]
99,49575,0.4150,1.0000,0.0002,0.0743,[]
5,49272,0.2347,0.5513,0.0000,0.0709,[]
...,...,...,...,...,...,...
206,50228,0.0327,0.0800,0.0000,0.0034,[]
138,50285,0.0327,0.0788,0.0000,0.0056,[]
225,50398,0.0325,0.0673,0.0129,0.0023,[]
306,51522,0.0322,0.0771,0.0000,0.0068,[]


## Hipótese 2: Usuários que compartilham mensagens no mesmo horário frequentemente

![hourly](../../images/hourly.png)

In [13]:
# Load the graph for the 'HOURLY_SHARED' edge type (hypothesis 2 of this notebook).
# NOTE(review): graph type and contents are defined in util.load_graph — confirm.
G_hourly = load_graph_by_edge(driver, 'HOURLY_SHARED')

# Score the users of that graph. The rendered output shows one row per user_id
# with bot_suspicion_score, centrality_rank, clique_score, volume_score and labels.
hourly_bots = rank_bot_suspicion(G_hourly) 
hourly_bots



Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
76,49273,0.6131,0.0327,1.0000,1.0000,[]
253,49351,0.5278,0.0201,0.9909,0.6171,[]
440,51656,0.4119,1.0000,0.0007,0.0579,[]
13,49575,0.3799,0.9319,0.0000,0.0357,[]
252,49291,0.3300,0.0226,0.4993,0.6058,[]
...,...,...,...,...,...,...
1859,52517,0.0010,0.0025,0.0000,0.0000,[]
2072,54270,0.0010,0.0025,0.0000,0.0000,[]
826,49489,0.0010,0.0024,0.0000,0.0000,[]
2089,54633,0.0010,0.0026,0.0000,0.0000,[]


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
hourly_bots.describe()

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score
count,2413.0,2413.0,2413.0,2413.0,2413.0
mean,53151.148777,0.007069,0.015684,0.001237,0.001485
std,3046.944011,0.023523,0.03781,0.0304,0.027115
min,49271.0,0.0009,0.0024,0.0,0.0
25%,50433.0,0.0021,0.0049,0.0,0.0
50%,52535.0,0.0036,0.0087,0.0001,0.0001
75%,55525.0,0.0063,0.0157,0.0002,0.0003
max,59884.0,0.6131,1.0,1.0,1.0


In [15]:
# Keep only the rows whose suspicion score is strictly above the 70th
# percentile of all scores obtained for the hourly-share hypothesis.
filtered_hourly_bots = hourly_bots.loc[
    lambda ranked: ranked['bot_suspicion_score']
    > ranked['bot_suspicion_score'].quantile(0.7)
]
filtered_hourly_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
76,49273,0.6131,0.0327,1.0000,1.0000,[]
253,49351,0.5278,0.0201,0.9909,0.6171,[]
440,51656,0.4119,1.0000,0.0007,0.0579,[]
13,49575,0.3799,0.9319,0.0000,0.0357,[]
252,49291,0.3300,0.0226,0.4993,0.6058,[]
...,...,...,...,...,...,...
1642,51179,0.0062,0.0153,0.0002,0.0002,[]
641,49676,0.0062,0.0154,0.0001,0.0003,[]
1476,50470,0.0062,0.0154,0.0001,0.0000,[]
2270,56969,0.0062,0.0154,0.0001,0.0000,[]


## Hipótese 3: Usuários que compartilham mensagens com pouca variação
![metronome](../../images/metronome.png)

In [18]:
# Load the graph for the 'METRONOME_SIMILAR' edge type (hypothesis 3 of this notebook).
# NOTE(review): graph type and contents are defined in util.load_graph — confirm.
G_metronome = load_graph_by_edge(driver, 'METRONOME_SIMILAR')

# Score the users of that graph. The rendered output shows one row per user_id
# with bot_suspicion_score, centrality_rank, clique_score, volume_score and labels.
metronome_bots = rank_bot_suspicion(G_metronome) 
metronome_bots



Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
153,51903,0.8308,0.8351,0.7418,1.0000,[]
26,49616,0.8289,0.8298,0.7459,0.9930,[]
167,52202,0.8289,0.8296,0.7461,0.9930,[]
49,49916,0.8289,0.8297,0.7459,0.9929,[]
68,50241,0.8288,0.8295,0.7461,0.9930,[]
...,...,...,...,...,...,...
2413,50968,0.3092,0.7712,0.0000,0.0037,[]
2481,58709,0.3092,0.7712,0.0000,0.0037,[]
2480,56240,0.3092,0.7712,0.0000,0.0037,[]
2339,50023,0.2594,0.6449,0.0000,0.0072,[]


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
metronome_bots.describe()

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score
count,2485.0,2485.0,2485.0,2485.0,2485.0
mean,52298.16499,0.729997,0.77116,0.743526,0.62061
std,2581.283957,0.064036,0.033537,0.055701,0.26652
min,49271.0,0.1765,0.4348,0.0,0.0036
25%,50180.0,0.6978,0.7571,0.7271,0.4427
50%,51556.0,0.7369,0.7707,0.7474,0.6658
75%,53755.0,0.7801,0.7893,0.7641,0.8588
max,59734.0,0.8308,1.0,1.0,1.0


In [20]:
# Keep only the rows whose suspicion score is strictly above the 70th
# percentile of all scores obtained for the metronome hypothesis.
filtered_metronome_bots = metronome_bots.loc[
    lambda ranked: ranked['bot_suspicion_score']
    > ranked['bot_suspicion_score'].quantile(0.7)
]
filtered_metronome_bots

Unnamed: 0,user_id,bot_suspicion_score,centrality_rank,clique_score,volume_score,labels
153,51903,0.8308,0.8351,0.7418,1.0000,[]
26,49616,0.8289,0.8298,0.7459,0.9930,[]
167,52202,0.8289,0.8296,0.7461,0.9930,[]
49,49916,0.8289,0.8297,0.7459,0.9929,[]
68,50241,0.8288,0.8295,0.7461,0.9930,[]
...,...,...,...,...,...,...
1188,54313,0.7756,0.7729,0.7103,0.9117,[]
1168,50257,0.7756,0.7659,0.7224,0.9013,[]
2038,50749,0.7756,0.7619,0.7442,0.8656,[]
1928,51220,0.7755,0.7926,0.7564,0.7793,[]


## Avaliando Semelhança entre Técnicas de Detecção

In [22]:
# Collect the user ids retained by each technique as sets, so the techniques
# can be compared with set operations.
metronome_susp, hourly_susp, rapid_susp = (
    set(frame['user_id'])
    for frame in (
        filtered_metronome_bots,
        filtered_hourly_bots,
        filtered_rapid_bots,
    )
)

In [23]:
# Users shared by each pair of techniques (m = metronome, h = hourly, r = rapid).
overlap_mh = metronome_susp.intersection(hourly_susp)
overlap_mr = metronome_susp.intersection(rapid_susp)
overlap_hr = hourly_susp.intersection(rapid_susp)

# Show the size of each pairwise intersection.
tuple(map(len, (overlap_mh, overlap_mr, overlap_hr)))

(153, 32, 122)

In [24]:
def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float in ``[0.0, 1.0]``. When both sets are empty the union is
        empty and the ratio is undefined; ``0.0`` is returned in that case.
    """
    # Build the union once; it serves as both the emptiness check and the denominator.
    union = a | b
    if not union:
        return 0.0
    return len(a & b) / len(union)

# Jaccard similarity for each pair of techniques
# (1 = metronome, 2 = hourly, 3 = rapid).
j12, j13, j23 = (
    jaccard(metronome_susp, hourly_susp),
    jaccard(metronome_susp, rapid_susp),
    jaccard(hourly_susp, rapid_susp),
)

j12, j13, j23

(0.11634980988593156, 0.03764705882352941, 0.16531165311653118)

In [25]:
def overlap_ratio(a: set, b: set) -> float:
    """Return the overlap coefficient ``|a & b| / min(|a|, |b|)`` of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float in ``[0.0, 1.0]``; ``1.0`` means the smaller set is fully
        contained in the larger one. When either set is empty the ratio is
        undefined and ``0.0`` is returned.
    """
    # Size of the smaller set, computed once and reused as the denominator.
    smaller = min(len(a), len(b))
    if not smaller:
        return 0.0
    return len(a & b) / smaller

# Overlap coefficient for each pair of techniques
# (1 = metronome, 2 = hourly, 3 = rapid).
o12, o13, o23 = (
    overlap_ratio(metronome_susp, hourly_susp),
    overlap_ratio(metronome_susp, rapid_susp),
    overlap_ratio(rapid_susp, hourly_susp),
)

o12, o13, o23

(0.21161825726141079, 0.23357664233576642, 0.8905109489051095)

In [26]:
# Users flagged by all three techniques simultaneously.
highest_confident = metronome_susp.intersection(hourly_susp, rapid_susp)
highest_confident

{49271,
 49333,
 49432,
 49452,
 49514,
 49519,
 49589,
 49645,
 49654,
 49727,
 49834,
 49862,
 49864,
 50004,
 50009,
 50011,
 50056,
 50112,
 50136,
 50281,
 50574,
 50585,
 51089,
 51662,
 52859,
 52883,
 52896,
 52934,
 54055}