# Findings

In [1]:
import os
import pandas as pd

# Switch the working directory to the parent of the folder the notebook was
# started in (presumably the project root that holds `config` -- confirm).
# The starting directory is remembered the first time this cell runs: calling
# os.chdir("..") relative to the *current* directory climbed one more level on
# every re-run of the cell, which broke the import below and the relative
# output paths used later in the notebook.
NOTEBOOK_DIR = globals().get("NOTEBOOK_DIR", os.getcwd())
os.chdir(os.path.abspath(os.path.join(NOTEBOOK_DIR, "..")))

# Must come after the chdir above; PATH is used as a string prefix for the
# input CSV files read below.
from config import PATH

# Seed constant; not referenced by any cell visible in this part of the notebook.
SEED = 42

In [3]:
# Node embeddings table: a `user_id` column plus the embedding columns.
embeddings_csv = PATH + 'node_embeddings5000.csv'
emb_df = pd.read_csv(embeddings_csv)

In [4]:
# Row count before filtering, used in the summary printed at the end.
# NOTE: emb_df is overwritten below, so re-running this cell without reloading
# the CSV reports the already-filtered count as the total.
n_rows = len(emb_df)

# Every column except the `user_id` key is treated as an embedding column.
embedding_cols = list(emb_df.columns)
embedding_cols.remove('user_id')

# Drop rows whose embedding columns are all exactly zero.
is_all_zero = emb_df[embedding_cols].eq(0.0).all(axis=1)
emb_df = emb_df.loc[~is_all_zero]

print('Removed zero embeddings:')
print(f"Nodes remaining = {emb_df.shape[0]}/{n_rows}")

Removed zero embeddings:
Nodes remaining = 19429/677640


In [5]:
# Per-node feature table, keyed by `user_id`.
node_features_csv = PATH + 'node_features.csv'
nodes_df = pd.read_csv(node_features_csv)

# Feature column names, i.e. everything except the join key.
node_columns = list(nodes_df.columns)
node_columns.remove('user_id')

# Inner join: keep only nodes that also have a (non-zero) embedding.
df = pd.merge(nodes_df, emb_df, how="inner", on="user_id")

In [6]:
# Graph metrics per node; the key column is called `node` in the CSV and is
# renamed to `user_id` so it can be joined with the other tables.
metrics_df = (
    pd.read_csv(PATH + 'graph_metrics.csv')
    .rename(columns={"node": "user_id"})
)

# Metric column names: everything except the join key and the stray
# 'Unnamed: 0' column that the CSV carries (skipped silently if it is absent).
metrics_columns = [
    col for col in metrics_df.columns if col not in ('Unnamed: 0', 'user_id')
]

# Rebuild the joined frame from its sources rather than merging into the
# existing `df`: with `df = df.merge(metrics_df, ...)` a second run of this
# cell merged the metrics again, pandas suffixed the duplicated columns with
# _x/_y, and `df[metrics_columns]` further down raised a KeyError.
df = (
    nodes_df
    .merge(emb_df, on="user_id", how="inner")
    .merge(metrics_df, on="user_id", how="inner")
)
df.head()

Unnamed: 0.1,user_id,user_rt,num_post,user_time_rt,num_post_unverified,num_post_non-rumor,num_post_true,num_post_false,num_rt_unverified,num_rt_non-rumor,...,emb_9,emb_10,emb_11,Unnamed: 0,in_degree,out_degree,katz,eigenvector,pagerank,betweenness
0,1000157142,2,0,3.775,0,0,0,0,0,0,...,3.519199,-2.032025,-1.165162,490687,2,2,0.001338,3.768125e-09,2e-06,2.0
1,1000228238,2,0,29.65,0,0,0,0,0,0,...,0.009075,0.006395,0.009014,138774,2,5,0.001271,3.848015e-06,1e-06,45830.0
2,1000424378,1,0,3.18,0,0,0,0,0,1,...,0.044873,0.044618,0.080977,106472,1,5,0.001238,1.489122e-08,1e-06,20.0
3,100084513,3,0,15.95,0,0,0,0,1,2,...,0.174872,0.034832,-0.076246,10498,3,3,0.001403,3.582313e-06,3e-06,34454801.0
4,1001017003,1,0,4.42,0,0,0,0,0,0,...,1.096398,-3.1938,-2.022495,184330,1,2,0.001201,5.430364e-11,2e-06,0.0


In [7]:
# Summary statistics of the node-feature columns in the joined frame.
df.loc[:, node_columns].describe()

Unnamed: 0,user_rt,num_post,user_time_rt,num_post_unverified,num_post_non-rumor,num_post_true,num_post_false,num_rt_unverified,num_rt_non-rumor,num_rt_true,num_rt_false,score,rt_total
count,19429.0,19429.0,19429.0,19429.0,19429.0,19429.0,19429.0,19429.0,19429.0,19429.0,19429.0,19429.0,19429.0
mean,2.629626,0.108961,4468.981,0.020639,0.029801,0.029389,0.029132,0.509445,0.828813,0.607751,0.683617,0.427764,46.269597
std,3.592342,1.33433,80469.38,0.197879,0.987823,0.370731,0.302647,1.104894,2.326949,1.243465,1.243434,1.821189,799.263922
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-20.8,0.0
25%,1.0,0.0,4.73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7e-06,0.0
50%,2.0,0.0,31.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025579,0.0
75%,3.0,0.0,197.27,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.74878,0.0
max,139.0,80.0,2372812.0,8.0,72.0,17.0,15.0,25.0,86.0,30.0,27.0,90.2,59893.0


In [8]:
# Summary statistics of the graph-metric columns in the joined frame.
df.loc[:, metrics_columns].describe()

Unnamed: 0,in_degree,out_degree,katz,eigenvector,pagerank,betweenness
count,19429.0,19429.0,19429.0,19429.0,19429.0,19429.0
mean,2.358896,37.034021,0.001391,0.0003265356,2e-06,658342.9
std,2.444551,543.8268,0.000408,0.003651803,2e-06,10616050.0
min,0.0,1.0,0.001046,5.7399630000000005e-22,-0.000126,0.0
25%,1.0,1.0,0.001164,9.63006e-15,1e-06,3.0
50%,2.0,2.0,0.001278,2.161465e-10,1e-06,37.0
75%,3.0,4.0,0.00144,2.198604e-06,3e-06,7242.0
max,44.0,35379.0,0.008743,0.2492946,9.3e-05,376433700.0


In [15]:
# Nodes with a strictly negative score; these are exported for removal below.
has_negative_score = df['score'] < 0
df_to_remove = df.loc[has_negative_score]
len(df_to_remove)

5603

In [16]:
# Distribution of betweenness among the negative-score nodes.
df_to_remove.loc[:, "betweenness"].describe()

count    5.603000e+03
mean     6.427988e+05
std      8.622262e+06
min      0.000000e+00
25%      2.000000e+00
50%      3.600000e+01
75%      9.179000e+03
max      2.842791e+08
Name: betweenness, dtype: float64

In [18]:
# Write the ids of the negative-score nodes, most negative score first, one id
# per line with no header and no index column.
gate_path = 'network/nodes_to_remove/gate_sorted.txt'

# The path is relative to the current working directory; create the target
# folder up front so to_csv does not fail with OSError when it is missing.
os.makedirs(os.path.dirname(gate_path), exist_ok=True)

df_to_remove = df_to_remove.sort_values(by="score", ascending=True)
df_to_remove['user_id'].to_csv(gate_path, index=False, header=False)

In [20]:
# Start over from the full node-feature table: unlike the frame built earlier,
# this one is NOT restricted to nodes that have an embedding.
df = pd.read_csv(PATH + 'node_features.csv')

# Graph metrics, with the `node` key renamed to `user_id` for the join.
metrics_df = pd.read_csv(PATH + 'graph_metrics.csv').rename(
    columns={"node": "user_id"}
)

# Metric column names: drop the stray CSV index column and the join key.
metrics_columns = list(metrics_df.columns)
for non_metric in ('Unnamed: 0', 'user_id'):
    metrics_columns.remove(non_metric)

df = pd.merge(df, metrics_df, how="inner", on="user_id")

In [21]:
from sklearn.preprocessing import MinMaxScaler

# Flip the sign of the score so that larger values mean "more harmful"
# (the most negative scores become the largest harmfulness values).
df["harmfulness"] = -df["score"]

# Rescale each of the two columns independently to the [0, 1] range.
raw_cols = ["harmfulness", "betweenness"]
norm_cols = ["harmfulness_norm", "betweenness_norm"]

scaler = MinMaxScaler()
df[norm_cols] = scaler.fit_transform(df[raw_cols])


In [24]:
# Convex combination of the two normalised signals: alpha weights harmfulness,
# (1 - alpha) weights betweenness. With alpha = 1 the betweenness term has zero
# weight, so joint_score equals harmfulness_norm.
alpha = 1
harm_term = alpha * df["harmfulness_norm"]
betweenness_term = (1 - alpha) * df["betweenness_norm"]
df["joint_score"] = harm_term + betweenness_term

In [25]:
# Export as many nodes as the score-based removal list (`df_to_remove`, built
# in an earlier cell) contains, so both lists have the same length. The count
# used to be hard-coded as 5603, which silently goes stale when the input data
# or the filtering changes.
n_to_remove = len(df_to_remove)

# The path is relative to the current working directory; create the target
# folder up front so to_csv does not fail with OSError when it is missing.
joint_path = 'network/nodes_to_remove/joint.txt'
os.makedirs(os.path.dirname(joint_path), exist_ok=True)

# Highest joint_score first; write the ids one per line, no header, no index.
top_joint = df.sort_values("joint_score", ascending=False).head(n_to_remove)
top_joint['user_id'].to_csv(joint_path, index=False, header=False)