# Node Selection

## Real Network

In [1]:
import os
import pandas as pd

# Move the working directory one level up (presumably so `config` and the
# relative paths used later resolve from the parent directory — confirm).
# The target is computed only once per kernel session: re-running this cell
# must not climb one more directory each time.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
os.chdir(PROJECT_ROOT)

from config import PATH

# Presumably a random seed; no cell visible in this notebook references it.
SEED = 42

In [2]:
# Load the embedding table; `remove_zero_attn` expects it to carry a `user_id` column.
emb_df = pd.read_csv(PATH + 'node_embeddings5000.csv')

In [3]:
def remove_zero_attn(emb_df):
    """Drop every row whose non-`user_id` columns are all exactly zero.

    Prints how many rows survive out of the original total and returns the
    filtered frame. The frame passed in is left untouched.

    Raises:
        ValueError: if `emb_df` has no `user_id` column.
    """
    total_rows = len(emb_df)

    # Every column except the identifier takes part in the zero check.
    value_cols = list(emb_df.columns)
    value_cols.remove('user_id')

    is_all_zero = emb_df[value_cols].eq(0.0).all(axis=1)
    kept = emb_df.loc[~is_all_zero]

    print('Removed zero embeddings:')
    print(f"Nodes remaining = {len(kept)}/{total_rows}")
    return kept

# Rebinds `emb_df` to the filtered frame; the unfiltered table is no longer reachable under this name.
emb_df = remove_zero_attn(emb_df)

Removed zero embeddings:
Nodes remaining = 19429/677640


In [None]:
def add_node_features(emb_df, features_file = 'node_features.csv'):
    """Join the node feature table with the embedding table on `user_id`.

    Args:
        emb_df: frame with a `user_id` column (the embedding table).
        features_file: CSV file name, read from `PATH + features_file`;
            it must contain a `user_id` column.

    Returns:
        A new DataFrame with the feature columns followed by the columns of
        `emb_df`. The join is an inner join, so only users present in both
        tables are kept.
    """
    nodes_df = pd.read_csv(PATH + features_file)
    return nodes_df.merge(emb_df, on="user_id", how="inner")

# `df` holds only the users that appear in both the feature file and the filtered embeddings (inner join).
df = add_node_features(emb_df)

In [None]:
def add_node_metrics(df, metrics_file = 'graph_metrics.csv'):
    """Attach graph metrics to `df` by inner-joining on `user_id`.

    The metrics CSV is read from `PATH + metrics_file`; its `node` column is
    renamed to `user_id` so it can serve as the join key. Rows of `df` with
    no matching metrics row are dropped. Returns a new DataFrame.
    """
    metrics = pd.read_csv(PATH + metrics_file).rename(columns={"node": "user_id"})
    return df.merge(metrics, on="user_id", how="inner")
    
# Rebinds `df` to the merged frame. Re-running this cell on its own would merge
# the metrics into an already-merged frame, so re-run the previous cell first.
df = add_node_metrics(df)
df.head()

Unnamed: 0.1,user_id,user_rt,num_post,user_time_rt,num_post_unverified,num_post_non-rumor,num_post_true,num_post_false,num_rt_unverified,num_rt_non-rumor,...,emb_9,emb_10,emb_11,Unnamed: 0,in_degree,out_degree,katz,eigenvector,pagerank,betweenness
0,1000157142,2,0,3.775,0,0,0,0,0,0,...,3.519199,-2.032025,-1.165162,490687,2,2,0.001338,3.768125e-09,2e-06,2.0
1,1000228238,2,0,29.65,0,0,0,0,0,0,...,0.009075,0.006395,0.009014,138774,2,5,0.001271,3.848015e-06,1e-06,45830.0
2,1000424378,1,0,3.18,0,0,0,0,0,1,...,0.044873,0.044618,0.080977,106472,1,5,0.001238,1.489122e-08,1e-06,20.0
3,100084513,3,0,15.95,0,0,0,0,1,2,...,0.174872,0.034832,-0.076246,10498,3,3,0.001403,3.582313e-06,3e-06,34454801.0
4,1001017003,1,0,4.42,0,0,0,0,0,0,...,1.096398,-3.1938,-2.022495,184330,1,2,0.001201,5.430364e-11,2e-06,0.0


In [6]:
# Keep only the nodes with a strictly negative `score`, then show how many there are.
df_to_remove = df.loc[df['score'].lt(0)]
len(df_to_remove)

5603

In [7]:
# Summary statistics of `betweenness` over the negative-score nodes selected above.
df_to_remove["betweenness"].describe()

count    5.603000e+03
mean     6.427988e+05
std      8.622262e+06
min      0.000000e+00
25%      2.000000e+00
50%      3.600000e+01
75%      9.179000e+03
max      2.842791e+08
Name: betweenness, dtype: float64

In [8]:
# Order from most negative score upwards, then write one user_id per line
# (no header row, no index column).
df_to_remove = df_to_remove.sort_values("score")
df_to_remove['user_id'].to_csv(
    'network/nodes_to_remove/gate_sorted.txt',
    header=False,
    index=False,
)

### Comparison with baseline removal

In [18]:
# Rebuild `df` from the full feature table plus graph metrics, without the
# embedding merge or zero-embedding filter used above. `df` no longer refers
# to the embedding-merged frame after this cell.
df = pd.read_csv(PATH + 'node_features.csv')
df = add_node_metrics(df)
df.shape

(677640, 21)

In [19]:
from sklearn.preprocessing import MinMaxScaler

def remove_harmful_baseline(df, count, alpha=0.5):
    """Return the `count` rows ranked highest by a harmfulness/betweenness blend.

    `harmfulness` is the negated `score`. It and `betweenness` are each
    min-max scaled to [0, 1] and combined as
    ``alpha * harmfulness_norm + (1 - alpha) * betweenness_norm``.

    Args:
        df: frame with `score` and `betweenness` columns. It is not modified.
        count: number of top-ranked rows to return.
        alpha: weight of the harmfulness term; betweenness gets `1 - alpha`.

    Returns:
        A new DataFrame with at most `count` rows, sorted by `joint_score`
        in descending order. It carries the extra columns `harmfulness`,
        `harmfulness_norm`, `betweenness_norm` and `joint_score`.
    """
    # Work on a copy: writing the helper columns straight onto `df` would
    # change the caller's frame as a side effect.
    ranked = df.copy()

    # Negate the score so that a more negative score means more harmful.
    ranked["harmfulness"] = -ranked["score"]

    # Normalize both columns to [0, 1]
    scaler = MinMaxScaler()
    ranked[["harmfulness_norm", "betweenness_norm"]] = scaler.fit_transform(
        ranked[["harmfulness", "betweenness"]]
    )
    ranked["joint_score"] = (
        alpha * ranked["harmfulness_norm"] + (1 - alpha) * ranked["betweenness_norm"]
    )
    return ranked.sort_values("joint_score", ascending=False).head(count)

In [20]:
# 5603 equals the number of negative-score nodes counted earlier in this
# notebook (presumably so both removal lists have the same length — confirm).
# `df` is rebound to the selected rows only.
df = remove_harmful_baseline(df, 5603)
df['user_id'].to_csv('network/nodes_to_remove/jointTest.txt', index=False, header=False)

## Dummy networks

### Gate selected

In [None]:
# Same load-then-filter steps as for the real network, applied to the dummy embedding file.
dummy_df = pd.read_csv(PATH + 'node_embeddings_same_dist.csv')
dummy_df = remove_zero_attn(dummy_df)

Removed zero embeddings:
Nodes remaining = 9331/677640


In [22]:
# Keep only the dummy nodes with a strictly negative `score`, then show how many there are.
# This rebinds `df_to_remove`, replacing the real-network selection.
df_to_remove = dummy_df.loc[dummy_df['score'].lt(0)]
len(df_to_remove)

2281

In [23]:
# Order from most negative score upwards, then write one user_id per line
# (no header row, no index column).
df_to_remove = df_to_remove.sort_values("score")
df_to_remove['user_id'].to_csv(
    'network/nodes_to_remove/dummy_gate_sorted.txt',
    header=False,
    index=False,
)

### Baseline selected

In [24]:
# Rebuild `dummy_df` from the feature table plus graph metrics.
# NOTE(review): this reads the same `node_features.csv` as the real-network
# baseline — confirm the dummy network is meant to share these features.
dummy_df = pd.read_csv(PATH + 'node_features.csv')
dummy_df = add_node_metrics(dummy_df)
dummy_df.shape

(677640, 21)

In [25]:
# NOTE(review): 9331 is the number of dummy nodes left after zero-embedding
# removal, whereas the real-network baseline used the negative-score count.
# The dummy negative-score selection has 2281 nodes — confirm which count is
# intended here.
dummy_df = remove_harmful_baseline(dummy_df, 9331)
# Write the dummy selection itself: `df` still holds the real-network
# baseline rows, so writing `df` here would export the wrong node list.
dummy_df['user_id'].to_csv('network/nodes_to_remove/dummy_baseline.txt', index=False, header=False)