In [1]:
import ipyparallel as ipp

# Attach to the ipyparallel cluster described by an existing cluster file and
# open a synchronous client; the bare `rc` displays the client repr.
# NOTE(review): the cluster file is an absolute path inside one user's home
# directory, so this cell only runs on that account — consider making it configurable.
cluster = ipp.Cluster.from_file("/home/scc/lennart.giessing/.ipython/profile_gpu/security/cluster-.json")
rc = cluster.connect_client_sync()
rc

<ipyparallel.client.client.Client at 0x7f4ad85a62c0>

In [2]:
import ast
import json
import random
import zipfile

import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
# Load datasets (all paths are relative to the notebook's working directory)
chats = pd.read_parquet('../02_data/chats.parquet')

chat_index = np.load('../02_data/chat_index.npy')
# Per-URL table; its columns (url, domain, virality, pc1, ...) are listed by url_domains.info() below
url_domains = pd.read_csv('../02_data/url_domains.csv')
# url_domains.info()
#domain_pc1 = pd.read_csv('../02_data/domain_pc1.csv')
# Provides the 'index' and 'url' columns used later to rebuild `url_index`
url_data = pd.read_parquet('../02_data/url.data.parquet')
# Sparse matrix; later cells slice its columns by URL index and use its row numbers as chat ids
chat_url_shares = load_npz('../02_data/chat_url_shares.npz')


In [4]:
url_domains.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18141615 entries, 0 to 18141614
Data columns (total 19 columns):
 #   Column               Dtype  
---  ------               -----  
 0   index                int64  
 1   url                  object 
 2   domain               object 
 3   messages             int64  
 4   chats                int64  
 5   avalanches           int64  
 6   mean_avalanche_size  float64
 7   top_avalanche_size   int64  
 8   virality             float64
 9   top_shares_1h        int64  
 10  top_shares_6h        int64  
 11  top_shares_1d        int64  
 12  top_shares_3d        int64  
 13  top_shares_14d       int64  
 14  first_share_date     object 
 15  final_share_date     object 
 16  pc1                  float64
 17  rank                 float64
 18  year                 int64  
dtypes: float64(4), int64(11), object(4)
memory usage: 2.6+ GB


In [5]:
url_domains

Unnamed: 0,index,url,domain,messages,chats,avalanches,mean_avalanche_size,top_avalanche_size,virality,top_shares_1h,top_shares_6h,top_shares_1d,top_shares_3d,top_shares_14d,first_share_date,final_share_date,pc1,rank,year
0,3,https://live.childrenshealthdefense.org/shows/...,childrenshealthdefense.org,31,24,18,1.722222,8,0.327384,8,8,8,14,17,2022-09-23 00:56:14.000000,2023-07-03 12:05:08.872485,0.176170,1329.0,2022
1,39,https://vernoncoleman.org/articles/vernon-cole...,vernoncoleman.org,10,6,6,1.666667,5,0.547723,4,6,6,7,9,2023-05-30 11:12:51.426699,2023-06-14 13:44:37.560958,0.569545,166.0,2023
2,41,https://www.naturalblaze.com/2023/03/us-govern...,naturalblaze.com,84,62,57,1.473684,5,0.154303,4,11,11,15,24,2023-03-17 12:40:30.000000,2023-08-28 21:37:09.214585,0.227196,9.0,2023
3,42,https://expose-news.com/2023/05/26/rockefeller...,expose-news.com,118,69,76,1.552632,11,0.161685,6,24,33,42,69,2023-05-26 11:00:16.000000,2023-09-01 13:58:19.683013,0.305736,258.0,2023
4,43,https://rumble.com/vxv8s3-the-fluoride-decepti...,rumble.com,131,96,81,1.617284,15,0.178861,12,23,37,40,45,2022-03-19 19:19:15.000000,2023-08-29 19:00:44.889558,0.162506,3530.0,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18141610,60788523,https://YouTube.com/OnlineFormOfficial,youtube.com,98,1,84,1.166667,3,0.116344,3,5,7,12,31,2021-06-01 08:15:45,2021-09-30 09:30:24,0.375093,16627.0,2021
18141611,60788824,https://YouTube.com/OnlineFormOfficial*,youtube.com,3,1,3,1.000000,1,0.577350,1,1,1,1,2,2021-06-03 10:02:29,2021-06-19 12:25:11,0.375093,1709042.0,2021
18141612,60788884,https://youtube.com/c/OnlineFormOfficial,youtube.com,2,1,2,1.000000,1,0.707107,1,1,1,2,2,2021-05-27 07:02:17,2021-05-29 05:21:09,0.375093,397065.0,2021
18141613,60788897,https://youtube.com/channel/UCNqG6qB7_ttYapMNH...,youtube.com,9,1,9,1.000000,1,0.333333,1,2,3,6,9,2021-05-13 05:03:52,2021-05-25 12:01:17,0.375093,60177.0,2021


In [6]:
domain_index = np.load('../02_data/domain_index.npy')

In [7]:
domain_index

array(['drsambailey.com', 'jermwarfare.com', 'yummy.doctor', ...,
       'arahdrive.com', 'tb.cn', 'wcash.vip'], dtype='<U62')

## Check Embeddings

In [8]:
# Unzipping files
# Extract the zipped embeddings into ../02_data/article_embeddings
with zipfile.ZipFile('../02_data/article_embeddings.zip', 'r') as zip_ref:
    zip_ref.extractall('../02_data/article_embeddings')

# The extracted .txt file is parsed as JSON below (json.load), not as a pickle
article_embeddings_path = '../02_data/article_embeddings/article_embeddings.txt'
    
with open(article_embeddings_path, 'r') as file:
    article_data = json.load(file)
    
# Build a DataFrame from the parsed JSON and print its column summary
article_embeddings = pd.DataFrame(article_data)
article_embeddings.info()

KeyboardInterrupt: 

In [None]:
# Drop the `url_index` column that came with the embeddings file; the next cell
# rebuilds it from url_data. `errors='ignore'` keeps this cell re-runnable once
# the column is already gone (a plain drop would raise KeyError).
article_embeddings = article_embeddings.drop(columns=['url_index'], errors='ignore')

In [None]:
# Rebuild `url_index` from url_data's "index" column (joined on the URL string),
# then attach each URL's domain from url_domains. Both joins are inner joins, so
# embeddings whose URL is missing from either table are dropped.
url_index_lookup = url_data[['index', 'url']]
url_domain_lookup = url_domains[['url', 'domain']]

article_embeddings = (
    article_embeddings
    .merge(url_index_lookup, on='url', how='inner')
    .rename(columns={'index': 'url_index'})
    .merge(url_domain_lookup, on='url', how='inner')
)

In [None]:
article_embeddings.info()

In [None]:
# Stack the per-article embedding vectors into a single array, one row per article
# (assumes every entry of `article_embedding` is a 1-D vector of the same length — TODO confirm)
embeddings = np.vstack(article_embeddings['article_embedding'])
print("Embeddings shape:", embeddings.shape)

In [None]:
# Subset used by the visualisation cells: rows whose url_index is below 5000,
# plus their embeddings stacked in the same row order
filtered_df = article_embeddings.loc[article_embeddings['url_index'] < 5000]
filtered_embeddings = np.vstack(filtered_df['article_embedding'])

We use UMAP to reduce the dimensionality of the embeddings to 2D (or 3D) for visualization, and color the points of the scatter plots by domain in order to identify clusters.

In [None]:
from umap import UMAP
import matplotlib.pyplot as plt

In [None]:
# Project the subset embeddings (`filtered_embeddings`, shape (n_samples, embedding_dim))
# to 2 dimensions; random_state is fixed so repeated runs use the same seed
umap_model = UMAP(n_components=2, random_state=42)
low_dim_embeddings = umap_model.fit_transform(filtered_embeddings)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Static scatter plot of the 2-D UMAP projection, one colour per domain
umap_x = low_dim_embeddings[:, 0]
umap_y = low_dim_embeddings[:, 1]

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=umap_x,
    y=umap_y,
    hue=filtered_df['domain'],  # colour key: the domain of each article
    palette='Paired',
    alpha=0.7,
    ax=ax,
)
ax.set_title("UMAP Projection of Embeddings")
ax.set_xlabel("UMAP Dimension 1")
ax.set_ylabel("UMAP Dimension 2")

# Replace the legend with an empty, frameless one so it is effectively hidden
ax.legend([], [], frameon=False)

# Write the figure to disk before showing it
fig.savefig("../03_plots/umap_projection.png", dpi=300, bbox_inches="tight")

plt.show()


In [None]:
import plotly.express as px
import pandas as pd

# Interactive version of the UMAP scatter plot.
# Put the two projection columns and the domain of each row of `filtered_df`
# into one DataFrame, which is the input format used for px.scatter below.
plot_data = pd.DataFrame({
    "UMAP Dim 1": low_dim_embeddings[:, 0],
    "UMAP Dim 2": low_dim_embeddings[:, 1],
    "Domain": filtered_df['domain']
})

# Create an interactive scatter plot
fig = px.scatter(
    plot_data,
    x="UMAP Dim 1",
    y="UMAP Dim 2",
    color="Domain",  # one colour per domain
    title="Zoomable UMAP Projection of Embeddings",
    opacity=0.7,
)

# Customize the plot layout
fig.update_layout(
    title_font_size=20,
    xaxis_title="UMAP Dimension 1",
    yaxis_title="UMAP Dimension 2",
    legend_title="Domain",  # has no visible effect while the legend is hidden
    showlegend=False,       # Hide legend
)

# Show the plot
fig.show()

# Save the plot as a standalone HTML file for sharing
fig.write_html("../03_plots/zoom_umap_projection.html")


In [None]:
filtered_df.info()

In [None]:
from sklearn.metrics import silhouette_score, normalized_mutual_info_score

# Refit the 2-D UMAP projection of the subset.
# NOTE(review): parameters, random_state and input are the same as in the earlier
# UMAP cell, so this repeats that fit — consider reusing `low_dim_embeddings`.
umap_model = UMAP(n_components=2, random_state=42)
low_dim_embeddings = umap_model.fit_transform(filtered_embeddings)

# Silhouette score of the projection, using each article's domain as its cluster label
sil_score = silhouette_score(low_dim_embeddings, filtered_df['domain'])
print("Silhouette Score:", sil_score)

In [None]:
article_embeddings.info()

In [None]:
social_media_domains = [
    'facebook.com', 'instagram.com', 'linkedin.com',
    'tiktok.com', 'snapchat.com',
    'youtube.com', 'youtu.be',  
    'twitter.com', 't.co', 't.me',
    'reddit.com', 'redd.it',
    #'google.com', 'feedproxy.google.com', 'goo.gl',
    # 'amzn.to', 'rumble.com' rumble could be scraped kinda efficient
    # 'bit.ly', 'tinyurl.com', 'ow.ly', 'buff.ly', 'shorte.st', 'is.gd', ift.tt link compression 
    # 'boards.4chan.org', '4chan.org' blog posting "social media" platform but not very known and active
    # 'telegra.ph' anonymous blog and article 
]

# Function to check if a domain belongs to a social media site
def is_social_media(domain):
    """Return True when `domain` is a listed social media domain or a subdomain of one.

    Parameters
    ----------
    domain : str
        Bare domain name such as ``'youtube.com'`` or ``'m.youtube.com'``.

    Returns
    -------
    bool
        True for an exact match or a ``.<listed domain>`` suffix match.
        Anything that is not a string (None, NaN, ...) yields False, so the
        result is always a plain bool and can be used directly as a mask.
    """
    # Missing domains arrive as None/NaN; they cannot be social media sites.
    if not isinstance(domain, str):
        return False
    return any(
        domain == sm_domain or domain.endswith(f".{sm_domain}")
        for sm_domain in social_media_domains
    )

# Keep only the rows whose domain is NOT a social media site
social_media_mask = article_embeddings['domain'].apply(is_social_media)
embeddings_sm_filtered = article_embeddings[~social_media_mask]

# Peek at the cleaned DataFrame
print(embeddings_sm_filtered.head())

# Report how many rows the filter removed
total_urls = len(article_embeddings)
kept_urls = len(embeddings_sm_filtered)
num_social_media = total_urls - kept_urls
print(f"Number of URLs in article_embeddings: {total_urls}")
print(f"Number of URLs in filtered df: {kept_urls}")
print(f"Number of URLs filtered out: {num_social_media}")

In [None]:
# Group by domain and count the number of social media rows.
# Note: the first statement adds an `is_social_media` column to
# `article_embeddings` itself, so the frame is modified in place.
article_embeddings['is_social_media'] = article_embeddings['domain'].apply(is_social_media)

# One row per social media domain with the number of articles it contributes
social_media_counts = (
    article_embeddings[article_embeddings['is_social_media']]
    .groupby('domain')
    .size()
    .reset_index(name='count')
)

print(social_media_counts)

In [None]:
# Filter social media rows out of the original data set: `article_embeddings`
# is rebound to the non-social-media rows, so all later cells see the filtered frame
article_embeddings = article_embeddings[~article_embeddings['domain'].apply(is_social_media)]

### Filter English articles

In [None]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Ensure consistent language detection results
DetectorFactory.seed = 0

# Function to detect if the article is in English
def is_english(text):
    """Return True when langdetect classifies `text` as English.

    Parameters
    ----------
    text : str
        Article text to classify.

    Returns
    -------
    bool
        True if the detected language code is ``'en'``. Non-string values
        (None, NaN) and empty or whitespace-only strings return False without
        calling the detector; text on which the detector raises
        LangDetectException also returns False.
    """
    # Missing articles are not strings; passing them to detect() would raise
    # a TypeError that the LangDetectException handler does not catch.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:  # text with no usable features
        return False

# Apply the filter to keep only English articles (runs language detection once per row)
# NOTE(review): the result is stored in a separate frame; the later cells visible in
# this notebook keep using `article_embeddings`, so the filter is not applied
# downstream — confirm whether that is intended.
english_articles = article_embeddings[article_embeddings['article'].apply(is_english)]

# Display the filtered DataFrame
print(english_articles.head())

# Compare row counts before and after the language filter
print(f"Total articles: {len(article_embeddings)}")
print(f"English articles: {len(english_articles)}")


## Filter Chat URL matrix

In [None]:
# Filter chat_url_shares to keep only the columns of URLs present in article_embeddings
# NOTE(review): `url_index` values are used directly as column positions of the
# matrix — confirm that url_data's "index" column matches the matrix column order.
valid_url_indices = article_embeddings['url_index'].values
filtered_chat_url_shares = chat_url_shares.tocsr()[:, valid_url_indices]

In [None]:
valid_url_indices

In [None]:
filtered_chat_url_shares

In [None]:
# Threshold for classification (applied to `pc1` when the labels are built)
threshold = 0.5

# Assign domain indices randomly: shuffle the distinct, non-missing domains and
# number them in shuffled order. A dedicated, seeded generator makes the
# assignment reproducible across kernel restarts without touching the global
# `random` state.
unique_domains = article_embeddings['domain'].dropna().unique()
shuffled_domains = random.Random(42).sample(list(unique_domains), len(unique_domains))
domain_to_index_map = {name: idx for idx, name in enumerate(shuffled_domains)}

# `assign` returns a new frame instead of writing a column into a frame that was
# produced by boolean filtering (which triggers SettingWithCopyWarning).
# Rows with a missing domain get NaN as their domain_index.
article_embeddings = article_embeddings.assign(
    domain_index=article_embeddings['domain'].map(domain_to_index_map)
)


# Add domain information to article embeddings
# url_to_domain = url_domains[['url', 'domain']].set_index('url')
# article_embeddings = article_embeddings.join(url_to_domain, on='url', rsuffix='_domain')

In [None]:
# URL -> domain lookup Series (index: url, values: domain); the edge-list cell
# below uses it to resolve the domain of each shared URL
url_to_domain = article_embeddings.set_index('url')['domain']
# domain_to_index_map = {domain: idx for idx, domain in enumerate(domain_index)}
# domain_to_index_map = aggregated_data.set_index('domain')['domain_index']
# article_embeddings['domain_index'] = article_embeddings['domain'].map(domain_to_index_map)

In [None]:
article_embeddings.info()

In [None]:
# Attach the per-URL metadata columns to every article (default inner join on 'url')
url_metadata = url_domains[['url', 'virality', 'avalanches', 'messages', 'chats', 'pc1', 'year']]
merged_data = article_embeddings.merge(url_metadata, on='url', suffixes=('', '_meta'))

# Binary label: 1 where pc1 is strictly above `threshold`, otherwise 0
merged_data['label'] = merged_data['pc1'].gt(threshold).astype(int)


In [None]:
merged_data.head()

In [None]:
from sklearn.metrics import silhouette_score, normalized_mutual_info_score

# Stack the embeddings of exactly the rows in `merged_data`, so the projection
# and the domain labels passed to silhouette_score have the same length and
# order. The global `embeddings` array was stacked before the social-media rows
# were removed and before the metadata merge, so it no longer lines up with
# `merged_data`.
merged_embeddings = np.vstack(merged_data['article_embedding'])

umap_model = UMAP(n_components=2, random_state=42)
low_dim_embeddings = umap_model.fit_transform(merged_embeddings)

# Silhouette score of the projection, using each article's domain as its cluster label
sil_score = silhouette_score(low_dim_embeddings, merged_data['domain'])
print("Silhouette Score:", sil_score)

In [None]:
def _mean_embedding(embedding_values):
    """Return the element-wise mean of one domain's article embeddings.

    Entries stored as strings are parsed with ast.literal_eval first; all other
    entries are used as they are.
    """
    vectors = [
        np.array(ast.literal_eval(value)) if isinstance(value, str) else value
        for value in embedding_values
    ]
    return np.mean(vectors, axis=0)


# Aggregate URL-based information to the domain level: average the embeddings
# and the numeric URL statistics, and take the first value of the columns
# listed with 'first'.
aggregated_data = merged_data.groupby('domain').agg({
    'article_embedding': _mean_embedding,
    'virality': 'mean',
    'avalanches': 'mean',
    'messages': 'mean',
    'chats': 'mean',
    'year': 'mean',
    'domain_index': 'first',
    'pc1': 'first',
    'label': 'first',
}).reset_index()

aggregated_data.info()

In [None]:
aggregated_data

In [None]:
# Train-test split based on nodes (domains): 80 % / 20 %, fixed seed
unique_domains = aggregated_data['domain'].unique()
train_domains, test_domains = train_test_split(unique_domains, test_size=0.2, random_state=42)

# Split data into train and test sets by domain membership
train_data = aggregated_data[aggregated_data['domain'].isin(train_domains)]
test_data = aggregated_data[aggregated_data['domain'].isin(test_domains)]

# Node feature table: an independent copy so the casts below do not touch aggregated_data
node_features = aggregated_data[['domain_index', 'domain', 'article_embedding', 'virality', 'avalanches', 'messages', 'chats', 'year', 'pc1', 'label']].copy()

node_features['domain_index'] = node_features['domain_index'].astype(int)
# `year` is a per-domain mean at this point; astype(int) truncates it to a whole year
node_features['year'] = node_features['year'].astype(int)
node_features['virality'] = node_features['virality'].round(2)
# node_features['pc1'] = node_features['pc1'].round(2)
# node_features['avalanches'] = node_features['avalanches'].round(2)
# node_features['messages'] = node_features['messages'].round(2)
# node_features['chats'] = node_features['chats'].round(2)

In [None]:
# Create edge list from filtered_chat_url_shares with domain indices
# url_to_domain_map = article_embeddings.set_index('url')['domain']
# rows, cols = chat_url_shares.tocsr().nonzero()
# edges = []

# article_url_map = url_data.set_index('index')['url']

In [None]:
# Create edge list from filtered_chat_url_shares with domain indices
# urls = []
# domain = []
# for row, col in zip(rows[:10000], cols[:10000]):
#     url = article_url_map.iloc[col] if col in article_url_map.index else None
#     urls.append(url)
#     if url is not None:
#         domain.append(url_to_domain[url] if url in url_to_domain.index else None)

In [None]:
# Create the (chat_id, domain_index) edge list from chat_url_shares: one edge per
# non-zero matrix entry whose URL resolves to a domain with an assigned index.
rows, cols = chat_url_shares.tocsr().nonzero()

article_url_map = url_data.set_index('index')['url']

# Resolve URL index -> URL -> domain -> domain index once, with plain dict
# lookups, instead of doing pandas scalar lookups for every non-zero entry.
# The lookup is by label (url_data's "index" value) throughout; the earlier
# mix of a label membership test with positional `.iloc` could return the URL of
# a different row whenever labels and positions differ.
domain_by_url = url_to_domain.to_dict()
embedded_urls = article_url_map[article_url_map.isin(url_to_domain.index)]
col_to_domain_index = {
    url_idx: domain_to_index_map[domain_by_url[url]]
    for url_idx, url in embedded_urls.items()
    if domain_by_url[url] in domain_to_index_map  # skips missing (NaN) domains
}

# Map every non-zero column to its domain index (NaN when it has none) and keep
# only the resolvable entries, in their original order. Intermediate names are
# chosen so that the `domain_index` array loaded earlier is not overwritten.
edge_domain_index = pd.Series(cols).map(col_to_domain_index)
has_domain = edge_domain_index.notna().to_numpy()
edge_list = pd.DataFrame({
    'chat_id': rows[has_domain].astype('int64'),
    'domain_index': edge_domain_index[has_domain].astype('int64').to_numpy(),
})

In [None]:
node_features

In [None]:
import matplotlib.pyplot as plt
import numpy as np

pc1_values = train_data['pc1']
avalanche_values = train_data['avalanches']

# Quantile cut-offs beyond which a domain counts as an outlier and gets a text label
pc1_threshold_high = pc1_values.quantile(0.9999)
pc1_threshold_low = pc1_values.quantile(0.0001)
avalanches_threshold_high = avalanche_values.quantile(0.999)
avalanches_threshold_low = avalanche_values.quantile(0.001)

# Scatter plot of every training domain
fig, ax = plt.subplots(figsize=(12, 6))
sc = ax.scatter(pc1_values, avalanche_values, alpha=0.5, s=100)

# Labeling
ax.set_xlabel('Misinformation Score')
ax.set_ylabel('Avalanches')
ax.set_title('Avalanches of Domains vs. Misinformation Score')

# Annotate only the domains outside any of the four cut-offs
is_outlier = (
    (pc1_values > pc1_threshold_high)
    | (pc1_values < pc1_threshold_low)
    | (avalanche_values > avalanches_threshold_high)
    | (avalanche_values < avalanches_threshold_low)
)
outliers = train_data.loc[is_outlier, ['pc1', 'avalanches', 'domain']]
for x_pos, y_pos, name in outliers.itertuples(index=False, name=None):
    ax.text(x_pos + 0.02, y_pos, name, fontsize=9)

# Save the figure, then display it
fig.savefig('../03_plots/sample_scatter_outliers_pc1_avalanches.png')
plt.show()


In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line after each summary.
train_data.info()
test_data.info()
edge_list.info()

In [None]:
# Step 5: Save results
# edge_list.to_csv('../02_data/train_test/edge_list.csv', index=False)
# train_data.to_csv('../02_data/train_test/train_data.csv', index=False)
# test_data.to_csv('../02_data/train_test/test_data.csv', index=False)

In [None]:
# node_features.to_csv('../02_data/train_test/node_features.csv', index=False)