In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from urllib.parse import urlparse
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns 
import requests

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
chats = pd.read_parquet('../02_data/data_archive/raphaela/chats.parquet', engine='pyarrow')
en_domain_stats = pd.read_parquet('../02_data/data_archive/raphaela/en_domain_stats.parquet', engine='pyarrow')
chat_url_shares = np.load('../02_data/data_archive/raphaela/chat_url_shares.npz')
urls = dd.read_parquet('../02_data/data_archive/raphaela/urls.parquet', engine = 'fastparquet') # , engine='pyarrow'



In [3]:
urls = urls.compute()

In [4]:
for url in urls["url"].head():
    print(url)

https://odysee.com/@jermwarfare:2/The-Baileys:8
https://drsambailey.com/its-elementary-my-dear-watson-unmasking-the-viral-paradigm/
https://drsambailey.com/why-nobody-had-caught-or-got-covid-19/
https://live.childrenshealthdefense.org/shows/good-morning-chd/qtpO5WHxFz
https://jonrappoport.substack.com/p/the-millionaire-blogger-in-the-land


## Approach 1: requests

In [8]:
# user identification
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

In [13]:
# Function to get headers of a URL
def get_headers(url):
    try:
        response = requests.get(url, headers=headers, timeout=5) # , verify=False, get instead of head?
        return response.headers  # Return headers
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [14]:
# Loop through URLs and get headers
url_headers = {}
for url in urls["url"]:
    headers = get_headers(url)
    if headers:
        url_headers[url] = headers

# Display the headers of the first few URLs
for url, headers in list(url_headers.items())[:5]:  # Displaying first 5 for brevity
    print(f"URL: {url}")
    print("Headers:")
    for key, value in headers.items():
        print(f"  {key}: {value}")
    print("\n")

Error fetching https://odysee.com/@jermwarfare:2/The-Baileys:8: HTTPSConnectionPool(host='odysee.com', port=443): Read timed out. (read timeout=5)
Error fetching https://drsambailey.com/resources/videos/censorship/new-zealands-greatest-doctor/: HTTPSConnectionPool(host='drsambailey.com', port=443): Read timed out. (read timeout=5)
Error fetching https://drsambailey.com/covid-19/are-the-pfizer-injected-now-gmos/: HTTPSConnectionPool(host='drsambailey.com', port=443): Read timed out. (read timeout=5)
Error fetching https://thesecularheretic.com/terrain-theory-recontextualising-the-germ/: HTTPSConnectionPool(host='thesecularheretic.com', port=443): Max retries exceeded with url: /terrain-theory-recontextualising-the-germ/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000029780573D90>: Failed to resolve 'thesecularheretic.com' ([Errno 11001] getaddrinfo failed)"))
Error fetching https://odysee.com/@CosmicEvent:5/DR-MATT-SHELTON---NANOTECH-IN-THE-VACCINE-R

KeyboardInterrupt: 

## Approach 2: Selenium

In [18]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
from tqdm import tqdm

In [6]:
driver = webdriver.Chrome()
driver.get("https://www.selenium.dev/selenium/web/web-form.html")
title = driver.title
driver.implicitly_wait(0.5)

In [19]:
def get_headers_selenium(url):
    driver = webdriver.Chrome()

    headers = {}

    try:
        driver.get(url)
        time.sleep(1)  # Wait for page to load (adjust as needed)
        title = driver.title
        headers = {url: title}
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    finally:
        driver.quit()

    return headers


In [8]:
# Example usage
url = "https://example.com"
headers = get_headers_selenium(url)
print(headers)

# Close the driver when done
driver.quit()

{'https://example.com': 'Example Domain'}


In [21]:
urls_sample = urls.sample(n=100000, random_state=42)

In [22]:
urls_sample["url"].head()

2043925    https://www.indiatoday.in/india/story/wrestler...
540366     https://www.nytimes.com/2019/08/15/sports/base...
768493     https://bigota.d.miui.com/V13.0.3.0.SKGINXM/mi...
136266                                    http://ptv.io/2Jrx
312520                          https://youtu.be/j5fCqKbSC7M
Name: url, dtype: object

In [25]:
# Loop through URLs and get headers
url_headers = []
for url in urls_sample["url"]:
    headers = get_headers_selenium(url)
    if headers:
        url_headers.append(headers)

error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


KeyboardInterrupt: 

In [24]:
url_headers

[{'https://odysee.com/@jermwarfare:2/The-Baileys:8': 'The-Baileys'},
 {'https://drsambailey.com/its-elementary-my-dear-watson-unmasking-the-viral-paradigm/': '‘It’s Elementary My Dear Watson’ – Unmasking The Viral Paradigm'},
 {'https://drsambailey.com/why-nobody-had-caught-or-got-covid-19/': 'Why Nobody “Had, Caught or Got” COVID-19'},
 {'https://live.childrenshealthdefense.org/shows/good-morning-chd/qtpO5WHxFz': 'Page not found | Childrens Health Defense'},
 {'https://jonrappoport.substack.com/p/the-millionaire-blogger-in-the-land': 'The Millionaire blogger, in the Land of Virology'},
 {'https://planetwavesfm.substack.com/p/charlatans-web': "Charlatan's Web - by Eric F Coppolino"},
 {'https://drsambailey.com/resources/videos/natural-health-remedies/can-soft-drinks-be-healthy/': 'Can Soft Drinks Be Healthy?'},
 {'https://drsambailey.com/resources/videos/interviews/jon-rappoport-make-the-criminals-squirm/': 'Jon Rappoport: Make The Criminals Squirm'},
 {'https://drsambailey.com/resourc

In [23]:
# Loop through URLs and get headers with a progress bar
url_headers = []
for url in tqdm(urls_sample["url"], desc="Fetching headers"):
    headers = get_headers_selenium(url)
    if headers:
        url_headers.append(headers)

Fetching headers:   0%|          | 33/100000 [07:24<341:07:42, 12.28s/it]

Error fetching https://www.libertariannews.org/2020/10/17/im-now-a-believer-q-is-a-real-us-military-intelligence-operation-fighting-off-a-globalist-coup/: Message: unknown error: net::ERR_CONNECTION_TIMED_OUT
  (Session info: chrome=129.0.6668.100)
Stacktrace:
	GetHandleVerifier [0x00007FF780DDB095+29557]
	(No symbol) [0x00007FF780D4FA50]
	(No symbol) [0x00007FF780C0B56A]
	(No symbol) [0x00007FF780C0356D]
	(No symbol) [0x00007FF780BF4459]
	(No symbol) [0x00007FF780BF6202]
	(No symbol) [0x00007FF780BF471F]
	(No symbol) [0x00007FF780BF3FAB]
	(No symbol) [0x00007FF780BF3EEA]
	(No symbol) [0x00007FF780BF1D65]
	(No symbol) [0x00007FF780BF23DC]
	(No symbol) [0x00007FF780C0E6A1]
	(No symbol) [0x00007FF780CA933E]
	(No symbol) [0x00007FF780C8718A]
	(No symbol) [0x00007FF780CA851C]
	(No symbol) [0x00007FF780C86F33]
	(No symbol) [0x00007FF780C5116F]
	(No symbol) [0x00007FF780C522D1]
	GetHandleVerifier [0x00007FF78110C96D+3378253]
	GetHandleVerifier [0x00007FF781158497+3688311]
	GetHandleVerifier 

Fetching headers:   0%|          | 72/100000 [16:48<418:50:06, 15.09s/it]

Error fetching https://dzm0ugdauank9.cloudfront.net/wp-content/uploads/2021/11/2021-11-12T085607Z_1_LYNXMPEHAB0J0_RTROPTP_0_EUROPE-MIGRANTS-BELARUS-POLAND_1-373x210.jpg">Media</a>November: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=129.0.6668.100)
Stacktrace:
	GetHandleVerifier [0x00007FF780DDB095+29557]
	(No symbol) [0x00007FF780D4FA50]
	(No symbol) [0x00007FF780C0B56A]
	(No symbol) [0x00007FF780C0356D]
	(No symbol) [0x00007FF780BF4459]
	(No symbol) [0x00007FF780BF6202]
	(No symbol) [0x00007FF780BF471F]
	(No symbol) [0x00007FF780BF3FAB]
	(No symbol) [0x00007FF780BF3EEA]
	(No symbol) [0x00007FF780BF1D65]
	(No symbol) [0x00007FF780BF23DC]
	(No symbol) [0x00007FF780C0E6A1]
	(No symbol) [0x00007FF780CA933E]
	(No symbol) [0x00007FF780C8718A]
	(No symbol) [0x00007FF780CA851C]
	(No symbol) [0x00007FF780C86F33]
	(No symbol) [0x00007FF780C5116F]
	(No symbol) [0x00007FF780C522D1]
	GetHandleVerifier [0x00007FF78110C96D+3378253]
	GetHandleVerifier [0x00007FF7811584

Fetching headers:   0%|          | 73/100000 [16:57<368:34:23, 13.28s/it]

Error fetching https://havagas.pt/job/34922/ux-ui-designer: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=129.0.6668.100)
Stacktrace:
	GetHandleVerifier [0x00007FF780DDB095+29557]
	(No symbol) [0x00007FF780D4FA50]
	(No symbol) [0x00007FF780C0B56A]
	(No symbol) [0x00007FF780C0356D]
	(No symbol) [0x00007FF780BF4459]
	(No symbol) [0x00007FF780BF6202]
	(No symbol) [0x00007FF780BF471F]
	(No symbol) [0x00007FF780BF3FAB]
	(No symbol) [0x00007FF780BF3EEA]
	(No symbol) [0x00007FF780BF1D65]
	(No symbol) [0x00007FF780BF23DC]
	(No symbol) [0x00007FF780C0E6A1]
	(No symbol) [0x00007FF780CA933E]
	(No symbol) [0x00007FF780C8718A]
	(No symbol) [0x00007FF780CA851C]
	(No symbol) [0x00007FF780C86F33]
	(No symbol) [0x00007FF780C5116F]
	(No symbol) [0x00007FF780C522D1]
	GetHandleVerifier [0x00007FF78110C96D+3378253]
	GetHandleVerifier [0x00007FF781158497+3688311]
	GetHandleVerifier [0x00007FF78114D1CB+3642539]
	GetHandleVerifier [0x00007FF780E9A6B6+813462]
	(No symbol) [0x00007F

Fetching headers:   0%|          | 109/100000 [24:51<398:29:36, 14.36s/it]

Error fetching https://dzm0ugdauank9.cloudfront.net/wp-content/uploads/2021/10/2021-10-05T191232Z_1_LYNXMPEH94134_RTROPTP_0_CAMEROON-SECURITY_1.jpg: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=129.0.6668.100)
Stacktrace:
	GetHandleVerifier [0x00007FF780DDB095+29557]
	(No symbol) [0x00007FF780D4FA50]
	(No symbol) [0x00007FF780C0B56A]
	(No symbol) [0x00007FF780C0356D]
	(No symbol) [0x00007FF780BF4459]
	(No symbol) [0x00007FF780BF6202]
	(No symbol) [0x00007FF780BF471F]
	(No symbol) [0x00007FF780BF3FAB]
	(No symbol) [0x00007FF780BF3EEA]
	(No symbol) [0x00007FF780BF1D65]
	(No symbol) [0x00007FF780BF23DC]
	(No symbol) [0x00007FF780C0E6A1]
	(No symbol) [0x00007FF780CA933E]
	(No symbol) [0x00007FF780C8718A]
	(No symbol) [0x00007FF780CA851C]
	(No symbol) [0x00007FF780C86F33]
	(No symbol) [0x00007FF780C5116F]
	(No symbol) [0x00007FF780C522D1]
	GetHandleVerifier [0x00007FF78110C96D+3378253]
	GetHandleVerifier [0x00007FF781158497+3688311]
	GetHandleVerifier [0x00007F

Fetching headers:   0%|          | 133/100000 [30:09<368:55:37, 13.30s/it]

Error fetching https://www.rt.com/russia/567191-ukraine-blackouts-eu-membership/?utm_campaign=RSS&utm_medium=rss: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=129.0.6668.100)
Stacktrace:
	GetHandleVerifier [0x00007FF780DDB095+29557]
	(No symbol) [0x00007FF780D4FA50]
	(No symbol) [0x00007FF780C0B56A]
	(No symbol) [0x00007FF780C0356D]
	(No symbol) [0x00007FF780BF4459]
	(No symbol) [0x00007FF780BF6202]
	(No symbol) [0x00007FF780BF471F]
	(No symbol) [0x00007FF780BF3FAB]
	(No symbol) [0x00007FF780BF3EEA]
	(No symbol) [0x00007FF780BF1D65]
	(No symbol) [0x00007FF780BF23DC]
	(No symbol) [0x00007FF780C0E6A1]
	(No symbol) [0x00007FF780CA933E]
	(No symbol) [0x00007FF780C8718A]
	(No symbol) [0x00007FF780CA851C]
	(No symbol) [0x00007FF780C86F33]
	(No symbol) [0x00007FF780C5116F]
	(No symbol) [0x00007FF780C522D1]
	GetHandleVerifier [0x00007FF78110C96D+3378253]
	GetHandleVerifier [0x00007FF781158497+3688311]
	GetHandleVerifier [0x00007FF78114D1CB+3642539]
	GetHandleVerif

Fetching headers:   0%|          | 146/100000 [33:02<384:44:22, 13.87s/it]

Error fetching https://twitter.com/kylenabecker/status/1588718931354980352: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=129.0.6668.100)
Stacktrace:
	GetHandleVerifier [0x00007FF780DDB095+29557]
	(No symbol) [0x00007FF780D4FA50]
	(No symbol) [0x00007FF780C0B56A]
	(No symbol) [0x00007FF780BF2BAC]
	(No symbol) [0x00007FF780BF2A70]
	(No symbol) [0x00007FF780C0DF31]
	(No symbol) [0x00007FF780CA7E49]
	(No symbol) [0x00007FF780C86F33]
	(No symbol) [0x00007FF780C5116F]
	(No symbol) [0x00007FF780C522D1]
	GetHandleVerifier [0x00007FF78110C96D+3378253]
	GetHandleVerifier [0x00007FF781158497+3688311]
	GetHandleVerifier [0x00007FF78114D1CB+3642539]
	GetHandleVerifier [0x00007FF780E9A6B6+813462]
	(No symbol) [0x00007FF780D5AB5F]
	(No symbol) [0x00007FF780D56B74]
	(No symbol) [0x00007FF780D56D10]
	(No symbol) [0x00007FF780D45C1F]
	BaseThreadInitThunk [0x00007FFA0DBB7374+20]
	RtlUserThreadSta

Fetching headers:   0%|          | 153/100000 [40:32<440:54:04, 15.90s/it]  


KeyboardInterrupt: 

## 03 Approach
filtering title from URL

In [4]:
import re
from tqdm import tqdm

In [5]:
urls_sample = urls.sample(n=100000, random_state=42)

In [6]:
def extract_title(url):

    if 'youtube' in url.lower():
        return 'YouTube'

    # Pattern to extract the title from the URL (assuming it's between slashes and hyphens)
    match = re.search(r'/([^/]+)/?$', url)  # Extract last part of URL after '/'
    if match:
        # Replace hyphens or underscores with spaces and return the title
        title = match.group(1).replace('-', ' ').replace('_', ' ')
        return title
    return None  

In [7]:
# Initialize tqdm to monitor the progress of apply
tqdm.pandas()

# Apply the function to the 'url' column with a progress bar
urls_sample['title'] = urls_sample['url'].progress_apply(extract_title)

100%|██████████| 100000/100000 [00:01<00:00, 90318.41it/s]


In [10]:

# Display the DataFrame with titles
print(urls_sample)

               id                                                url  \
2043925  31404053  https://www.indiatoday.in/india/story/wrestler...   
540366   55066318  https://www.nytimes.com/2019/08/15/sports/base...   
768493   52148717  https://bigota.d.miui.com/V13.0.3.0.SKGINXM/mi...   
136266    2233418                                 http://ptv.io/2Jrx   
312520   17089736                       https://youtu.be/j5fCqKbSC7M   
...           ...                                                ...   
668130   39465442      http://maps.google.com/?q=57.520810,25.331358   
858521   30218649                      https://www.dixiememories.com   
1018579   5212883                            https://t.co/PfXVDIT0qD   
1680175   7971631  https://twitter.com/sovietvisuals/status/10091...   
1829388  56355340  http://twitter.com/Darrmill/status/13606641053...   

                        start_date                   end_date  \
2043925 2023-05-04 09:33:21.000000 2023-05-04 09:33:21.000000   
54036

### Filter non sense titles with Lama3

In [37]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

ImportError: 
GPT2LMHeadModel requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [31]:
#from transformers import LlamaForCausalLM, LlamaTokenizer
import transformers
import sentencepiece
import torch

# Load the model and tokenizer
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name)
model = transformers.LlamaForCausalLM.from_pretrained(model_name)
model.eval()


ImportError: 
LlamaTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
titles = urls_sample['title'].tolist()  

def is_sensible_title(title, model, tokenizer, threshold=0.5):
    # Encode the input
    inputs = tokenizer(title, return_tensors="pt")
    with torch.no_grad():
        # Generate the output
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss.item()
    
    # If the loss is lower than the threshold, consider it a sensible title
    return loss < threshold

# Filter nonsensical titles
urls_sample['is_sensible'] = urls_sample['title'].apply(
    lambda x: is_sensible_title(x, model, tokenizer)
)

# Keep only sensible titles
filtered_urls_sample = urls_sample[urls_sample['is_sensible']]

In [9]:
import os

# Save the DataFrame as a CSV file in the '02_data' folder
folder_path = "C:/Users/Raphaela/Documents/MA_Studium/4_Semester/MA_Thesis/02_data"
csv_path = os.path.join(folder_path, 'url_sample_with_titles.csv')
urls_sample.to_csv(csv_path, index=False)