In [1]:
import json
from urllib.parse import urlparse

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from wordcloud import WordCloud

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Load the archived inputs. The three tabular files are parquet; the URL-share
# data is a NumPy .npz archive.
chats = pd.read_parquet(
    '../02_data/data_archive/raphaela/chats.parquet',
    engine='pyarrow',
)
en_domain_stats = pd.read_parquet(
    '../02_data/data_archive/raphaela/en_domain_stats.parquet',
    engine='pyarrow',
)
chat_url_shares = np.load(
    '../02_data/data_archive/raphaela/chat_url_shares.npz'
)
# NOTE(review): unlike the two frames above, this one is read through dask with
# the fastparquet engine — confirm that the different reader/engine is intended.
urls = dd.read_parquet(
    '../02_data/data_archive/raphaela/urls.parquet',
    engine='fastparquet',
)



In [3]:
# Materialise the lazy dask frame into an in-memory frame. The guard keeps this
# cell re-runnable: once `urls` has been computed it no longer exposes
# `.compute()`, and an unguarded second run would raise AttributeError.
if hasattr(urls, "compute"):
    urls = urls.compute()

In [4]:
# Preview the first five shared links, one per line.
for url in urls["url"].iloc[:5]:
    print(url)

https://odysee.com/@jermwarfare:2/The-Baileys:8
https://drsambailey.com/its-elementary-my-dear-watson-unmasking-the-viral-paradigm/
https://drsambailey.com/why-nobody-had-caught-or-got-covid-19/
https://live.childrenshealthdefense.org/shows/good-morning-chd/qtpO5WHxFz
https://jonrappoport.substack.com/p/the-millionaire-blogger-in-the-land


## Approach 1: requests

In [8]:
# Request headers passed to requests.get() in get_headers(): a desktop-Chrome
# User-Agent string, so requests are not sent with the library's default agent.
# NOTE(review): Chrome's own UA strings end in "Safari/537.36"; the trailing
# "537.3" here looks truncated — confirm before changing.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    )
}

In [13]:
# Function to get headers of a URL
def get_headers(url, timeout=5):
    """Fetch the HTTP response headers for ``url``.

    Issues a GET request using the notebook-level ``headers`` dict as the
    request headers. That global is looked up at call time, so it must still
    hold the request headers when this function runs.

    The request is made with ``stream=True`` so that requests returns as soon
    as the response headers have arrived instead of downloading the whole
    body, and the ``with`` block releases the connection afterwards.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``; defaults to 5 seconds.

    Returns
    -------
    Mapping or None
        ``response.headers`` on success, or ``None`` if requests raised a
        ``RequestException`` (the error is printed, not re-raised).
    """
    try:
        with requests.get(url, headers=headers, timeout=timeout, stream=True) as response:
            return response.headers  # still readable after the response is closed
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [14]:
# Fetch the response headers for every distinct shared URL.
#
# The results are bound to `response_headers`, NOT `headers`: get_headers()
# reads the global `headers` dict as its *request* headers, so rebinding that
# name inside this loop would make every request after the first one send the
# previous response's headers (or None) instead of the User-Agent.
url_headers = {}
for url in urls["url"].drop_duplicates():  # each URL only needs one request
    response_headers = get_headers(url)
    # get_headers() signals failure with None; an empty mapping is still a result.
    if response_headers is not None:
        url_headers[url] = response_headers

# Display the headers of the first few URLs
for url, response_headers in list(url_headers.items())[:5]:  # Displaying first 5 for brevity
    print(f"URL: {url}")
    print("Headers:")
    for key, value in response_headers.items():
        print(f"  {key}: {value}")
    print("\n")

Error fetching https://odysee.com/@jermwarfare:2/The-Baileys:8: HTTPSConnectionPool(host='odysee.com', port=443): Read timed out. (read timeout=5)
Error fetching https://drsambailey.com/resources/videos/censorship/new-zealands-greatest-doctor/: HTTPSConnectionPool(host='drsambailey.com', port=443): Read timed out. (read timeout=5)
Error fetching https://drsambailey.com/covid-19/are-the-pfizer-injected-now-gmos/: HTTPSConnectionPool(host='drsambailey.com', port=443): Read timed out. (read timeout=5)
Error fetching https://thesecularheretic.com/terrain-theory-recontextualising-the-germ/: HTTPSConnectionPool(host='thesecularheretic.com', port=443): Max retries exceeded with url: /terrain-theory-recontextualising-the-germ/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000029780573D90>: Failed to resolve 'thesecularheretic.com' ([Errno 11001] getaddrinfo failed)"))
Error fetching https://odysee.com/@CosmicEvent:5/DR-MATT-SHELTON---NANOTECH-IN-THE-VACCINE-R

KeyboardInterrupt: 

## Approach 2: Selenium

In [17]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time

In [None]:
# Provide the path to the chromedriver executable if it's not in your PATH
CHROME_DRIVER_PATH = "/path/to/chromedriver"

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # Run in headless mode (optional)
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')

# Enable performance logging (to capture network requests). The preference is
# set on the options object rather than by editing DesiredCapabilities.CHROME:
# that dict is shared class-level state, and recent Selenium 4 releases no
# longer accept a `desired_capabilities` argument on webdriver.Chrome().
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

# Set up the WebDriver (make sure to point to your chromedriver)
service = Service(CHROME_DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

def get_headers_selenium(url, wait_seconds=3, browser=None):
    """Load ``url`` in the browser and collect response headers from its network log.

    After navigating, the function sleeps for ``wait_seconds`` and then reads
    the browser's ``'performance'`` log. Every entry whose decoded method is
    exactly ``Network.responseReceived`` contributes one item to the result.

    Each log entry's ``'message'`` field is a JSON document, so it is decoded
    with ``json.loads``. ``eval`` is deliberately not used: JSON's
    ``true``/``false``/``null`` are not Python names (so such entries would
    raise and be dropped), and evaluating log text would execute content that
    the visited page can influence.

    Parameters
    ----------
    url : str
        Page to open.
    wait_seconds : float, optional
        Fixed pause after navigation before the log is read (default 3).
    browser : optional
        Object providing ``get(url)`` and ``get_log('performance')``. When
        omitted, the notebook-level ``driver`` is used.

    Returns
    -------
    dict
        Maps each response URL found in the log to that response's headers.
        Entries that cannot be decoded or lack the expected keys are skipped.
    """
    if browser is None:
        browser = driver  # notebook-level WebDriver

    browser.get(url)
    time.sleep(wait_seconds)  # Wait for page to load (adjust as needed)

    collected = {}
    for entry in browser.get_log('performance'):
        try:
            message = json.loads(entry['message'])['message']
            # Exact match: a substring test would also hit
            # Network.responseReceivedExtraInfo, which has no 'response' payload.
            if message['method'] != 'Network.responseReceived':
                continue
            response = message['params']['response']
            collected[response['url']] = response['headers']
        except (ValueError, KeyError, TypeError):
            # Malformed or unexpected entry: skip it, keep the rest of the log.
            continue
    return collected

# Example usage. The result is stored under its own name so the global
# `headers` dict that get_headers() sends as request headers is not overwritten.
example_url = "https://example.com"
try:
    example_headers = get_headers_selenium(example_url)
    print(example_headers)
finally:
    # Close the driver when done — also when the fetch above raised, so no
    # browser process is left running.
    driver.quit()
