In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from urllib.parse import urlparse
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns 
import requests

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
chats = pd.read_parquet('../02_data/data_archive/raphaela/chats.parquet', engine='pyarrow')
en_domain_stats = pd.read_parquet('../02_data/data_archive/raphaela/en_domain_stats.parquet', engine='pyarrow')
chat_url_shares = np.load('../02_data/data_archive/raphaela/chat_url_shares.npz')
urls = dd.read_parquet('../02_data/data_archive/raphaela/urls.parquet', engine = 'fastparquet') # , engine='pyarrow'



In [3]:
urls = urls.compute()

In [4]:
for url in urls["url"].head():
    print(url)

https://odysee.com/@jermwarfare:2/The-Baileys:8
https://drsambailey.com/its-elementary-my-dear-watson-unmasking-the-viral-paradigm/
https://drsambailey.com/why-nobody-had-caught-or-got-covid-19/
https://live.childrenshealthdefense.org/shows/good-morning-chd/qtpO5WHxFz
https://jonrappoport.substack.com/p/the-millionaire-blogger-in-the-land


## Approach 1: requests

In [8]:
# user identification
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

In [13]:
# Function to get headers of a URL
def get_headers(url):
    try:
        response = requests.get(url, headers=headers, timeout=5) # , verify=False, get instead of head?
        return response.headers  # Return headers
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [None]:
# Loop through URLs and get headers
url_headers = {}
for url in urls["url"]:
    headers = get_headers(url)
    if headers:
        url_headers[url] = headers

# Display the headers of the first few URLs
for url, headers in list(url_headers.items())[:5]:  # Displaying first 5 for brevity
    print(f"URL: {url}")
    print("Headers:")
    for key, value in headers.items():
        print(f"  {key}: {value}")
    print("\n")

## Approach 2: Selenium

In [18]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
from tqdm import tqdm

In [6]:
driver = webdriver.Chrome()
driver.get("https://www.selenium.dev/selenium/web/web-form.html")
title = driver.title
driver.implicitly_wait(0.5)

In [19]:
def get_headers_selenium(url):
    driver = webdriver.Chrome()

    headers = {}

    try:
        driver.get(url)
        time.sleep(1)  # Wait for page to load (adjust as needed)
        title = driver.title
        headers = {url: title}
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    finally:
        driver.quit()

    return headers


In [8]:
# Example usage
url = "https://example.com"
headers = get_headers_selenium(url)
print(headers)

# Close the driver when done
driver.quit()

{'https://example.com': 'Example Domain'}


In [21]:
urls_sample = urls.sample(n=100000, random_state=42)

In [22]:
urls_sample["url"].head()

2043925    https://www.indiatoday.in/india/story/wrestler...
540366     https://www.nytimes.com/2019/08/15/sports/base...
768493     https://bigota.d.miui.com/V13.0.3.0.SKGINXM/mi...
136266                                    http://ptv.io/2Jrx
312520                          https://youtu.be/j5fCqKbSC7M
Name: url, dtype: object

In [None]:
# Loop through URLs and get headers
url_headers = []
for url in urls_sample["url"]:
    headers = get_headers_selenium(url)
    if headers:
        url_headers.append(headers)

In [24]:
url_headers

[{'https://odysee.com/@jermwarfare:2/The-Baileys:8': 'The-Baileys'},
 {'https://drsambailey.com/its-elementary-my-dear-watson-unmasking-the-viral-paradigm/': '‘It’s Elementary My Dear Watson’ – Unmasking The Viral Paradigm'},
 {'https://drsambailey.com/why-nobody-had-caught-or-got-covid-19/': 'Why Nobody “Had, Caught or Got” COVID-19'},
 {'https://live.childrenshealthdefense.org/shows/good-morning-chd/qtpO5WHxFz': 'Page not found | Childrens Health Defense'},
 {'https://jonrappoport.substack.com/p/the-millionaire-blogger-in-the-land': 'The Millionaire blogger, in the Land of Virology'},
 {'https://planetwavesfm.substack.com/p/charlatans-web': "Charlatan's Web - by Eric F Coppolino"},
 {'https://drsambailey.com/resources/videos/natural-health-remedies/can-soft-drinks-be-healthy/': 'Can Soft Drinks Be Healthy?'},
 {'https://drsambailey.com/resources/videos/interviews/jon-rappoport-make-the-criminals-squirm/': 'Jon Rappoport: Make The Criminals Squirm'},
 {'https://drsambailey.com/resourc

In [None]:
# Loop through URLs and get headers with a progress bar
url_headers = []
for url in tqdm(urls_sample["url"], desc="Fetching headers"):
    headers = get_headers_selenium(url)
    if headers:
        url_headers.append(headers)

## 03 Approach
filtering title from URL

In [4]:
import re
from tqdm import tqdm

In [112]:
urls_sample = urls.sample(n=100000, random_state=42)

In [113]:
def extract_title(url):

    if 'youtube' in url.lower():
        return 'YouTube'

    # Pattern to extract the title from the URL (assuming it's between slashes and hyphens)
    match = re.search(r'/([^/]+)/?$', url)  # Extract last part of URL after '/'
    if match:
        # Replace hyphens or underscores with spaces and return the title
        title = match.group(1).replace('-', ' ').replace('_', ' ')
        return title
    return None  

In [114]:
# Initialize tqdm to monitor the progress of apply
tqdm.pandas()

# Apply the function to the 'url' column with a progress bar
urls_sample['title'] = urls_sample['url'].progress_apply(extract_title)

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [00:01<00:00, 97311.82it/s]


In [115]:

# Display the DataFrame with titles
print(urls_sample)

               id                                                url  \
2043925  31404053  https://www.indiatoday.in/india/story/wrestler...   
540366   55066318  https://www.nytimes.com/2019/08/15/sports/base...   
768493   52148717  https://bigota.d.miui.com/V13.0.3.0.SKGINXM/mi...   
136266    2233418                                 http://ptv.io/2Jrx   
312520   17089736                       https://youtu.be/j5fCqKbSC7M   
...           ...                                                ...   
668130   39465442      http://maps.google.com/?q=57.520810,25.331358   
858521   30218649                      https://www.dixiememories.com   
1018579   5212883                            https://t.co/PfXVDIT0qD   
1680175   7971631  https://twitter.com/sovietvisuals/status/10091...   
1829388  56355340  http://twitter.com/Darrmill/status/13606641053...   

                        start_date                   end_date  \
2043925 2023-05-04 09:33:21.000000 2023-05-04 09:33:21.000000   
54036

### Filter non sense titles 

In [116]:
import nltk
from nltk.corpus import words
nltk.download('words')

english_words = set(words.words()) - {"a"}  # Exclude "a" from the set of English words

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Raphaela\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [117]:
def contains_valid_word(title):
    # Check if title is not None
    if title is None:
        return False
    # Split the title into words and check if any are in the list of English words
    title_words = title.lower().split()
    return any(word in english_words for word in title_words)

In [118]:
# Filter the dataframe to include only titles with actual words
filtered_df = urls_sample[urls_sample['title'].apply(contains_valid_word)]

print(filtered_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 31926 entries, 2043925 to 1086530
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          31926 non-null  int64         
 1   url         31926 non-null  object        
 2   start_date  31926 non-null  datetime64[ns]
 3   end_date    31926 non-null  datetime64[ns]
 4   title       31926 non-null  object        
dtypes: datetime64[ns](2), int64(1), object(2)
memory usage: 1.5+ MB
None


Export Data

In [119]:
import os

# Save the DataFrame as a CSV file in the '02_data' folder
folder_path = "C:/Users/Raphaela/Documents/MA_Studium/4_Semester/MA_Thesis/02_data"
csv_path = os.path.join(folder_path, 'url_sample_with_titles.csv')
filtered_df.to_csv(csv_path, index=False)

In [120]:
# Compress and save the DataFrame as a .gz file
csv_gz_path = os.path.join(folder_path, 'url_sample_with_titles.csv.gz')
filtered_df.to_csv(csv_gz_path, index=False, compression='gzip')