In [1]:
import requests
import string
from bs4 import BeautifulSoup
import pandas as pd
from bs4.element import ResultSet
import os

# Functions

In [2]:
def track_tropes_crawled(url):
    """Scrape an index page and list the first link found in each table cell.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    pandas.DataFrame
        One row per ``<td>`` that contains an ``<a href=...>``, with columns
        ``trope_name`` (the anchor's stripped text) and ``trope_link`` (the
        raw ``href`` value). Cells without such an anchor are ignored.
    """
    page = requests.get(url)
    parsed = BeautifulSoup(page.text, 'html.parser')

    # First href-bearing anchor of every table cell (None when a cell has none).
    first_anchors = [cell.find('a', href=True) for cell in parsed.find_all('td')]
    anchors = [anchor for anchor in first_anchors if anchor]

    return pd.DataFrame({
        'trope_name': [anchor.get_text(strip=True) for anchor in anchors],
        'trope_link': [anchor['href'] for anchor in anchors],
    })

In [3]:
def _collect_text_and_links(nodes, links, link_names, descend_into_em=False):
    """Flatten a sequence of sibling nodes into text, recording links on the way.

    For each node: an ``<a>`` with an ``href`` contributes its stripped text
    (padded with spaces) and is appended to ``links`` / ``link_names``; a plain
    string contributes its stripped value; any other tag contributes nothing.
    When ``descend_into_em`` is true, the direct children of an ``<em>`` node
    are flattened the same way (one level only).
    """
    text = ''
    for node in nodes:
        if node.name == 'a' and 'href' in node.attrs:
            label = node.get_text(strip=True)
            text += f' {label} '
            links.append(node['href'])
            link_names.append(label)
        elif isinstance(node, str):
            text += node.strip()
        if descend_into_em and node.name == 'em':
            text += _collect_text_and_links(node.contents, links, link_names)
    return text


def parsing_trope_page(url):
    """Download a trope page and extract its description and examples.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    tuple
        Always a 6-tuple, in this order:
        ``(paragraphs, related_links, related_link_names,
        example_descriptions, example_related_links,
        example_related_link_names)``.

        * ``paragraphs`` - text of every ``<p>`` inside the ``main-entry``
          article that appears before the first ``<h2>``.
        * ``related_links`` / ``related_link_names`` - ``href`` and text of
          the anchors that are direct children of those paragraphs.
        * ``example_descriptions`` - text of every ``<li>`` found inside an
          element with class ``folder`` after an ``<h2>`` whose text contains
          ``Examples`` or ``Other examples``.
        * ``example_related_links`` / ``example_related_link_names`` -
          anchors found directly in those list items or directly inside
          their ``<em>`` children.

        If the ``main-content`` div or the ``main-entry`` article is missing,
        a message is printed and all six items are ``None``. Previously only
        three ``None`` values were returned here, which broke callers that
        unpack six values.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Same arity as the success path so callers can unpack unconditionally.
    failure = (None,) * 6

    # Find the main content div with the id 'main-content'
    main_content = soup.find('div', id='main-content')
    if not main_content:
        print("No 'main-content' found.")
        return failure

    # Find the article with id 'main-entry'
    main_entry = main_content.find('article', id='main-entry')
    if not main_entry:
        print("No 'main-entry' found.")
        return failure

    paragraphs = []
    related_links = []
    related_link_names = []
    example_descriptions = []
    example_related_links = []
    example_related_link_names = []

    # Every tag below the article, in document order; both passes walk this list.
    elements = main_entry.find_all(True)

    # Pass 1: description paragraphs, i.e. everything before the first <h2>.
    for elem in elements:
        if elem.name == 'h2':
            break
        if elem.name == 'p':
            paragraph_text = _collect_text_and_links(
                elem.contents, related_links, related_link_names)
            paragraphs.append(paragraph_text.strip())

    # Pass 2: list items inside 'folder' elements once an examples heading is seen.
    found_examples = False
    for elem in elements:
        if elem.name == 'h2':
            heading = elem.get_text()
            if 'Examples' in heading or 'Other examples' in heading:
                found_examples = True
                continue
        if found_examples and 'folder' in elem.get('class', []):
            for li in elem.find_all('li'):
                example_text = _collect_text_and_links(
                    li.contents, example_related_links,
                    example_related_link_names, descend_into_em=True)
                example_descriptions.append(example_text.strip())

    # Report (but do not fail on) pages where nothing was extracted.
    if not paragraphs:
        print("No paragraphs found before the 'Examples' section.")

    if not example_descriptions:
        print("No example descriptions found.")
        print(len(example_descriptions))

    return paragraphs, related_links, related_link_names, example_descriptions, example_related_links, example_related_link_names


# Widespread Ripping

In [4]:
# Load the trope index from the working directory; the scraping code below
# reads its 'trope_name' and 'trope_link' columns.
df_copy = pd.read_csv('all_trope_names_links.csv')

In [5]:
# NOTE: despite the name, this is the path of a CSV *file*, not a directory;
# parsing_with_csv_aid reads it with pd.read_csv and rewrites it with to_csv.
# NOTE(review): the absolute /content/drive/... prefix presumably relies on a
# mounted Google Drive in Colab - confirm before running elsewhere.
output_folder = '/content/drive/MyDrive/04 Class Directed Reading/Trope_Comic_Research/data_raw/parsed_tropes.csv'


In [6]:
def parsing_with_csv_aid(df, output_folder, message):
    """Parse every trope page listed in ``df``, checkpointing results to a CSV.

    The CSV is rewritten after each successfully parsed trope, so an
    interrupted run can be resumed by calling the function again with the
    same arguments: tropes whose ``trope_name`` is already in the CSV are
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``trope_name`` and ``trope_link`` columns.
    output_folder : str
        Path of the checkpoint CSV *file* (the name is historical).
    message : str
        Not used; kept so existing calls keep working.

    Returns
    -------
    None

    Notes
    -----
    * A ``requests`` error stops the loop (everything saved so far is kept).
    * A page that cannot be parsed is reported and skipped without being
      written, so a later run retries it.
    """
    # Names for the six values of parsing_trope_page, in its return order.
    parsed_columns = [
        'trope_description',
        'related_links',
        'related_link_names',
        'example_descriptions',
        'example_related_links',
        'example_related_link_names',
    ]

    if os.path.exists(output_folder):
        output_df = pd.read_csv(output_folder)
        print("Found CSV!")
    else:
        output_df = pd.DataFrame(columns=df.columns)
        print("Created CSV!")

    # Resume by trope name: the CSV is written without its index, so row labels
    # of a reloaded checkpoint say nothing about which rows of `df` are done.
    already_parsed = set(output_df.get('trope_name', []))

    for index, row in df.iterrows():
        name = row['trope_name']
        url = row['trope_link']

        if name in already_parsed:
            print(f"Trope '{name}' already exists, skipping.")
            continue

        try:
            parsed = parsing_trope_page(url)
        except requests.exceptions.RequestException as e:
            print(f"Failed to process {name} at row {index}: {e}")
            break  # Stop the loop if there is an error

        # A failed parse yields None values (possibly fewer than six of them).
        if parsed is None or len(parsed) != len(parsed_columns) or parsed[0] is None:
            print(f"Could not parse {name} at row {index}, skipping.")
            continue

        record = row.to_dict()
        record.update(zip(parsed_columns, parsed))
        new_row = pd.DataFrame([record])

        # Skip the concat for the very first row: concatenating with an empty
        # frame is pointless and is deprecated behaviour in recent pandas.
        if output_df.empty:
            output_df = new_row
        else:
            output_df = pd.concat([output_df, new_row], ignore_index=True)

        output_df.to_csv(output_folder, index=False)
        already_parsed.add(name)
        print(f"Successfully processed and saved: {name}")



In [7]:
# Run (or resume) the scrape over every trope in df_copy, writing results to
# the CSV path held in output_folder. The third argument is not read by the
# function body.
parsing_with_csv_aid(df_copy, output_folder, "test")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Trope 'Absurdly Powerful Student Council' already exists, skipping.
Trope 'Absurdly Sharp Blade' already exists, skipping.
Trope 'Absurdly Sharp Claws' already exists, skipping.
Trope 'Absurdly Short Level' already exists, skipping.
Trope 'Absurdly Short Production Time' already exists, skipping.
Trope 'Absurdly Spacious Sewer' already exists, skipping.
Trope 'Absurdly Youthful Mother' already exists, skipping.
Trope 'Absurd Phobia' already exists, skipping.
Trope 'Ab Urbe Condita' already exists, skipping.
Trope 'Abuse Discretion Shot' already exists, skipping.
Trope 'Abuse Mistake' already exists, skipping.
Trope 'Abuse Of Return Policy' already exists, skipping.
Trope 'Abusive Advertising' already exists, skipping.
Trope 'Abusive Alien Parents' already exists, skipping.
Trope 'Abusive Offspring' already exists, skipping.
Trope 'Abusive Parents' already exists, skipping.
Trope 'Abusive Precursors' already exists, skippi

KeyboardInterrupt: 

In [None]:
# Parse every trope page in one pass (no checkpointing) and attach the results
# to df_copy. parsing_trope_page returns six values on success, so six columns
# are needed; they are named here in the function's return order.
parsed_columns = [
    'trope_description',
    'related_links',
    'related_link_names',
    'example_descriptions',
    'example_related_links',
    'example_related_link_names',
]

parsed_rows = []
for url in df_copy['trope_link']:
    values = list(parsing_trope_page(url))
    # Pad short results (pages that could not be parsed) with None so that
    # every row supplies exactly one value per column.
    values += [None] * (len(parsed_columns) - len(values))
    parsed_rows.append(values[:len(parsed_columns)])

df_copy[parsed_columns] = pd.DataFrame(
    parsed_rows, columns=parsed_columns, index=df_copy.index)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Request failed: 403 Client Error: Forbidden for url: https://tvtropes.org/pmwiki/pmwiki.php/Main/HumanoidAliens
Request failed: 403 Client Error: Forbidden for url: https://tvtropes.org/pmwiki/pmwiki.php/Main/HumanoidFemaleAnimal
Request failed: 403 Client Error: Forbidden for url: https://tvtropes.org/pmwiki/pmwiki.php/Main/HumanOutsideAlienInside
Request failed: 403 Client Error: Forbidden for url: https://tvtropes.org/pmwiki/pmwiki.php/Main/HumanPackMule
Request failed: 403 Client Error: Forbidden for url: https://tvtropes.org/pmwiki/pmwiki.php/Main/HumanPet
Request failed: 403 Client Error: Forbidden for url: https://tvtropes.org/pmwiki/pmwiki.php/Main/HumanPincushion
Request failed: 403 Client Error: Forbidden for url: https://tvtropes.org/pmwiki/pmwiki.php/Main/HumanPopsicle
Request failed: 403 Client Error: Forbidden for url: https://tvtropes.org/pmwiki/pmwiki.php/Main/HumanResources
Request failed: 403 Client Erro

KeyboardInterrupt: 

In [None]:
# Preview the first rows of df_copy.
df_copy.head()

In [None]:
# Summarise df_copy's columns, dtypes and non-null counts.
df_copy.info()

# Widespread Downloading

In [None]:
# Reload the trope index into a fresh frame for the download loop below,
# which reads its 'trope_name' and 'trope_link' columns.
df = pd.read_csv('all_trope_names_links.csv')

In [None]:
# NOTE: this rebinds output_folder. From here on it is the directory that
# receives one "<trope_name>.html" file per trope in the download loop.
# NOTE(review): the absolute /content/drive/... prefix presumably relies on a
# mounted Google Drive in Colab - confirm before running elsewhere.
output_folder = '/content/drive/MyDrive/04 Class Directed Reading/Trope_Comic_Research/data_raw/Webpages'

In [None]:
# Download the raw HTML of every trope page into output_folder, one file per
# trope. Files that already exist are skipped, so the cell can be re-run to
# resume; the first request error stops the loop.

# Fail before making any request if the destination is missing. The folder is
# deliberately not created here, so a wrong or unavailable path is reported
# instead of being silently created somewhere unintended.
if not os.path.isdir(output_folder):
    raise FileNotFoundError(f"Output folder does not exist: {output_folder}")

for index, row in df.iterrows():
    name = row['trope_name']
    url = row['trope_link']

    # The trope name becomes the file name, so neutralise path separators;
    # names without separators map to the same file name as before.
    safe_name = str(name).replace('/', '_').replace(os.sep, '_')
    file_path = os.path.join(output_folder, f"{safe_name}.html")

    # Check if the file already exists
    if os.path.exists(file_path):
        print(f"File already exists for {name}, skipping download.")
        continue

    try:
        # Without a timeout a stalled connection would block the loop forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Check for HTTP errors
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {name} at row {index}: {e}")
        break  # Stop the loop if there is an error

    # Write under a temporary name and rename afterwards: an interrupted run
    # then never leaves a truncated .html file that later runs would skip.
    partial_path = file_path + '.part'
    with open(partial_path, 'w', encoding='utf-8') as file:
        file.write(response.text)
    os.replace(partial_path, file_path)

    print(f"Successfully downloaded: {name}")

print("Download process completed.")

File already exists for Aardvark Trunks, skipping download.
File already exists for Abandoned Area, skipping download.
File already exists for Abandoned Camp Ruins, skipping download.
File already exists for Abandoned Catchphrase, skipping download.
File already exists for Abandoned Hospital, skipping download.
File already exists for Abandoned Hospital Awakening, skipping download.
File already exists for Abandoned Info Page, skipping download.
File already exists for Abandoned Laboratory, skipping download.
File already exists for Abandoned Mascot, skipping download.
File already exists for Abandoned Mine, skipping download.
Successfully downloaded: Abandoned Pet In A Box
Successfully downloaded: Abandoned Playground
Successfully downloaded: Abandoned War Child
Successfully downloaded: Abandoned Warehouse
Successfully downloaded: Abandonment Induced Animosity
Successfully downloaded: Abandon Ship
Successfully downloaded: Abandon Shipping
Successfully downloaded: Abandon The Disabled


KeyboardInterrupt: 