In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy
import networkx as nx
import matplotlib.pyplot as plt 
import json
import os
from urllib.parse import urljoin  

## Extracting character information — this step is already included in "characters_analysis"

In [2]:
# Directory that receives one JSON file per scraped character.
folder_path = "rickmorty_characters"

# exist_ok=True keeps the cell safe to re-run and avoids the window between
# "does it exist?" and "create it" that a separate os.path.exists() check has.
os.makedirs(folder_path, exist_ok=True)

In [9]:
# Infobox label (upper-cased, as compared in extract_infobox_data) -> field name
# used in the extracted record. "FIRST APPEARANCE" and "FIRST SEEN IN" are two
# spellings of the same field and therefore share one target name.
desired_infobox_fields = dict([
    ("SPECIES", "Species"),
    ("TYPE", "Type"),
    ("AGE", "Age"),
    ("AVERAGE LIFESPAN", "Average Lifespan"),
    ("STATUS", "Status"),
    ("OCCUPATION", "Occupation"),
    ("HOME PLANET", "Home Planet"),
    ("PLACE OF ORIGIN", "Place of Origin"),
    ("AFFILIATION", "Affiliation"),
    ("FAMILY", "Family"),
    ("FIRST APPEARANCE", "First Appearance"),
    ("FIRST SEEN IN", "First Appearance"),
    ("VOICE ACTOR", "Voice Actor"),
])

# Function to get character links from a given category page URL
def get_character_links(page_url, timeout=30):
    """Collect character page URLs from one page of a category listing.

    Args:
        page_url: URL of the category page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(character_links, next_page_url)`` tuple. ``character_links`` is a
        list of absolute URLs, in page order, for every
        ``.category-page__member-link`` element that has an ``href``.
        ``next_page_url`` is the absolute URL taken from the
        ``.category-page__pagination-next`` element, or ``None`` when that
        element (or its ``href``) is missing.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    base_url = "https://rickandmorty.fandom.com"

    page = requests.get(page_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an empty listing.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Locate all character links on the page
    character_links = []
    for link in soup.select(".category-page__member-link"):
        href = link.get('href')
        # urljoin(base, None) returns the bare base URL, so links without an
        # href must be skipped or the site root ends up in the results.
        if href:
            character_links.append(urljoin(base_url, href))

    # Find the "Next page" link, if it exists
    next_button = soup.select_one(".category-page__pagination-next")
    next_href = next_button.get('href') if next_button else None
    next_page_url = urljoin(base_url, next_href) if next_href else None

    return character_links, next_page_url

# Function to extract specific infobox data
def extract_infobox_data(soup):
    """Read the fields listed in ``desired_infobox_fields`` from a page's infobox.

    Args:
        soup: Parsed page; searched for an ``<aside role="region">`` element
            whose ``div.pi-item`` children carry an ``h3.pi-data-label`` and a
            ``div.pi-data-value``.

    Returns:
        A dict with one key per distinct value of ``desired_infobox_fields``
        (in mapping order). Fields not present on the page stay ``None``.
        Merge rules when a field occurs more than once:

        * "First Appearance": the first occurrence wins.
        * "Family" / "Affiliation": occurrences are joined with ``", "``.
        * every other field: the last occurrence wins.
    """
    # dict.fromkeys gives every record the same, deterministic key order
    # (iterating a set of strings would vary between interpreter runs).
    infobox_data = dict.fromkeys(desired_infobox_fields.values())

    infobox = soup.find("aside", {"role": "region"})  # Infobox container
    if infobox is None:
        return infobox_data

    for item in infobox.find_all("div", class_="pi-item"):
        label = item.find("h3", class_="pi-data-label")
        value = item.find("div", class_="pi-data-value")
        if label is None or value is None:
            continue

        label_text = label.get_text(strip=True).upper()  # Upper-case to match the mapping keys
        if label_text not in desired_infobox_fields:
            continue

        mapped_field_name = desired_infobox_fields[label_text]
        value_text = value.get_text(strip=True)

        if mapped_field_name == "First Appearance":
            # Keep the first of "FIRST APPEARANCE" / "FIRST SEEN IN". This must
            # be its own branch: chaining the None-check into the condition lets
            # a second match fall through to the overwrite case below.
            if infobox_data[mapped_field_name] is None:
                infobox_data[mapped_field_name] = value_text
        elif mapped_field_name in ("Family", "Affiliation"):
            # Several infobox rows may feed these fields; keep all of them.
            if infobox_data[mapped_field_name]:
                infobox_data[mapped_field_name] += ", " + value_text
            else:
                infobox_data[mapped_field_name] = value_text
        else:
            infobox_data[mapped_field_name] = value_text

    return infobox_data

def _safe_filename(name):
    """Return ``name`` as a file base name: spaces and path separators become underscores."""
    base = name.replace(' ', '_')
    # A title such as "X/History" would otherwise point into a sub-directory
    # that does not exist and make the save fail.
    for separator in (os.sep, os.altsep):
        if separator:
            base = base.replace(separator, '_')
    return base


# Function to scrape character data and save connected URLs
def scrape_character_data(character_url, timeout=30):
    """Download one character page, extract its data and save it as JSON.

    Args:
        character_url: URL of the character page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The saved record: ``name`` (page title), ``infobox`` (result of
        ``extract_infobox_data``), ``content`` (text of the top-level ``<p>``
        and ``<ul>`` elements of the main content, newline-joined, stopping at
        the first element containing "Fan Feed") and ``linked_pages`` (sorted,
        de-duplicated URLs under ``rickandmorty.fandom.com/wiki/`` that those
        elements link to).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no title heading or no main content block.

    Side effects:
        Writes ``<folder_path>/<name>.json`` (spaces and path separators in the
        name replaced by underscores) and prints a confirmation line.
    """
    page = requests.get(character_url, timeout=timeout)
    # Do not parse and save an HTTP error page as if it were a character.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Retrieve character name
    title_tag = soup.find("h1", class_="page-header__title")
    if title_tag is None:
        raise ValueError(f"No page title found at {character_url}")
    name = title_tag.text.strip()

    # Extract infobox data
    infobox_data = extract_infobox_data(soup)

    # Locate the main content section
    content_div = soup.find("div", class_="mw-parser-output")
    if content_div is None:
        raise ValueError(f"No main content block found at {character_url}")

    content_data = []
    page_links = set()  # URLs linked within the character page (unique)

    # Only direct children are inspected so nested lists are not counted twice.
    for element in content_div.find_all(['p', 'ul'], recursive=False):
        if "Fan Feed" in element.text:
            break  # Stop when reaching "Fan Feed"

        content_data.append(element.get_text(strip=True))

        # Keep links that stay inside this wiki
        for a in element.find_all('a', href=True):
            link_url = urljoin(character_url, a['href'])
            if "rickandmorty.fandom.com/wiki/" in link_url:
                page_links.add(link_url)

    # Structure data for saving
    character_data = {
        "name": name,
        "infobox": infobox_data,
        "content": "\n".join(content_data),
        # Sorted so that re-scraping an unchanged page yields an identical file.
        "linked_pages": sorted(page_links),
    }

    # Save each character's data as a JSON file in the specified folder
    filename = f"{folder_path}/{_safe_filename(name)}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(character_data, f, indent=4)

    print(f"Saved {name} data to {filename}")
    return character_data

# Main function to iterate through all pages and scrape all characters
def scrape_all_characters(start_url, delay=1):
    """Walk the paginated category listing and scrape every character once.

    Args:
        start_url: URL of the first category page.
        delay: Seconds to pause after each scrape attempt.

    Returns:
        None. Each character is saved by ``scrape_character_data`` as soon as
        it is scraped; progress and failures are printed.

    Notes:
        A failure on a single character is printed and skipped. The link is not
        marked as visited, so it is retried if it shows up again on a later
        page. Errors raised while fetching a category page propagate to the
        caller.
    """
    visited_links = set()  # Characters scraped successfully
    seen_pages = set()     # Category pages already processed
    current_page = start_url

    # Stop on the last page, or if pagination ever points back to a page that
    # was already processed (which would otherwise loop forever).
    while current_page and current_page not in seen_pages:
        seen_pages.add(current_page)
        print(f"Scraping page: {current_page}")
        character_links, next_page = get_character_links(current_page)

        # Scrape each character on the current page
        for link in character_links:
            if link in visited_links:  # Skip if already scraped
                print(f"Skipping already scraped character: {link}")
                continue

            try:
                scrape_character_data(link)  # Save data immediately for each character
                visited_links.add(link)  # Mark as visited
            except Exception as e:
                print(f"Failed to scrape {link}: {e}")
            finally:
                # Pause after failures too: an erroring or rate-limiting server
                # is exactly the one that should not be hit back-to-back.
                time.sleep(delay)

        # Move to the next page
        current_page = next_page

# Start URL for the character category (first page of the listing; the
# following pages are reached through the "next page" links).
start_url = "https://rickandmorty.fandom.com/wiki/Category:Characters"
# Runs the whole crawl when this cell executes: network requests for every
# listed character, one JSON file written per character, progress printed below.
scrape_all_characters(start_url)


Scraping page: https://rickandmorty.fandom.com/wiki/Category:Characters
Saved Abandoned Jerrys data to rickmorty_characters/Abandoned_Jerrys.json
Saved Abrodolph Lincoler data to rickmorty_characters/Abrodolph_Lincoler.json
Saved Adam data to rickmorty_characters/Adam.json
Saved Adjudicator Rick data to rickmorty_characters/Adjudicator_Rick.json
Saved Afro Rick data to rickmorty_characters/Afro_Rick.json
Saved Agent Gribbles data to rickmorty_characters/Agent_Gribbles.json
Saved Alan Rails data to rickmorty_characters/Alan_Rails.json
Saved Albert Einstein data to rickmorty_characters/Albert_Einstein.json
Saved Alexander data to rickmorty_characters/Alexander.json
Saved Alien Googah data to rickmorty_characters/Alien_Googah.json
Saved Alien Morty data to rickmorty_characters/Alien_Morty.json
Saved Alien Rick data to rickmorty_characters/Alien_Rick.json
Saved Alphabetrians data to rickmorty_characters/Alphabetrians.json
Saved Alternate versions of characters data to rickmorty_characters/