In [1]:
#Importing libraries

import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin

In [2]:
import pandas as pd

In [3]:
# Series index page; the episode links are collected from it below
site_url = "https://scriptmochi.com/tv-series/how-i-met-your-mother/"

In [4]:
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(site_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
#Getting a list of links of all episodes

# href=True skips anchors that carry no href attribute, so the list never
# contains None (which would break episode_name and urljoin further down).
links = soup.find_all('a', class_='chapter__link', href=True)
episode_links = [link['href'] for link in links]
for episode_link in episode_links:
    print(episode_link)

/tv-series/how-i-met-your-mother/pilot-5482
/tv-series/how-i-met-your-mother/purple-giraffe-1393
/tv-series/how-i-met-your-mother/sweet-taste-of-liberty-2860
/tv-series/how-i-met-your-mother/return-of-the-shirt-7680
/tv-series/how-i-met-your-mother/okay-awesome-8976
/tv-series/how-i-met-your-mother/slutty-pumpkin-8703
/tv-series/how-i-met-your-mother/matchmaker-3187
/tv-series/how-i-met-your-mother/the-duel-2301
/tv-series/how-i-met-your-mother/belly-full-of-turkey-4692
/tv-series/how-i-met-your-mother/the-pineapple-incident-2300
/tv-series/how-i-met-your-mother/the-limo-2522
/tv-series/how-i-met-your-mother/the-wedding-4641
/tv-series/how-i-met-your-mother/drumroll-please-2462
/tv-series/how-i-met-your-mother/zip-zip-zip-1285
/tv-series/how-i-met-your-mother/game-night-5166
/tv-series/how-i-met-your-mother/cupcake-7451
/tv-series/how-i-met-your-mother/life-among-the-gorillas-2189
/tv-series/how-i-met-your-mother/nothing-good-happens-after-2-a-m-2369
/tv-series/how-i-met-your-mother/ma

In [10]:
#Deriving a readable episode title from each link

def episode_name(link):
    """Turn an episode link into a human-readable title.

    The last path segment of the link is used as the slug. Its hyphens
    become spaces, a trailing all-digit token (the numeric id, as in
    ``pilot-5482``) is removed when present, and the result is passed
    through ``str.capitalize``.

    Parameters
    ----------
    link : str
        Relative or absolute episode URL, with or without a trailing slash.

    Returns
    -------
    str
        The title, e.g. ``"Purple giraffe"``; an empty string when the
        link has no usable slug.
    """
    # Ignore a trailing slash so ".../pilot-5482/" still yields the slug
    slug = link.rstrip('/').split('/')[-1]
    words = slug.split('-')

    # Drop the last token only when it really is a numeric id; a slug
    # without an id keeps all of its words.
    if words[-1].isdigit():
        words = words[:-1]

    return " ".join(words).capitalize()

# Preview the title derived from every collected episode link
for episode_link in episode_links:
    title = episode_name(episode_link)
    print(title)

Pilot
Purple giraffe
Sweet taste of liberty
Return of the shirt
Okay awesome
Slutty pumpkin
Matchmaker
The duel
Belly full of turkey
The pineapple incident
The limo
The wedding
Drumroll please
Zip zip zip
Game night
Cupcake
Life among the gorillas
Nothing good happens after 2 a m
Mary the paralegal
Best prom ever
Milk
Come on
Where were we
The scorpion and the toad
Brunch
Ted mosby architect
World s greatest couple
Aldrin justice
Swarley
Atlantic city
Slap bet
Single stamina
How lily stole christmas
First time in new york
Columns
Monday night football
Lucky penny
Stuff
Arrivederci fiero
Moving day
Bachelor party
Showdown
Something borrowed
Something blue
Wait for it
We re not from here
Third wheel
Little boys
How i met everyone else
I m not that guy
Dowisetrepla
Spoiler alert
Slapsgiving
The yips
The platinum rule
No tomorrow
Ten sessions
The bracket
The chain of screaming
Sandcastles in the sand
The goat
Rebound bro
Everything must go
Miracles
Do i know you
The best burger in new york

In [7]:
#Parsing dialogues

def parse_page(url, timeout=30):
    """Download one episode page and extract its speaker/line pairs.

    Parameters
    ----------
    url : str
        Absolute URL of the episode page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of dict or None
        One ``{"name": ..., "text": ...}`` dict for every line that
        contains a colon and does not start with ``[``. ``name`` is the
        part before the first colon and ``text`` is everything after it,
        unstripped. An empty list is returned when the page has no
        ``div.content`` element. ``None`` is returned when the request
        raises or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report network failures the same way as a bad status code
        print(f"Failed to fetch page: {url} ({error})")
        return None

    if response.status_code != 200:
        # Error message if the request fails
        print(f"Failed to fetch page: {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find('div', class_='content')
    if content is None:
        # No content container on the page, so there is nothing to extract
        return []

    # A double <br/> is treated as a line boundary; each fragment is then
    # re-parsed so that any remaining tags are removed from the text.
    lines = str(content).replace('<br/><br/>', '\n').split('\n')
    clean_lines = [BeautifulSoup(line, 'html.parser').get_text(strip=True) for line in lines if line.strip()]

    data = []
    for text in clean_lines:
        # Keep only "<speaker>: <line>" entries; lines starting with "[" are skipped
        if ':' in text and not text.startswith("["):
            character, line = text.split(':', 1)
            data.append({"name": character, "text": line})
    return data

# Quick sanity check of parse_page on a single episode (the pilot)
url = "https://scriptmochi.com/tv-series/how-i-met-your-mother/pilot-5482"
parse_page(url)

[{'name': 'Narrator',
  'text': " Kids, I'm going to tell you an incredible story. The story of how I met your mother"},
 {'name': 'Son', 'text': ' Are we being punished for something?'},
 {'name': 'Narrator', 'text': ' No'},
 {'name': 'Daughter', 'text': ' Yeah, is this going to take a while?'},
 {'name': 'Narrator',
  'text': ' Yes. (Kids are annoyed) Twenty-five years ago, before I was dad, I had this whole other life.'},
 {'name': 'Narrator',
  'text': ' It was way back in 2005. I was twenty-seven just starting to make it as an architect and living in New York with my friend Marshall, my best friend from college. My life was good and then Uncle Marshall went and screwed the whole thing up.'},
 {'name': 'Marshall', 'text': ' (Opens ring) Will you marry me.'},
 {'name': 'Ted',
  'text': " Yes, perfect! And then you're engaged, you pop the champagne! You drink a toast! You have sex on the kitchen floor... Don't have sex on our kitchen floor."},
 {'name': 'Marshall',
  'text': ' Got it

In [16]:
#A function that goes through all the links and scrapes the dialogues of each episode

def scrape_episodes_dialogue(episode_links, base_url,
                             output_path="How_I met_your_mother_episodes_dialogues.json"):
    """Scrape the dialogues of every episode and save them as one JSON file.

    Parameters
    ----------
    episode_links : iterable of str
        Episode links; each one is resolved against ``base_url``.
    base_url : str
        URL the links are joined to with ``urljoin``.
    output_path : str, optional
        Where the JSON file is written. Defaults to the file name this
        notebook has always used.

    Returns
    -------
    None
        The result is written to ``output_path``. Nothing is returned, so
        calling this as the last line of a cell does not display the data.
    """
    episodes_data = []

    for episode_link in episode_links:
        # Build the absolute URL to scrape
        full_episode_url = urljoin(base_url, episode_link)

        # Get the name of the episode
        episode_title = episode_name(full_episode_url)
        print(f"Scraping dialogue for {episode_title}...")

        # Parse the dialogues; a network error on one episode must not
        # discard the episodes that were already scraped.
        try:
            dialogues = parse_page(full_episode_url)
        except requests.RequestException as error:
            print(f"Request error for {full_episode_url}: {error}")
            dialogues = None

        if dialogues:
            episodes_data.append({
                "episode_title": episode_title,
                "dialogues": dialogues
            })
        else:
            print(f"Failed to scrape dialogues for {episode_title}")

    # ensure_ascii=False writes non-ASCII characters as-is, so the file
    # encoding is pinned to UTF-8 instead of the platform default.
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(episodes_data, json_file, ensure_ascii=False, indent=4)

    print(f"Dialogue scraping completed. Data saved to {output_path}")

In [17]:
# Episode hrefs are site-relative, so they are resolved against the site root
base_url = "https://scriptmochi.com"
scrape_episodes_dialogue(episode_links, base_url)

Scraping dialogue for Pilot...
Scraping dialogue for Purple giraffe...
Scraping dialogue for Sweet taste of liberty...
Scraping dialogue for Return of the shirt...
Scraping dialogue for Okay awesome...
Scraping dialogue for Slutty pumpkin...
Scraping dialogue for Matchmaker...
Scraping dialogue for The duel...
Scraping dialogue for Belly full of turkey...
Scraping dialogue for The pineapple incident...
Scraping dialogue for The limo...
Scraping dialogue for The wedding...
Scraping dialogue for Drumroll please...
Scraping dialogue for Zip zip zip...
Scraping dialogue for Game night...
Scraping dialogue for Cupcake...
Scraping dialogue for Life among the gorillas...
Scraping dialogue for Nothing good happens after 2 a m...
Scraping dialogue for Mary the paralegal...
Scraping dialogue for Best prom ever...
Scraping dialogue for Milk...
Scraping dialogue for Come on...
Scraping dialogue for Where were we...
Scraping dialogue for The scorpion and the toad...
Scraping dialogue for Brunch...


  clean_lines = [BeautifulSoup(line, 'html.parser').get_text(strip=True) for line in lines if line.strip()]


Scraping dialogue for No tomorrow...
Scraping dialogue for Ten sessions...
Scraping dialogue for The bracket...
Scraping dialogue for The chain of screaming...
Scraping dialogue for Sandcastles in the sand...
Scraping dialogue for The goat...
Scraping dialogue for Rebound bro...
Scraping dialogue for Everything must go...
Scraping dialogue for Miracles...
Scraping dialogue for Do i know you...
Scraping dialogue for The best burger in new york...
Scraping dialogue for I heart nj...
Scraping dialogue for Intervention...
Scraping dialogue for Shelter island...
Scraping dialogue for Happily ever after...
Scraping dialogue for Not a father s day...
Scraping dialogue for Woooo...
Scraping dialogue for The naked man...
Scraping dialogue for The fight...
Scraping dialogue for Little minnesota...
Scraping dialogue for Benefits...
Scraping dialogue for Three days of snow...
Scraping dialogue for The possimpible...
Scraping dialogue for The stinsons...
Scraping dialogue for Sorry bro...
Scraping 