In [None]:
import csv
import os
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Index page that lists one link per episode script.
main_url = 'https://www.seinfeldscripts.com/seinfeld-scripts.html'

# User-Agent header sent with the request below.
# NOTE(review): presumably the site rejects the default `requests` User-Agent — confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# Fetch the index page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the episode index.
response = requests.get(main_url, headers=headers, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# Resolve every link found inside a <table> against the index URL. urljoin
# yields the same result as plain concatenation for bare file names, and also
# handles root-relative ('/x.htm') and absolute hrefs correctly. The strip()
# removes whitespace surrounding the href value.
script_links = [
    urljoin(main_url, link['href'].strip())
    for link in soup.select('table a[href]')
]


In [None]:
# Create the output directory and choose the CSV file that receives the
# parsed dialogue rows.
os.makedirs('Seinfeld_Scripts', exist_ok=True)
csv_file_path = 'Seinfeld_Scripts/jerry_scripts.csv'

# NOTE(review): the season number is derived by cutting the link list into
# fixed blocks of this many episodes; real season lengths may differ — confirm
# against an episode guide before relying on the Season / SEID columns.
EPISODES_PER_SEASON = 24


def iter_dialogue(script_text):
    """Yield ``(character, dialogue)`` pairs parsed from a script's plain text.

    A line counts as dialogue when it contains a colon, does not start with
    '[' (scene directions), and the text before the first colon consists only
    of letters A-Z and spaces once upper-cased. The speaker name is
    upper-cased *before* that check, so mixed-case names are accepted and
    normalised rather than rejected.

    Args:
        script_text: Script body as one string with newline-separated lines.

    Yields:
        Tuples of (upper-cased character name, stripped dialogue text).
    """
    for raw_line in script_text.split('\n'):
        line = raw_line.strip()
        if ':' not in line or line.startswith('['):
            continue
        # The guard above guarantees a colon, so partition always splits.
        speaker, _, spoken = line.partition(':')
        character = speaker.strip().upper()
        if re.match(r'^[A-Z ]+$', character):
            yield character, spoken.strip()


with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Character', 'Dialogue', 'EpisodeNo', 'SEID', 'Season']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Download and parse each script; `index` is the 1-based position of the
    # link in script_links and doubles as the running episode number.
    for index, script_url in enumerate(script_links, start=1):
        try:
            script_response = requests.get(script_url, headers=headers, timeout=30)
            script_response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable page should not discard everything scraped so far.
            print(f'Skipped script: {index} - {script_url} ({exc})')
            continue

        script_html = script_response.text
        script_soup = BeautifulSoup(script_html, 'html.parser')

        # find() returns None when the page has no <div id="content">; report
        # and skip instead of failing with AttributeError on .get_text().
        content_div = script_soup.find('div', id='content')
        if content_div is None:
            print(f'Skipped script: {index} - {script_url} (no div#content found)')
            continue
        script_text = content_div.get_text(separator='\n')

        # These values are constant for the whole episode, so compute them once.
        episode_no = index
        season = (index - 1) // EPISODES_PER_SEASON + 1
        seid = f'S{season:02}E{episode_no:02}'

        for character, dialogue in iter_dialogue(script_text):
            writer.writerow({
                'Character': character,
                'Dialogue': dialogue,
                # Written as floats (e.g. "1.0") to keep the existing CSV format.
                'EpisodeNo': float(episode_no),
                'SEID': seid,
                'Season': float(season)
            })

        print(f'Processed script: {index} - {script_url}')

Processed script: 1 - https://www.seinfeldscripts.com/TheSeinfeldChronicles.htm
Processed script: 2 - https://www.seinfeldscripts.com/TheStakeout.htm
Processed script: 3 - https://www.seinfeldscripts.com/TheRobbery.htm
Processed script: 4 - https://www.seinfeldscripts.com/MaleUnbonding.htm
Processed script: 5 - https://www.seinfeldscripts.com/TheStockTip.htm
Processed script: 6 - https://www.seinfeldscripts.com/TheExGirlfriend.htm
Processed script: 7 - https://www.seinfeldscripts.com/ThePonyRemark.htm
Processed script: 8 - https://www.seinfeldscripts.com/TheJacket.htm
Processed script: 9 - https://www.seinfeldscripts.com/ThePhoneMessage.htm
Processed script: 10 - https://www.seinfeldscripts.com/TheApartment.htm
Processed script: 11 - https://www.seinfeldscripts.com/TheStatue.htm
Processed script: 12 - https://www.seinfeldscripts.com/TheRevenge.htm
Processed script: 13 - https://www.seinfeldscripts.com/TheHeartAttack.htm
Processed script: 14 - https://www.seinfeldscripts.com/TheDeal.htm

In [None]:
# Reload the scraped CSV and rewrite it in place with a leading row-index
# column whose header is empty.
df = pd.read_csv(csv_file_path)

# A previous run of this cell leaves that unlabeled index column in the file,
# and read_csv then names it 'Unnamed: 0'. Drop such columns first so that
# re-running the cell rewrites the same file instead of stacking one more
# index column on every execution.
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
df.to_csv(csv_file_path, index_label='', index=True)