# 2. Data Understanding
Breaking Bad is a critically acclaimed series with 62 episodes. We want to run a sentiment analysis on the dialogue of the different characters in Breaking Bad and see which characters speak the most, which words they use the most, and how their language changes over time.

For this analysis we need the series transcripts, which give us the dialogue spoken in the show. A number of sites provide access to them.
In this case, I used [Forever Dreaming](https://transcripts.foreverdreaming.org/viewforum.php?f=165&sid=18a2d0725580199573a521ce00dc350a). Unfortunately, only the scripts for seasons 1-3 list the character followed by the dialogue, in that order, so we will work with those seasons in the meantime.

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Forum index page that links to the Breaking Bad episode transcripts
url = 'https://transcripts.foreverdreaming.org/viewforum.php?f=165'

# Download the index page. requests applies no timeout by default, so set one
# to keep a stalled connection from blocking the notebook indefinitely.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')

# Collect every element carrying the `topictitle` class; each one is later
# read for its `href` to reach an individual episode page.
episode_links = soup.select('.topictitle')

In [16]:
def _split_heading(heading_text):
    """Split a page heading into ``(season, episode, title)`` strings.

    Returns ``('N/A', 'N/A', 'N/A')`` when the heading has fewer than four
    space-separated tokens.
    """
    parts = heading_text.split(' ')
    if len(parts) < 4:
        return 'N/A', 'N/A', 'N/A'
    # Token 1 is the season, token 3 (minus its final character) the episode
    # and the rest the title -- presumably a heading shaped like
    # "<word> <season> <word> <episode>: <title>"; confirm against the site.
    return parts[1], parts[3][:-1], ' '.join(parts[4:])


def _split_dialogue(lines):
    """Turn raw paragraph texts into ``{'actor', 'text'}`` dictionaries."""
    rows = []
    for line in lines:
        # Skip stage directions and scene markers
        if line.startswith(('(', '[', 'Scene')):
            continue
        # Split on the FIRST colon only, so colons inside the spoken text
        # (e.g. "Walt: Listen: run") do not truncate the dialogue.
        actor, colon, text = line.partition(':')
        if not colon:
            continue
        rows.append({'actor': actor, 'text': text.strip()})
    return rows


def get_transcript(link):
    """Download one episode page and return its dialogue as a DataFrame.

    Args:
        link: Element from the forum index whose ``'href'`` item holds the
            (possibly relative) address of the episode page.

    Returns:
        A DataFrame with the columns ``actor``, ``text``, ``season``,
        ``episode`` and ``title``, one row per paragraph that contains a
        colon and is not a stage direction. The three episode columns are
        ``'N/A'`` when the page has no usable ``<h2>`` heading; the frame
        has zero rows when no dialogue is found.
    """
    # Imported locally so this cell does not depend on extra notebook imports
    from urllib.parse import urljoin

    forum_url = 'https://transcripts.foreverdreaming.org/viewforum.php?f=165'
    # Resolve the href against the forum URL; simply concatenating the two
    # strings yields an address that is not the episode page.
    episode_url = urljoin(forum_url, link['href'])

    # requests has no default timeout; bound the wait for each page
    response = requests.get(episode_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Season / episode / title come from the first <h2>, if there is one
    heading = soup.select_one('h2')
    heading_text = heading.text if heading is not None else ''
    season, episode, title = _split_heading(heading_text)

    # Dialogue paragraphs live inside the #pagecontent container
    container = soup.select_one('#pagecontent')
    paragraphs = container.find_all('p') if container is not None else []
    rows = _split_dialogue([p.text for p in paragraphs])

    # Fix the column names so an episode without dialogue still has the same
    # layout as every other episode when the frames are concatenated.
    df = pd.DataFrame(rows, columns=['actor', 'text'])
    df['season'] = season
    df['episode'] = episode
    df['title'] = title

    return df

# Scrape every linked episode page into its own DataFrame
episode_data = list(map(get_transcript, episode_links))

# Stack the per-episode frames into one table with a fresh 0..n-1 index
full_df = pd.concat(episode_data, ignore_index=True)

# Persist the combined dialogue table, leaving the row index out of the file
full_df.to_csv('BBdata.csv', index=False)

# Show a preview of the first rows
print(full_df.head())


KeyboardInterrupt: 