<a href="https://colab.research.google.com/github/SaeSaeeda/ComputerScience-2022-2025/blob/main/data_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Necessary imports for the project
import csv
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
!pip install pandas requests beautifulsoup4

In [None]:
# Creates the primary table for the project: downloads the Korean drama
# Wikipedia page, parses its third "wikitable" into a DataFrame and saves it
# to 'kdramas_info.csv'.
url = "https://en.wikipedia.org/wiki/Korean_drama"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Only parse when the page was actually retrieved.
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    wikitables = soup.find_all('table', {'class': 'wikitable'})
    # The project uses the table at index 2. Guard the lookup so a page with
    # fewer tables reaches the "not found" message instead of an IndexError.
    table = wikitables[2] if len(wikitables) > 2 else None

    if table is not None:
        # read_html expects a file-like object; passing literal HTML is deprecated.
        df = pd.read_html(StringIO(str(table)), header=0)[0]
        # Assign the forward-filled column back instead of mutating a column
        # selection in place, which is not guaranteed to reach `df`.
        df['Network'] = df['Network'].ffill()
        df = df.drop(columns=['Ref'])
        df.to_csv('kdramas_info.csv', index=False, encoding='utf-8')

        print("Kdramas information saved to 'kdramas_info.csv'")
    else:
        print("Table not found on the page.")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Kdramas information saved to 'kdramas_info.csv'


In [None]:
# Rebuilds the primary table and adds a 'Drama Link' column holding the href
# of the first hyperlink found in the second <td> of each table row, then
# overwrites 'kdramas_info.csv' with the result.
url = "https://en.wikipedia.org/wiki/Korean_drama"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    wikitables = soup.find_all('table', {'class': 'wikitable'})
    # The project uses the table at index 2. Guard the lookup so a page with
    # fewer tables reaches the "not found" message instead of an IndexError.
    table = wikitables[2] if len(wikitables) > 2 else None
    if table is not None:
        # read_html expects a file-like object; passing literal HTML is deprecated.
        df = pd.read_html(StringIO(str(table)), header=0)[0]
        # Assign the forward-filled column back instead of mutating a column
        # selection in place, which is not guaranteed to reach `df`.
        df['Network'] = df['Network'].ffill()
        # One entry per row that has a second <td>; None when that cell has no link.
        # NOTE(review): assumes every data row has such a cell so the list
        # lines up with the DataFrame rows - pandas raises if the lengths differ.
        drama_links = [
            link['href'] if link else None
            for link in (cell.find('a') for cell in table.select('tr td:nth-of-type(2)'))
        ]
        df['Drama Link'] = drama_links
        df = df.drop(columns=['Ref'])

        df.to_csv('kdramas_info.csv', index=False, encoding='utf-8')

        print("Kdramas information saved to 'kdramas_info.csv'")
    else:
        print("Table not found on the page.")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Kdramas information saved to 'kdramas_info.csv'


In [None]:
# Gets the cast members of the kdrama, adds them to the previously created csv
def get_cast_info(series_url):
    """Download a series page and return the text of its "Cast" section.

    The section is everything between the element whose id is "Cast" and the
    next <h2>; the text of each <p>, <ul>, <ol> and <dl> found there is joined
    with newlines.

    Args:
        series_url: Absolute URL of the series page.

    Returns:
        The section text, an empty string when the page has no "Cast" heading
        (or the section has no text), or None when the page could not be
        retrieved.
    """
    try:
        # A timeout keeps one stalled request from blocking the whole run.
        response = requests.get(series_url, timeout=30)
    except requests.RequestException as error:
        # Treat a network failure like a bad status code so the caller's loop
        # can carry on with the remaining series.
        print(f"Failed to retrieve the series page: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the series page. Status code: {response.status_code}")
        return None

    series_soup = BeautifulSoup(response.text, 'html.parser')
    # Match on the id alone so the heading is found whether the id sits on a
    # <span> inside the heading or on the heading element itself.
    cast_heading = series_soup.find(id='Cast')
    if cast_heading is None:
        return ''

    paragraphs = []
    collected_ids = set()
    # Walk forward from the heading itself (not its parent) so a wrapper that
    # contains the section's own <h2> does not end the walk immediately.
    for element in cast_heading.find_all_next():
        if element.name == 'h2':
            break
        if element.name not in ('p', 'ul', 'ol', 'dl'):
            continue
        # A list nested inside an already collected element is part of that
        # element's text; collecting it again would duplicate the text.
        if any(id(ancestor) in collected_ids for ancestor in element.parents):
            continue
        collected_ids.add(id(element))
        paragraphs.append(element.get_text(strip=True))

    return '\n'.join(paragraphs).strip()
# Load the scraped table, turn each relative '/wiki/...' link into a full
# Wikipedia URL, fetch the cast text for every series that has one, and save
# the enriched table to a new CSV.
df = pd.read_csv('kdramas_info.csv')


def _series_url_for(record):
    """Return the absolute Wikipedia URL for a row, or None without a usable link."""
    relative_link = record['Drama Link']
    if pd.isnull(relative_link):
        return None
    if not relative_link.startswith('/wiki/'):
        return None
    return f"https://en.wikipedia.org{relative_link}"


df['Series Link'] = df.apply(_series_url_for, axis=1)
df['Cast'] = ''
for row_label, series_url in df['Series Link'].items():
    # Rows without a resolvable link keep the empty default.
    if pd.isnull(series_url):
        continue
    cast_info = get_cast_info(series_url)
    # None means the page could not be fetched; leave the cell empty.
    if cast_info is None:
        continue
    # An empty result is recorded as 'N/A'.
    df.at[row_label, 'Cast'] = cast_info or 'N/A'
df.to_csv('kdramas_info_with_cast.csv', index=False, encoding='utf-8')
print("Kdramas information with cast details saved to 'kdramas_info_with_cast.csv'")


Kdramas information with cast details saved to 'kdramas_info_with_cast.csv'


In [None]:
# Adds each series' synopsis (or plot) as a new column and saves it alongside the cast data in a new csv
def get_synopsis_info(series_url):
    """Download a series page and return the text of its synopsis section.

    The element with id "Synopsis" is used when present, otherwise the one
    with id "Plot". The section is everything between that heading and the
    next <h2>; the text of each <p>, <ul>, <ol> and <dl> found there is joined
    with newlines.

    Args:
        series_url: Absolute URL of the series page.

    Returns:
        The section text (possibly an empty string), or None when the page has
        neither heading or could not be retrieved.
    """
    try:
        # A timeout keeps one stalled request from blocking the whole run.
        response = requests.get(series_url, timeout=30)
    except requests.RequestException as error:
        # Treat a network failure like a bad status code so the caller's loop
        # can carry on with the remaining series.
        print(f"Failed to retrieve the series page: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the series page. Status code: {response.status_code}")
        return None

    series_soup = BeautifulSoup(response.text, 'html.parser')

    # Match on the id alone so the heading is found whether the id sits on a
    # <span> inside the heading or on the heading element itself.
    section_heading = series_soup.find(id='Synopsis') or series_soup.find(id='Plot')
    if section_heading is None:
        return None

    paragraphs = []
    collected_ids = set()
    # Walk forward from the heading itself (not its parent) so a wrapper that
    # contains the section's own <h2> does not produce an empty section.
    for element in section_heading.find_all_next():
        if element.name == 'h2':
            break
        if element.name not in ('p', 'ul', 'ol', 'dl'):
            continue
        # A list nested inside an already collected element is part of that
        # element's text; collecting it again would duplicate the text.
        if any(id(ancestor) in collected_ids for ancestor in element.parents):
            continue
        collected_ids.add(id(element))
        paragraphs.append(element.get_text(strip=True))

    return '\n'.join(paragraphs).strip()

# Load the table that already holds the cast text, fetch the synopsis for
# every series with a link, and save the result to a new CSV.
df = pd.read_csv('kdramas_info_with_cast.csv')
df['Synopsis'] = ''
for row_label, series_url in df['Series Link'].items():
    # Rows without a link keep the empty default.
    if pd.isnull(series_url):
        continue
    synopsis_info = get_synopsis_info(series_url)
    # None means no section was found or the page could not be fetched.
    if synopsis_info is None:
        continue
    # An empty result is recorded as 'N/A'.
    df.at[row_label, 'Synopsis'] = synopsis_info or 'N/A'

df.to_csv('kdramas_info_with_cast_and_synopsis.csv', index=False, encoding='utf-8')
print("Kdramas cast and synopsis information saved to 'kdramas_info_with_cast_and_synopsis.csv'")

Kdramas cast and synopsis information saved to 'kdramas_info_with_cast_and_synopsis.csv'


In [None]:
# Builds and prints a grid table showing, for every series, whether cast and
# synopsis text was collected.

# Read the file relative to the working directory, which is where the
# synopsis cell wrote it, rather than from a hard-coded absolute Colab path.
df = pd.read_csv('kdramas_info_with_cast_and_synopsis.csv')

# Display information about series, cast, and synopsis.
# Empty cells and the 'N/A' placeholder are both read back as missing values
# by read_csv's default NA handling, so notna() is True only where text exists.
table_data = [
    [
        series_name,
        'Contains data' if pd.notna(cast_value) else 'No data collected',
        'Contains data' if pd.notna(synopsis_value) else 'No data collected',
    ]
    for series_name, cast_value, synopsis_value in zip(df['Series'], df['Cast'], df['Synopsis'])
]

headers = ['Series', 'Cast', 'Synopsis']
table = tabulate(table_data, headers=headers, tablefmt='grid')

print("\nK-Dramas Information Table:")
print(table)



K-Dramas Information Table:
+------------------------------------+---------------+---------------+
| Series                             | Cast          | Synopsis      |
| The World of the Married           | Contains data | Contains data |
+------------------------------------+---------------+---------------+
| Reborn Rich                        | Contains data | Contains data |
+------------------------------------+---------------+---------------+
| Sky Castle                         | Contains data | Contains data |
+------------------------------------+---------------+---------------+
| Crash Landing on You               | Contains data | Contains data |
+------------------------------------+---------------+---------------+
| Reply 1988                         | Contains data | Contains data |
+------------------------------------+---------------+---------------+
| Guardian: The Lonely and Great God | Contains data | Contains data |
+------------------------------------+----------