# **Web Scraping Using BeautifulSoup**

# Scraping the Content of a Single Page

In [None]:
import requests
from bs4 import BeautifulSoup
import os

# Page to scrape: the site's movie section plus the slug of one movie.
root = "https://subslikescript.com/movie"
web = f"{root}/Titanic-120338"

# Seconds to wait for the server. requests applies no timeout by default, so
# without this a stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 30

try:
    # Fetch the page and turn HTTP error statuses (404, 500, ...) into an
    # exception instead of parsing an error page that has no script in it.
    response = requests.get(web, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    res = response.text
    soup = BeautifulSoup(res, "lxml")

    # The title and transcript are looked up inside <article class="main-article">.
    article = soup.find("article", class_="main-article")
    if article is None:
        raise ValueError(f"No main article found at {web}")

    heading = article.find("h1")
    script_div = article.find("div", class_="full-script")
    if heading is None or script_div is None:
        raise ValueError(f"Title or script section missing at {web}")

    # Strip surrounding whitespace so it does not end up in the filename.
    title = heading.text.strip()
    script = script_div.get_text(strip=True, separator=' ')

    # Keep letters, digits and whitespace; replace everything else with "_"
    # so the title is safe to use as a filename.
    sanitized_title = ''.join(c if c.isalnum() or c.isspace() else '_' for c in title)

    # Write the transcript next to the notebook as "<sanitized title>.txt".
    filename = f"{sanitized_title}.txt"
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(script)

    print(f"Successfully saved script for: {title}")

except Exception as e:
    # Best-effort cell: report the failure instead of showing a traceback.
    print(f"Error occurred: {e}")


Successfully saved script for: Titanic (1997) - full transcript


# Scraping the Content of Every Movie Linked from One Listing Page

In [None]:
import requests
from bs4 import BeautifulSoup
import os

# Site root; the hrefs collected from the listing page are appended to it.
root = "https://subslikescript.com"

# Seconds to wait for the server. requests applies no timeout by default, so
# without this a stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 30

# Fetch the first listing page for the letter A and fail on HTTP error statuses.
response = requests.get("https://subslikescript.com/movies_letter-A?page=1", timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")

# Every <a href=...> inside the main article is treated as a movie link.
article = soup.find("article", class_="main-article")
if article is None:
    raise RuntimeError("Listing page has no main article; the page layout may have changed")
links = [anchor['href'] for anchor in article.find_all('a', href=True)]

# Visit each linked page and save its transcript to a text file.
for link in links:
    try:
        # Full URL of the movie page.
        web = f"{root}{link}"

        # Separate names for the movie page keep the listing-page objects
        # above intact while the loop runs.
        movie_response = requests.get(web, timeout=REQUEST_TIMEOUT)
        movie_response.raise_for_status()
        movie_soup = BeautifulSoup(movie_response.text, "lxml")

        # Extract the title and the transcript text.
        movie_article = movie_soup.find("article", class_="main-article")
        title = movie_article.find("h1").text.strip()
        script = movie_article.find("div", class_="full-script").get_text(strip=True, separator=' ')

        # Keep letters, digits and whitespace; replace everything else with "_"
        # so the title is safe to use as a filename.
        sanitized_title = ''.join(c if c.isalnum() or c.isspace() else '_' for c in title)

        # Write the transcript as "<sanitized title>.txt".
        filename = f"{sanitized_title}.txt"
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(script)

        print(f"Successfully saved script for: {title}")

    except Exception as e:
        # One bad page must not abort the whole crawl; report it and move on.
        print(f"Error processing {link}: {e}")


Successfully saved script for: A ¥1,000,000,000,000,000 Ransom (2015) - full transcript
Successfully saved script for: A 2nd Chance (2011) - full transcript
Successfully saved script for: A Aa (2016) - full transcript
Successfully saved script for: A Baby at any Cost (2022) - full transcript
Successfully saved script for: A Babysitter's Guide to Monster Hunting (2020) - full transcript
Successfully saved script for: A Bad Son (1980) - full transcript
Successfully saved script for: A Bag of Marbles (1975) - full transcript
Successfully saved script for: A Balloon for Allah (2011) - full transcript
Successfully saved script for: A Banana? At This Time of Night? (2018) - full transcript
Successfully saved script for: A Banquet (2021) - full transcript
Successfully saved script for: A Barefoot Dream (2010) - full transcript
Successfully saved script for: A Bear Named Winnie (2004) - full transcript
Successfully saved script for: A Beautiful Curse (2021) - full transcript
Successfully saved

# **Scraping the Content of Every Movie Across Paginated Listing Pages**

In [None]:
import requests
from bs4 import BeautifulSoup
import os

# Site root and the first listing page for the letter A.
root = "https://subslikescript.com"
web = f"{root}/movies_letter-A?page=1"

# Upper bound on listing pages to crawl, to keep the run short.
MAX_PAGES = 2
# Seconds to wait for the server. requests applies no timeout by default, so
# without this a stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 30

# Fetch the first listing page to discover how many pages exist.
response = requests.get(web, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")

# The second-to-last pagination item is read as the last page number.
# Fall back to a single page when there is no pagination or it has fewer
# than two items.
last_page = 1
pagination = soup.find('ul', class_='pagination')
if pagination is not None:
    pages = pagination.find_all('li', class_='page-item')
    if len(pages) >= 2:
        last_page = pages[-2].text

# Crawl at most MAX_PAGES pages, and never more pages than the site reports.
for page in range(1, min(MAX_PAGES, int(last_page)) + 1):
    web = f"{root}/movies_letter-A?page={page}"
    response = requests.get(web, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # Every <a href=...> inside the main article is treated as a movie link.
    article = soup.find("article", class_="main-article")
    if article is None:
        print(f"No main article found on {web}; skipping page")
        continue
    linkss = article.find_all('a', href=True)

    # Visit each linked page and save its transcript to a text file.
    for link in linkss:
        try:
            movie_url = root + link['href']

            # Separate names for the movie page keep the listing-page objects
            # above intact while the loop runs.
            movie_response = requests.get(movie_url, timeout=REQUEST_TIMEOUT)
            movie_response.raise_for_status()
            movie_soup = BeautifulSoup(movie_response.text, "lxml")

            # Extract the title and the transcript text.
            movie_article = movie_soup.find("article", class_="main-article")
            title = movie_article.find("h1").text.strip()

            script_div = movie_article.find("div", class_="full-script")
            if script_div is None:
                print(f"Script not found for {title}")
                continue
            script = script_div.get_text(strip=True, separator=' ')

            # Titles can contain characters that are not valid in filenames
            # (e.g. "?"), so keep letters, digits and whitespace and replace
            # everything else with "_", as the other cells do.
            sanitized_title = ''.join(c if c.isalnum() or c.isspace() else '_' for c in title)

            # Write the transcript as "<sanitized title>.txt".
            filename = f"{sanitized_title}.txt"
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(script)

            print(f"Successfully saved script for: {title}")
        except Exception as e:
            # One bad page must not abort the whole crawl; report it and move on.
            print(f"Error processing {link['href']}: {e}")


Successfully saved script for: A ¥1,000,000,000,000,000 Ransom (2015) - full transcript
Successfully saved script for: A 2nd Chance (2011) - full transcript
Successfully saved script for: A Aa (2016) - full transcript
Successfully saved script for: A Baby at any Cost (2022) - full transcript
Successfully saved script for: A Babysitter's Guide to Monster Hunting (2020) - full transcript
Successfully saved script for: A Bad Son (1980) - full transcript
Successfully saved script for: A Bag of Marbles (1975) - full transcript
Successfully saved script for: A Balloon for Allah (2011) - full transcript
Successfully saved script for: A Banana? At This Time of Night? (2018) - full transcript
Successfully saved script for: A Banquet (2021) - full transcript
Successfully saved script for: A Barefoot Dream (2010) - full transcript
Successfully saved script for: A Bear Named Winnie (2004) - full transcript
Successfully saved script for: A Beautiful Curse (2021) - full transcript
Successfully saved

# **Scraping the Whole Website**

In [None]:
import requests
from bs4 import BeautifulSoup
import os

# Site root; it does not change between letters, so it is defined once.
root = "https://subslikescript.com"

# Letters whose listings are crawled (limited to A and B to keep the run short).
LETTERS = "AB"
# Upper bound on listing pages to crawl per letter.
MAX_PAGES = 2
# Seconds to wait for the server. requests applies no timeout by default, so
# without this a stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 30

for letter in LETTERS:
    web = f"{root}/movies_letter-{letter}?page=1"

    # Fetch the first listing page to discover how many pages exist.
    response = requests.get(web, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # The second-to-last pagination item is read as the last page number.
    # Fall back to a single page when there is no pagination or it has fewer
    # than two items (indexing [-2] would raise IndexError in that case).
    last_page = 1
    pagination = soup.find('ul', class_='pagination')
    if pagination is not None:
        pages = pagination.find_all('li', class_='page-item')
        if len(pages) >= 2:
            last_page = pages[-2].text

    # Crawl at most MAX_PAGES pages, and never more pages than the site reports.
    for page in range(1, min(MAX_PAGES, int(last_page)) + 1):
        web = f"{root}/movies_letter-{letter}?page={page}"
        response = requests.get(web, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        # Every <a href=...> inside the main article is treated as a movie link.
        article = soup.find("article", class_="main-article")
        if article is None:
            print(f"No main article found on {web}; skipping page")
            continue
        linkss = article.find_all('a', href=True)

        # Visit each linked page and save its transcript to a text file.
        for link in linkss:
            try:
                movie_url = root + link['href']

                # Separate names for the movie page keep the listing-page
                # objects above intact while the loop runs.
                movie_response = requests.get(movie_url, timeout=REQUEST_TIMEOUT)
                movie_response.raise_for_status()
                movie_soup = BeautifulSoup(movie_response.text, "lxml")

                # Extract the title.
                movie_article = movie_soup.find("article", class_="main-article")
                title = movie_article.find("h1").text.strip()

                # Some pages may have no transcript section; report and skip them.
                script_div = movie_article.find("div", class_="full-script")
                if script_div:
                    script = script_div.get_text(strip=True, separator=' ')

                    # Keep letters, digits and whitespace; replace everything
                    # else with "_" so the title is safe to use as a filename.
                    sanitized_title = ''.join(c if c.isalnum() or c.isspace() else '_' for c in title)

                    # Write the transcript as "<sanitized title>.txt".
                    filename = f"{sanitized_title}.txt"
                    with open(filename, 'w', encoding='utf-8') as file:
                        file.write(script)

                    print(f"Successfully saved script for: {title}")
                else:
                    print(f"Script not found for {title}")
            except Exception as e:
                # One bad page must not abort the whole crawl; report it and move on.
                print(f"Error processing {link['href']}: {e}")


Successfully saved script for: A ¥1,000,000,000,000,000 Ransom (2015) - full transcript
Successfully saved script for: A 2nd Chance (2011) - full transcript
Successfully saved script for: A Aa (2016) - full transcript
Successfully saved script for: A Baby at any Cost (2022) - full transcript
Successfully saved script for: A Babysitter's Guide to Monster Hunting (2020) - full transcript
Successfully saved script for: A Bad Son (1980) - full transcript
Successfully saved script for: A Bag of Marbles (1975) - full transcript
Successfully saved script for: A Balloon for Allah (2011) - full transcript
Successfully saved script for: A Banana? At This Time of Night? (2018) - full transcript
Successfully saved script for: A Banquet (2021) - full transcript
Successfully saved script for: A Barefoot Dream (2010) - full transcript
Successfully saved script for: A Bear Named Winnie (2004) - full transcript
Successfully saved script for: A Beautiful Curse (2021) - full transcript
Successfully saved

# **Web Scraping Using Selenium**

In [None]:
import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Optional explicit location of the Chrome binary. Only needed when Chrome is
# installed somewhere non-standard; change it to match your system.
CHROME_BINARY = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

# Setup Chrome options
chrome_options = Options()

# Apply the explicit path only when it exists on this machine. Forcing a path
# that does not exist makes ChromeDriver fail with "no chrome binary at ...";
# leaving it unset lets the driver look for Chrome itself.
if os.path.isfile(CHROME_BINARY):
    chrome_options.binary_location = CHROME_BINARY

# Download (or reuse a cached) ChromeDriver via webdriver_manager.
service = Service(ChromeDriverManager().install())

# Create the Chrome WebDriver instance.
driver = webdriver.Chrome(service=service, options=chrome_options)
try:
    # Open a website
    driver.get("https://www.example.com")
finally:
    # Always close the browser, even if navigation fails, so no browser
    # process is left running.
    driver.quit()


WebDriverException: Message: unknown error: no chrome binary at C:\Program Files\Google\Chrome\Application\chrome.exe
Stacktrace:
#0 0x57f510fa54e3 <unknown>
#1 0x57f510cd4c76 <unknown>
#2 0x57f510cfb5e0 <unknown>
#3 0x57f510cfa029 <unknown>
#4 0x57f510d38ccc <unknown>
#5 0x57f510d3847f <unknown>
#6 0x57f510d2fde3 <unknown>
#7 0x57f510d052dd <unknown>
#8 0x57f510d0634e <unknown>
#9 0x57f510f653e4 <unknown>
#10 0x57f510f693d7 <unknown>
#11 0x57f510f73b20 <unknown>
#12 0x57f510f6a023 <unknown>
#13 0x57f510f381aa <unknown>
#14 0x57f510f8e6b8 <unknown>
#15 0x57f510f8e847 <unknown>
#16 0x57f510f9e243 <unknown>
#17 0x7a0c77923ac3 <unknown>


In [None]:
import shutil

from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager

# Create an instance of Options
edge_options = Options()

# Executable names to look for on PATH. "msedge" is the name the original cell
# searched for; the "microsoft-edge*" names are what Linux packages of Edge
# typically install (verify on your system).
EDGE_EXECUTABLES = ("msedge", "microsoft-edge", "microsoft-edge-stable")

# Use the first executable found on PATH. If none is found, leave
# binary_location unset instead of raising: Edge is often installed without
# being on PATH, and the driver can then look in its default locations.
# For a non-standard install, set edge_options.binary_location to the full
# path of the Edge binary by hand.
edge_path = next((path for path in map(shutil.which, EDGE_EXECUTABLES) if path), None)
if edge_path:
    edge_options.binary_location = edge_path

# Download (or reuse a cached) Edge driver via webdriver_manager.
service = Service(EdgeChromiumDriverManager().install())

# Initialize WebDriver using the service and options
driver = webdriver.Edge(service=service, options=edge_options)
try:
    # Navigate to the website
    driver.get("https://www.adamchoi.co.uk/overs/detailed")
finally:
    # Always close the browser, even if navigation fails, so no browser
    # process is left running.
    driver.quit()

Exception: Microsoft Edge not found in your system's PATH. Please install it or provide the correct binary path.

In [None]:
!pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.0.1 webdriver_manager-4.0.2
