In [1]:
import scrapy
import requests
import re
import os
import sys
import codecs
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
from progress.bar import Bar
from selenium import webdriver

In [3]:
def get_all_movies():
    """Fetch the IMSDb "all scripts" index page and list the movies on it.

    Returns
    -------
    list of tuple
        One ``(title, href, file_title)`` triple per usable link, where
        ``title`` is the link text, ``href`` is the link target exactly as
        written in the page, and ``file_title`` is the part of the title
        before the first ``,`` or ``.`` with spaces replaced by underscores.

    Raises
    ------
    requests.HTTPError
        If the index page is answered with an HTTP error status.
    IndexError
        If the page has fewer than three top-aligned ``<td>`` cells.
    """
    # Parse the page with beautiful soup
    link_all_scripts = 'https://imsdb.com/all-scripts.html'
    response_all_scripts = requests.get(link_all_scripts)
    # Fail with a clear HTTP error instead of parsing an error page.
    response_all_scripts.raise_for_status()
    soup = BeautifulSoup(response_all_scripts.text, 'html.parser')

    # This webpage is constructed with tables; the movie links live in the
    # third top-aligned cell.
    find_tables = soup.find_all('td', valign='top')
    # href=True skips anchors that have no target at all.
    all_movies = find_tables[2].find_all('a', href=True)

    movies = []
    for movie_info in all_movies:
        # get_text() always returns a str, whereas .string is None when the
        # anchor does not hold exactly one text node.
        title = movie_info.get_text()
        if not title.strip():
            continue
        file_title = re.split("[,.]", title)[0].replace(' ', '_')
        movies.append((title, movie_info["href"], file_title))
    return movies

In [6]:
def check_movie_info(movies):
    """Report whether every movie link starts with ``/Movie Scripts/``.

    Parameters
    ----------
    movies : iterable
        Items whose element at index 1 is the link to check.

    Returns
    -------
    str
        A human-readable verdict: one message when every link has the
        expected prefix (also for an empty input), another as soon as one
        link does not.
    """
    expected_prefix = '/Movie Scripts/'
    prefix_length = len(expected_prefix)

    # any() stops at the first offending link, like an early return would.
    has_bad_link = any(
        entry[1][0:prefix_length] != expected_prefix for entry in movies
    )
    if has_bad_link:
        return 'One of the movie link does not start with /Movie Scripts/.'
    return 'All movie URLs have a correct format.'

In [7]:
def _append_line(path, line):
    """Append ``line`` followed by a newline to the text file at ``path``."""
    with open(path, 'a') as log_file:
        log_file.write(line + '\n')


def handle_movie(movie, browser):
    """Scrape one movie: collect its metadata and save its script text.

    Parameters
    ----------
    movie : tuple
        ``(title, link_to_movie_page, movie_title)``: the display title, the
        site-relative link to the movie's information page, and the title
        used to build the output file name.
    browser : selenium webdriver
        Used to load the script page; only ``get`` and ``page_source`` are
        called on it.

    Returns
    -------
    None
        The result is recorded on disk under ``../data/scraping/``:

        * no script link, or one that does not end in ``.html`` -> the title
          is appended to ``movies_pdf_script.csv``;
        * information page without a centered table, script page without
          exactly one ``td.scrtext`` cell, or a "404 Not Found" page -> the
          title is appended to ``scraping_error.csv``;
        * otherwise the script text is written to ``texts/<name>.txt``
          (ASCII, other characters dropped) and a ``;``-separated row is
          appended to ``successful_files.csv``.
    """
    # Unpack tuple
    title, link_to_movie_page, movie_title = movie

    scraping_dir = '../data/scraping/'
    error_log = scraping_dir + 'scraping_error.csv'

    # Interrogate the page with all the movie information (ratings, writer,
    # genre, link to script)
    full_html_link = u'http://www.imsdb.com' + link_to_movie_page
    response_script = requests.get(full_html_link)
    soup = BeautifulSoup(response_script.text, 'html.parser')

    # An information page without the expected table is logged as a scraping
    # error rather than aborting the whole run with an IndexError.
    info_tables = soup.find_all('table', align="center")
    if not info_tables:
        _append_line(error_log, title)
        return

    # Get all relevant information (genre, writer, script) from page
    genre = []
    writer = []
    script = ''
    for link in info_tables[0].find_all('a'):
        # Anchors without an href carry none of the information we need.
        href = link.get('href')
        if not href:
            continue
        if href.startswith("/writer"):
            writer.append(link.get_text())
        if href.startswith("/genre/"):
            # Each genre is followed by a "//" entry; kept so that new rows
            # have the same shape as rows already written to the CSV.
            genre.append(link.get_text())
            genre.append("//")
        if href.startswith("/scripts/"):
            script = href

    # If there is no script link, or it does not point to an html page
    # (e.g. a PDF), skip this movie but log it in `movies_pdf_script.csv`
    if not script.endswith('.html'):
        _append_line(scraping_dir + 'movies_pdf_script.csv', title)
        return

    # Parse the webpage which contains the script text
    full_script_url = u'http://www.imsdb.com' + script
    browser.get(full_script_url)
    soup = BeautifulSoup(browser.page_source, 'html.parser')

    # If the scraping does not go as planned (unexpected structure or a 404
    # page), log the title in the error file. The page title has to be read
    # as text: comparing the result of find_all() to a string is never true.
    script_cells = soup.find_all('td', "scrtext")
    page_title = soup.title.get_text(strip=True) if soup.title else ''
    if len(script_cells) != 1 or page_title == "404 Not Found":
        _append_line(error_log, title)
        return

    # Write the script text to a file. A "/" in the title would otherwise be
    # read as a directory separator and make the open() fail.
    filename = scraping_dir + 'texts/' + movie_title.replace('/', '_') + '.txt'
    with codecs.open(filename, "w",
                     encoding='ascii', errors='ignore') as f:
        f.write(script_cells[0].get_text())

    # Add the meta-information to a CSV file
    new_row = title + ';' + str(genre) + ';' + str(writer) + ';' \
            + movie_title + ';' + filename
    _append_line(scraping_dir + 'successful_files.csv', new_row)

In [15]:
if __name__ == '__main__':

    # Create the data/scraping/texts folders, parent first, announcing each
    # one that had to be made
    for folder in ('../data', '../data/scraping', '../data/scraping/texts'):
        if not os.path.exists(folder):
            os.mkdir(folder)
            print ('making ' + folder + ' folder')

    # List all the available movies, and the corresponding URL links
    movies = get_all_movies()
    print (check_movie_info(movies))

    # Index of the first movie to process; movies before it are skipped.
    start_index = 1179
    pending_movies = movies[start_index:]

    # Write all the scripts (in texts folder) and the summary of the movies
    # in .csv format (in scraping folder)
    browser = webdriver.Safari()
    try:
        # tqdm advances once per movie, so progress is actually displayed.
        for movie in tqdm(pending_movies, desc='Scraping scripts'):
            handle_movie(movie, browser)
    finally:
        # Always end the browser session, even when a movie fails.
        browser.quit()

# Stopping points: [:822], [823:1167], [1168:1178], [1179:]
    
        
        

All movie URLs have a correct format.
