In [60]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

In [2]:
# Category listing page that get_df_available_movies() downloads and parses.
base_url = "https://www.scriptslug.com/scripts/category/disney"
# `url` is the module-level name actually read by get_df_available_movies();
# the trailing comment is a disabled path suffix, so `url` equals `base_url`.
url = base_url #+ "/wiki/Category:Disney_Transcripts"

In [97]:
def get_df_available_movies():
    """Scrape the listing page at the module-level ``url`` into a DataFrame.

    Every ``<article>`` element on the page contributes one row, taken from
    the first ``<a>`` tag inside it.

    Returns:
        pandas.DataFrame with two columns, ``link`` (the anchor's ``href``)
        and ``title`` (the anchor's ``title`` attribute minus its last 20
        characters). The columns are present even when no article is found,
        so later ``movies_df.title`` lookups do not fail on an empty result.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    records = []
    for movie in soup.find_all('article'):
        anchor = movie.find("a")
        if anchor is None:
            # Article without a link: nothing to record.
            continue

        link = anchor.get("href")
        raw_title = anchor.get("title")
        if link is None or raw_title is None:
            continue

        # The last 20 characters of the title attribute are dropped;
        # presumably a fixed suffix added by the site -- TODO confirm.
        records.append({"link": link, "title": raw_title[:-20]})

    return pd.DataFrame(records, columns=["link", "title"])

def get_scripts_pdf(movies_df, titles):
    """Download the script PDF of each requested title into ``scripts/``.

    ``movies_df`` is modified in place: the ``pdf_path`` and ``year`` columns
    are (re)set to None and then filled for every title whose PDF was saved.

    Args:
        movies_df: DataFrame with ``link`` and ``title`` columns, as built by
            get_df_available_movies().
        titles: iterable of titles to download; each must match a value of
            the ``title`` column exactly.

    Returns:
        The same ``movies_df`` object, or ``0`` as soon as a requested title
        is not found in the frame (titles after it are not processed).
    """
    import os  # local import: only needed to create the output directory

    movies_df["pdf_path"] = None
    movies_df["year"] = None
    for title in titles:
        is_requested = movies_df.title == title
        try:
            link = movies_df.loc[is_requested, "link"].values[0]
        except IndexError:
            # No row matched the title. Only this case is handled here;
            # unrelated errors are no longer swallowed by a bare except.
            print(f"Title {title} not valid. Are you sure you introduced an available one?")
            return 0

        print(link)

        # A link such as "https://host/script/<slug>" splits into
        # ["https:", "", "host", "script", "<slug>"].
        decons = link.split("/")
        name = decons[4]
        final_link = decons[0] + "//" + decons[2] + "/assets/scripts/" + name + ".pdf"
        pdf_path = f"scripts/{name}.pdf"

        page = requests.get(final_link)
        if not page.ok:
            # Do not save an HTTP error page as if it were a PDF; the row
            # keeps pdf_path=None so later steps skip it.
            print(f"Could not download {final_link} (HTTP {page.status_code}); skipping.")
            continue

        # Created lazily so nothing touches the disk unless a PDF is saved.
        os.makedirs("scripts", exist_ok=True)
        with open(pdf_path, 'wb') as f:
            f.write(page.content)

        movies_df.loc[is_requested, "pdf_path"] = pdf_path
        # The text after the slug's last dash is stored as the year.
        movies_df.loc[is_requested, "year"] = name.split("-")[-1]

    return movies_df
            
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` using pdfminer.

    Args:
        path: location of the PDF file; it is opened in binary mode.

    Returns:
        str with the text written by the TextConverter for all pages.

    The file handle, the converter device and the in-memory buffer are
    released even if pdfminer raises while processing a page.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            # An empty page-number set and maxpages=0 are passed through as in
            # the original call; presumably "no page filter / no page limit"
            # -- TODO confirm against the pdfminer docs.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)

        # Read the buffer before the finally block closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

def get_scripts_from_pdf(df_movies):
    """Fill a ``text`` column with the text extracted from each row's PDF.

    ``df_movies`` is modified in place: ``text`` is (re)set to None and then
    populated via convert_pdf_to_txt() for rows that have a ``pdf_path``.

    Args:
        df_movies: DataFrame with a ``pdf_path`` column; missing entries may
            be None or NaN.

    Returns:
        The same ``df_movies`` object.
    """
    df_movies["text"] = None
    for i, row in df_movies.iterrows():

        path = row["pdf_path"]
        print(i, path)

        # pd.notna rejects both None and NaN; a plain `is not None` check lets
        # NaN through (e.g. after the frame is reloaded from a CSV file) and
        # would pass it on to the PDF converter.
        if pd.notna(path):
            df_movies.loc[i, "text"] = convert_pdf_to_txt(path)

        print("finish")

    return df_movies

In [88]:
# Scrape the listing page once; every later cell works on this frame.
movies_df = get_df_available_movies()

In [89]:
# Show the scraped link/title table to check the titles that can be requested.
movies_df

Unnamed: 0,link,title
0,https://www.scriptslug.com/script/aladdin-1992,Aladdin
1,https://www.scriptslug.com/script/beauty-and-t...,Beauty and the Beast
2,https://www.scriptslug.com/script/black-panthe...,Black Panther
3,https://www.scriptslug.com/script/coco-2017,Coco
4,https://www.scriptslug.com/script/finding-nemo...,Finding Nemo
5,https://www.scriptslug.com/script/frozen-2013,Frozen
6,https://www.scriptslug.com/script/the-good-din...,The Good Dinosaur
7,https://www.scriptslug.com/script/hannah-monta...,"Hannah Montana: 101: Lilly, Do You Want to Kno..."
8,https://www.scriptslug.com/script/how-to-train...,How to Train Your Dragon
9,https://www.scriptslug.com/script/the-incredib...,The Incredibles


In [90]:
# Titles to download; each one has to match a value of movies_df["title"] exactly.
titles = [
    "Aladdin",
    "Coco",
    "Beauty and the Beast",
    "Finding Nemo",
    "Frozen",
    "How to Train Your Dragon",
    "The Incredibles",
    "The Lion King",
    "The Little Mermaid",
    "Mulan",
    "Ratatouille",
    "Up",
    "WALL-E",
]

# Fetch the PDFs and rebind movies_df to the frame carrying pdf_path / year.
movies_df = get_scripts_pdf(movies_df, titles)

https://www.scriptslug.com/script/aladdin-1992
https://www.scriptslug.com/script/coco-2017
https://www.scriptslug.com/script/beauty-and-the-beast-2017
https://www.scriptslug.com/script/finding-nemo-2003
https://www.scriptslug.com/script/frozen-2013
https://www.scriptslug.com/script/how-to-train-your-dragon-2010
https://www.scriptslug.com/script/the-incredibles-2004
https://www.scriptslug.com/script/the-lion-king-1994
https://www.scriptslug.com/script/the-little-mermaid-1989
https://www.scriptslug.com/script/mulan-1998
https://www.scriptslug.com/script/ratatouille-2007
https://www.scriptslug.com/script/up-2009
https://www.scriptslug.com/script/wall-e-2008


In [96]:
# Row 2 (Black Panther) is not in `titles`, so its pdf_path is expected to stay None.
movies_df.loc[2, "pdf_path"] is None

True

In [98]:
# Extract the text of every downloaded PDF into a new "text" column.
# get_scripts_from_pdf prints the row index and path, then "finish", for each row.
movies_df = get_scripts_from_pdf(movies_df)
movies_df

0 scripts/aladdin-1992.pdf
finish
1 scripts/beauty-and-the-beast-2017.pdf
finish
2 None
finish
3 scripts/coco-2017.pdf
finish
4 scripts/finding-nemo-2003.pdf
finish
5 scripts/frozen-2013.pdf
finish
6 None
finish
7 None
finish
8 scripts/how-to-train-your-dragon-2010.pdf
finish
9 scripts/the-incredibles-2004.pdf
finish
10 None
finish
11 None
finish
12 scripts/the-lion-king-1994.pdf
finish
13 scripts/the-little-mermaid-1989.pdf
finish
14 None
finish
15 None
finish
16 scripts/mulan-1998.pdf
finish
17 None
finish
18 scripts/ratatouille-2007.pdf
finish
19 None
finish
20 None
finish
21 None
finish
22 scripts/up-2009.pdf
finish
23 scripts/wall-e-2008.pdf
finish
24 None
finish
25 None
finish
26 None
finish


Unnamed: 0,link,title,pdf_path,year,text
0,https://www.scriptslug.com/script/aladdin-1992,Aladdin,scripts/aladdin-1992.pdf,1992.0,FADE IN: \n\nBEGIN TITLES: the song ARAB...
1,https://www.scriptslug.com/script/beauty-and-t...,Beauty and the Beast,scripts/beauty-and-the-beast-2017.pdf,2017.0,"""BEAUTY AND THE BEAST""\n\nStephen Chbosky and ..."
2,https://www.scriptslug.com/script/black-panthe...,Black Panther,,,
3,https://www.scriptslug.com/script/coco-2017,Coco,scripts/coco-2017.pdf,2017.0,"COCO\n\nLee Unkrich, Jason Katz, Matthew Aldri..."
4,https://www.scriptslug.com/script/finding-nemo...,Finding Nemo,scripts/finding-nemo-2003.pdf,2003.0,1'.1.NDI.NG IIBMO \n\nOrigin&!St.o:ry by \nAn...
5,https://www.scriptslug.com/script/frozen-2013,Frozen,scripts/frozen-2013.pdf,2013.0,\n \n\n \n\n \n\n \n\nFinal Shootin...
6,https://www.scriptslug.com/script/the-good-din...,The Good Dinosaur,,,
7,https://www.scriptslug.com/script/hannah-monta...,"Hannah Montana: 101: Lilly, Do You Want to Kno...",,,
8,https://www.scriptslug.com/script/how-to-train...,How to Train Your Dragon,scripts/how-to-train-your-dragon-2010.pdf,2010.0,HOW TO TRAIN YOUR DRAGON\n\nWritten by\n\nDean...
9,https://www.scriptslug.com/script/the-incredib...,The Incredibles,scripts/the-incredibles-2004.pdf,2004.0,"FOR YOUR CONSIDERATION \n\n""'ORIGINAL SCREE..."


In [107]:
# Persist the result. The file is tab-separated despite the .csv extension,
# so it has to be read back with sep="\t".
# NOTE(review): index=None is presumably treated like index=False (no index column) -- confirm.
movies_df.to_csv("trial.csv", index=None, sep="\t")