## Assignment plan:
- Crawl the whole Wiki web page - `Done`;
- Crawl initial all links that lead to each films's individual web page - `Done`;
- Crawl the data for each film using Scrapy Spider `films_spider.py` as JSON file `films_v2_res.json` - `Done`;
- Clean the data and save it in an appropriate format;
- Create a DB for storing all files;
...

In [47]:
# Import necessary dependencies
import scrapy
from urllib.parse import urljoin
from lxml import etree
import pandas as pd

In [46]:
# Primary information source link
web_link = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"
main_link = "https://en.wikipedia.org"

In [4]:
# Start a project with Scrapy
#! scrapy startproject wiki_parsing

### Get the links for each film

In [44]:
with open("/home/viper/Data_science/DWV/assignments/assignment_1/wiki_parsing/quotes-wiki.html", "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML content
tree = etree.HTML(html_content)

table = tree.xpath('//table[contains(@class, "wikitable plainrowheaders")]')[1] # We need the second table that is sorted by year
rows = table.xpath('.//tr')


for row in rows:
    cells = row.xpath('.//td//i//a//@href')
    print(cells)

[]
['/wiki/The_Birth_of_a_Nation']
['/wiki/Intolerance_(film)']
['/wiki/Cleopatra_(1917_film)']
['/wiki/Mickey_(1918_film)']
['/wiki/The_Miracle_Man_(1919_film)']
['/wiki/Way_Down_East']
['/wiki/The_Four_Horsemen_of_the_Apocalypse_(film)']
['/wiki/Douglas_Fairbanks_in_Robin_Hood']
['/wiki/The_Covered_Wagon']
['/wiki/The_Sea_Hawk_(1924_film)']
['/wiki/The_Big_Parade']
['/wiki/Ben-Hur_(1925_film)']
['/wiki/For_Heaven%27s_Sake_(1926_film)']
['/wiki/Wings_(1927_film)']
['/wiki/The_Singing_Fool']
['/wiki/The_Broadway_Melody']
['/wiki/Sunny_Side_Up_(1929_film)']
['/wiki/All_Quiet_on_the_Western_Front_(1930_film)']
['/wiki/Frankenstein_(1931_film)']
['/wiki/City_Lights']
['/wiki/The_Sign_of_the_Cross_(1932_film)']
['/wiki/King_Kong_(1933_film)']
['/wiki/I%27m_No_Angel']
['/wiki/Cavalcade_(1933_film)']
['/wiki/She_Done_Him_Wrong']
['/wiki/The_Merry_Widow_(1934_film)']
['/wiki/It_Happened_One_Night']
['/wiki/Mutiny_on_the_Bounty_(1935_film)']
['/wiki/San_Francisco_(1936_film)']
['/wiki/Snow_Whi

In [61]:
len(rows)

152

### Create a spider to parse all the main information about each film (Saved as `films_spider.py`)

In [None]:
class FilmsSpider(scrapy.Spider):
    name = "films_v2"
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"
    ]

    def parse(self, response):
        # Extract all film links from the second table with films by year
        table = response.xpath('//table[contains(@class, "wikitable plainrowheaders")]')[1]
        rows = table.xpath(".//tr")
        film_links = []
        for row in rows:
            res = row.xpath('.//td//i//a//@href').getall()
            if len(res) == 0:
                continue
            else:
                film_links.append(res[0])

        # Get data from each film link
        for link in film_links:
            absolute_url = urljoin("https://en.wikipedia.org", link)
            yield scrapy.Request(url=absolute_url, callback=self.parse_film_page)

    def parse_film_page(self, response):
        info_table = response.xpath('//table[contains(@class, "infobox")]')
        data = {
            "title": info_table.xpath('.//th[contains(@class, "infobox-above")]//text()').get(),
            "directed_by": info_table.xpath('.//th[contains(text(), "Directed by")]/following-sibling::td//text()').getall(),
            "screenplay_by": info_table.xpath('.//th[contains(text(), "Screenplay by")]/following-sibling::td//text()').getall(),
            "based_on": info_table.xpath('.//th[contains(text(), "Based on")]/following-sibling::td//text()').getall(),
            "produced_by": info_table.xpath('.//th[contains(text(), "Produced by")]/following-sibling::td//text()').getall(),
            "starring": info_table.xpath('.//th[contains(text(), "Starring")]/following-sibling::td//text()').getall(),
            "release_date": info_table.xpath('.//th[contains(text(), "Release date")]/following-sibling::td//text()').get(),
            "country": info_table.xpath('.//th[contains(text(), "Country")]/following-sibling::td//text()').getall(),
            "budget": info_table.xpath('.//th[contains(text(), "Budget")]/following-sibling::td//text()').getall(),
            "box_office": info_table.xpath('.//th[contains(text(), "Box office")]/following-sibling::td//text()').getall(),
        }

        yield data

### Read the data from JSON file and clean it

In [52]:
df = pd.read_json("wiki_parsing/films_v2_res.json")
df

Unnamed: 0,title,directed_by,screenplay_by,based_on,produced_by,starring,release_date,country,budget,box_office
0,The Birth of a Nation,[D. W. Griffith],"[.mw-parser-output .plainlist ol,.mw-parser-ou...","[The Clansman, by , Thomas Dixon Jr.]","[D. W. Griffith, Harry Aitken, [, 1, ]]","[\n, Lillian Gish, \n, Mae Marsh, \n, Henry B....",[],[United States],"[$100,000+, [, 3, ]]","[$50–100 million, [, 4, ]]"
1,The Greatest Show on Earth,[Cecil B. DeMille],"[.mw-parser-output .plainlist ol,.mw-parser-ou...",[],[Cecil B. DeMille],"[Betty Hutton, Cornel Wilde, Charlton Heston, ...",[],[United States],"[$4 million, [, 1, ]]","[$36 million, [, 1, ]]"
2,This Is Cinerama,"[.mw-parser-output .plainlist ol,.mw-parser-ou...",[],[],"[Robert L. Bendick, Merian C. Cooper]",[Lowell Thomas],[],[United States],[$1 million],"[$41.6 million, [, 2, ]]"
3,Cinerama Holiday,"[Robert L. Bendick, Philippe De Lacy]",[],[],[],"[Fred Troller, , Beatrice Troller , John Mars...",[],[United States],"[$1.5 million, [, 1, ]]","[$29.6 million , [, 2, ]]"
4,The Robe,[Henry Koster],"[.mw-parser-output .plainlist ol,.mw-parser-ou...","[The Robe, by , Lloyd C. Douglas]",[Frank Ross],"[\n, Richard Burton, \n, Jean Simmons, \n, Vic...",[],[United States],"[$4.1 million, [, 1, ], – $4.6 million, [, 2, ]]","[$36 million (United States), [, 3, ]]"
...,...,...,...,...,...,...,...,...,...,...
145,Cleopatra,[J. Gordon Edwards],[Adrian Johnson],"[Cleopatra, 1889 novel, by , H. Rider Haggard]",[William Fox],"[Theda Bara, Fritz Leiber Sr., Thurston Hall]",[],[United States],"[$250,000–500,000, [, 1, ]]","[$1 million, [, 1, ]]"
146,The Miracle Man,[George Loane Tucker],[],"[The Miracle Man, (play), by , George M. Cohan]",[George Loane Tucker],"[Thomas Meighan, Betty Compson, Lon Chaney, Jo...",[],[United States],"[$120,000]","[$1 million (rentals), [, 1, ], or $2 million..."
147,The Four Horsemen of the Apocalypse,[Rex Ingram],[June Mathis],"[The Four Horsemen of the Apocalypse, 1916 nov...",[Rex Ingram],"[Pomeroy Cannon, Josef Swickard, Bridgetta Cla...",[],[United States],"[$800,000 or $1 million, [, 1, ]]","[$9.2 million, [, 2, ], or $4 million (world ..."
148,Mickey,"[F. Richard Jones, James Young]",[],[],"[Mabel Normand, Mack Sennett]",[Mabel Normand],[],[United States],"[$250,000]","[$16,450,000]"


In [59]:
df.loc[0, 'screenplay_by']

['.mw-parser-output .plainlist ol,.mw-parser-output .plainlist ul{line-height:inherit;list-style:none;margin:0;padding:0}.mw-parser-output .plainlist ol li,.mw-parser-output .plainlist ul li{margin-bottom:0}',
 'D. W. Griffith',
 'Frank E. Woods']

In [62]:
df.loc[:, 'release_date']

0      []
1      []
2      []
3      []
4      []
       ..
145    []
146    []
147    []
148    []
149    []
Name: release_date, Length: 150, dtype: object

As you can see, the most major problem right now - absense of the release date. It may happen due to 2 possible options as:
- Release date - if the film has one release date;
- Release dates - if the film has multiple release dates.

However, we can still parse this data from the inital table.

In [73]:
table = tree.xpath('//table[contains(@class, "wikitable plainrowheaders")]')[1] # We need the second table that is sorted by year
rows = table.xpath('.//tr')

release_years = []
for row in rows:
    cells = row.xpath('.//th//a//text()')
    release_years.append(cells)
    print(cells)

[]
['1915']
['1916']
['1917']
['1918']
['1919']
['1920']
['1921']
['1922']
['1923']
['1924']
['1925']
[]
['1926']
['1927']
['1928']
['1929']
[]
['1930']
['1931']
[]
['1932']
['1933']
[]
[]
[]
['1934']
[]
['1935']
['1936']
['1937']
['1938']
['1939']
['1940']
[]
['1941']
['1942']
[]
['1943']
[]
['1944']
['1945']
[]
['1946']
[]
[]
['1947']
[]
['1948']
[]
[]
['1949']
['1950']
[]
['1951']
['1952']
[]
['1953']
[]
['1954']
[]
[]
['1955']
[]
[]
['1956']
['1957']
['1958']
['1959']
['1960']
[]
[]
['1961']
[]
['1962']
[]
[]
['1963']
[]
['1964']
[]
[]
['1965']
['1966']
[]
[]
['1967']
[]
['1968']
[]
['1969']
['1970']
['1971']
[]
[]
['1972']
['1973']
[]
['1974']
['1975']
['1976']
['1977']
['1978']
['1979']
[]
['1980']
['1981']
['1982']
['1983']
['1984']
['1985']
['1986']
['1987']
['1988']
['1989']
['1990']
['1991']
['1992']
['1993']
['1994']
['1995']
[]
['1996']
['1997']
['1998']
['1999']
['2000']
['2001']
['2002']
['2003']
['2004']
['2005']
['2006']
['2007']
['2008']
['2009']
['2010']
['2011']
['20

In [None]:
# Delete the first empty element
release_years = release_years[1:]

As you can see from the list (and website itself), empty elements means that the previous year was used for the film with empty release date. Let's fix it.

In [75]:
last_release = None
for release_year_idx in range(len(release_years)):
    if len(release_years[release_year_idx]):
        last_release = release_years[release_year_idx]
    else:
        release_years[release_year_idx] = last_release
release_years

[['1915'],
 ['1916'],
 ['1917'],
 ['1918'],
 ['1919'],
 ['1920'],
 ['1921'],
 ['1922'],
 ['1923'],
 ['1924'],
 ['1925'],
 ['1925'],
 ['1926'],
 ['1927'],
 ['1928'],
 ['1929'],
 ['1929'],
 ['1930'],
 ['1931'],
 ['1931'],
 ['1932'],
 ['1933'],
 ['1933'],
 ['1933'],
 ['1933'],
 ['1934'],
 ['1934'],
 ['1935'],
 ['1936'],
 ['1937'],
 ['1938'],
 ['1939'],
 ['1940'],
 ['1940'],
 ['1941'],
 ['1942'],
 ['1942'],
 ['1943'],
 ['1943'],
 ['1944'],
 ['1945'],
 ['1945'],
 ['1946'],
 ['1946'],
 ['1946'],
 ['1947'],
 ['1947'],
 ['1948'],
 ['1948'],
 ['1948'],
 ['1949'],
 ['1950'],
 ['1950'],
 ['1951'],
 ['1952'],
 ['1952'],
 ['1953'],
 ['1953'],
 ['1954'],
 ['1954'],
 ['1954'],
 ['1955'],
 ['1955'],
 ['1955'],
 ['1956'],
 ['1957'],
 ['1958'],
 ['1959'],
 ['1960'],
 ['1960'],
 ['1960'],
 ['1961'],
 ['1961'],
 ['1962'],
 ['1962'],
 ['1962'],
 ['1963'],
 ['1963'],
 ['1964'],
 ['1964'],
 ['1964'],
 ['1965'],
 ['1966'],
 ['1966'],
 ['1966'],
 ['1967'],
 ['1967'],
 ['1968'],
 ['1968'],
 ['1969'],
 ['1970'],

In [79]:
# Remove "Ne Zha 2" from the last element since we have no information about this film
release_years = release_years[:-1]
release_years

[['1915'],
 ['1916'],
 ['1917'],
 ['1918'],
 ['1919'],
 ['1920'],
 ['1921'],
 ['1922'],
 ['1923'],
 ['1924'],
 ['1925'],
 ['1925'],
 ['1926'],
 ['1927'],
 ['1928'],
 ['1929'],
 ['1929'],
 ['1930'],
 ['1931'],
 ['1931'],
 ['1932'],
 ['1933'],
 ['1933'],
 ['1933'],
 ['1933'],
 ['1934'],
 ['1934'],
 ['1935'],
 ['1936'],
 ['1937'],
 ['1938'],
 ['1939'],
 ['1940'],
 ['1940'],
 ['1941'],
 ['1942'],
 ['1942'],
 ['1943'],
 ['1943'],
 ['1944'],
 ['1945'],
 ['1945'],
 ['1946'],
 ['1946'],
 ['1946'],
 ['1947'],
 ['1947'],
 ['1948'],
 ['1948'],
 ['1948'],
 ['1949'],
 ['1950'],
 ['1950'],
 ['1951'],
 ['1952'],
 ['1952'],
 ['1953'],
 ['1953'],
 ['1954'],
 ['1954'],
 ['1954'],
 ['1955'],
 ['1955'],
 ['1955'],
 ['1956'],
 ['1957'],
 ['1958'],
 ['1959'],
 ['1960'],
 ['1960'],
 ['1960'],
 ['1961'],
 ['1961'],
 ['1962'],
 ['1962'],
 ['1962'],
 ['1963'],
 ['1963'],
 ['1964'],
 ['1964'],
 ['1964'],
 ['1965'],
 ['1966'],
 ['1966'],
 ['1966'],
 ['1967'],
 ['1967'],
 ['1968'],
 ['1968'],
 ['1969'],
 ['1970'],

In [81]:
assert len(release_years) == df.shape[0], (
    "Lengths must match."
)

In [82]:
# Add the release years to our df
df['release_date'] = release_years
df.head()

Unnamed: 0,title,directed_by,screenplay_by,based_on,produced_by,starring,release_date,country,budget,box_office
0,The Birth of a Nation,[D. W. Griffith],"[.mw-parser-output .plainlist ol,.mw-parser-ou...","[The Clansman, by , Thomas Dixon Jr.]","[D. W. Griffith, Harry Aitken, [, 1, ]]","[\n, Lillian Gish, \n, Mae Marsh, \n, Henry B....",[1915],[United States],"[$100,000+, [, 3, ]]","[$50–100 million, [, 4, ]]"
1,The Greatest Show on Earth,[Cecil B. DeMille],"[.mw-parser-output .plainlist ol,.mw-parser-ou...",[],[Cecil B. DeMille],"[Betty Hutton, Cornel Wilde, Charlton Heston, ...",[1916],[United States],"[$4 million, [, 1, ]]","[$36 million, [, 1, ]]"
2,This Is Cinerama,"[.mw-parser-output .plainlist ol,.mw-parser-ou...",[],[],"[Robert L. Bendick, Merian C. Cooper]",[Lowell Thomas],[1917],[United States],[$1 million],"[$41.6 million, [, 2, ]]"
3,Cinerama Holiday,"[Robert L. Bendick, Philippe De Lacy]",[],[],[],"[Fred Troller, , Beatrice Troller , John Mars...",[1918],[United States],"[$1.5 million, [, 1, ]]","[$29.6 million , [, 2, ]]"
4,The Robe,[Henry Koster],"[.mw-parser-output .plainlist ol,.mw-parser-ou...","[The Robe, by , Lloyd C. Douglas]",[Frank Ross],"[\n, Richard Burton, \n, Jean Simmons, \n, Vic...",[1919],[United States],"[$4.1 million, [, 1, ], – $4.6 million, [, 2, ]]","[$36 million (United States), [, 3, ]]"
