In [1]:
import re
import dash_table as dt
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import numpy as np
import requests
import plotly.express as px

# Web Scraper

## Reading in page data

In [2]:
headers = {"Accept-Language": "en-US, en;q=0.5"}

url = "https://www.imdb.com/search/title/?title_type=feature&ref_=adv_prv"

results = requests.get(url, headers = headers)

bsoup = BeautifulSoup(results.text , "html.parser")

## Getting data

In [3]:
titles = []

IMDB_ratings = []

ratings = []

genres = []

runtimes = []

notable_figs = []

descriptions = []

images = []

years = []

### Adding elements to lists

#### First page

In [4]:
movies_divs = bsoup.find_all('div', class_= 'lister-item mode-advanced')

for container in movies_divs:

    title = container.h3.a.text
    titles.append(title)

    year = container.find('span', class_ = 'lister-item-year text-muted unbold').text.replace('(','').replace(')','').split()

    if len(year) > 1:
        years.append(int(year[1]))
    elif len(year) == 1:
        years.append(int(year[0]))
    else:
        years.append(None)

    rating = container.find('span', class_= 'certificate').text if container.p.find('span', class_='certificate') else 'Not Rated'
    ratings.append(rating)

    runtime = int(container.find('span', class_= 'runtime').text.strip().replace(" min","")) if container.p.find('span', class_='runtime') else None
    runtimes.append(runtime)

    genre = container.find('span', class_= 'genre').text.strip().replace('\n','').replace(" ",'').split(",") if container.p.find('span', class_='genre') else ''
    genres.append(genre)

    imdb_rat = float(container.strong.text) if container.find('strong') else 'N/A'
    IMDB_ratings.append(imdb_rat)

    stars = container.find('p', class_="").text.strip().replace("\n",'').replace("Director:","").replace("Stars:","").replace(' ','').split("|", 1)
    notable_figs.append(stars)

    description = container.find_all('p', class_= "text-muted")[1].text.replace("\n",'').strip()
    descriptions.append(description)

    img = container.find('img', class_= "loadlate")['loadlate']
    images.append(img)

#### The Other Pages

In [5]:
url2 = "https://www.imdb.com/search/title/?title_type=feature&"

url4 = "&ref_=adv_nxt"

i = 0

while i < 9000:
    url3 = "start=" + str(51 + 50 * i)
    complete_URL = url2  + url3 + url4
    new_results = requests.get(complete_URL, headers = headers)
    
    bsoup_new = BeautifulSoup(new_results.text , "html.parser")

    movies_divs_new = bsoup_new.find_all('div', class_= 'lister-item mode-advanced')

    for container in movies_divs_new:

        title = container.h3.a.text
        titles.append(title)

        year = container.find('span', class_ = 'lister-item-year text-muted unbold').text.replace('(','').replace(')','').split()

        if len(year) > 1:
            years.append(int(year[1]))
        elif len(year) == 1 and year[0].isnumeric():
            years.append(int(year[0]))
        else: 
            years.append('N/A')

        rating = container.find('span', class_= 'certificate').text if container.p.find('span', class_='certificate') else 'Not Rated'
        ratings.append(rating)

        runtime = int(container.find('span', class_= 'runtime').text.strip().replace(" min","")) if container.p.find('span', class_='runtime') else None
        runtimes.append(runtime)

        genre = container.find('span', class_= 'genre').text.strip().replace('\n','').replace(" ",'').split(",") if container.p.find('span', class_='genre') else ''
        genres.append(genre)

        imdb_rat = float(container.strong.text) if container.find('strong') else 'N/A'
        IMDB_ratings.append(imdb_rat)

        stars = container.find('p', class_="").text.strip().replace("\n",'').replace("Director:","").replace("Stars:","").replace(' ','').split("|", 1)
        notable_figs.append(stars)

        description = container.find_all('p', class_= "text-muted")[1].text.replace("\n",'').strip()
        descriptions.append(description)

        img = container.find('img', class_= "loadlate")['loadlate']
        images.append(img)
    
    i += 1    

## Creating Data Frame from lists

In [6]:
df = pd.DataFrame({
    'movie': titles,
    'year': years,
    'rating': ratings,
    'runtime': runtimes,
    'genres': genres,
    'IMDB_rating': IMDB_ratings,
    'actors': notable_figs,
    'description': descriptions,
    'image': images})

In [7]:
len(df)

450050

### Finding Misc Info

#### Finding Genres

In [8]:

distinct_genres = set(())

for genre in genres:
    for x in genre:
        distinct_genres.add(x.strip())

genre_options = [{'label': n, 'value': n} for n in distinct_genres]

print(genre_options[1:5])


[{'label': 'Animation', 'value': 'Animation'}, {'label': 'Music', 'value': 'Music'}, {'label': 'Horror', 'value': 'Horror'}, {'label': 'Family', 'value': 'Family'}]


Exporting data


### Finding PG Ratings

In [9]:
distinct_ratings = set(())

for rating in ratings:
    distinct_ratings.add(rating)

rating_options = [{'label': n, 'value': n} for n in distinct_ratings]

print(rating_options)

[{'label': 'MA-17', 'value': 'MA-17'}, {'label': 'R', 'value': 'R'}, {'label': 'TV-PG', 'value': 'TV-PG'}, {'label': 'M/PG', 'value': 'M/PG'}, {'label': 'M', 'value': 'M'}, {'label': 'Passed', 'value': 'Passed'}, {'label': 'TV-Y7-FV', 'value': 'TV-Y7-FV'}, {'label': 'T', 'value': 'T'}, {'label': 'Approved', 'value': 'Approved'}, {'label': '12', 'value': '12'}, {'label': 'PG-13', 'value': 'PG-13'}, {'label': 'TV-G', 'value': 'TV-G'}, {'label': '18', 'value': '18'}, {'label': 'GP', 'value': 'GP'}, {'label': 'TV-MA', 'value': 'TV-MA'}, {'label': 'G', 'value': 'G'}, {'label': 'Not Rated', 'value': 'Not Rated'}, {'label': '22', 'value': '22'}, {'label': 'X', 'value': 'X'}, {'label': 'Unrated', 'value': 'Unrated'}, {'label': '15', 'value': '15'}, {'label': 'PG', 'value': 'PG'}, {'label': 'TV-14', 'value': 'TV-14'}, {'label': 'TV-Y7', 'value': 'TV-Y7'}, {'label': 'NC-17', 'value': 'NC-17'}]


## Writing to CSV

In [10]:
df.to_csv("data/movies.csv")

In [11]:
df.year.replace('N/A', None)

new_df = df.loc[(df.year != 'N/A') & (df.rating != 'N/A') & (df.IMDB_rating != 'N/A') & pd.notnull(df.runtime)]

new_df.year = new_df.year.astype('int64')
new_df.IMDB_rating = new_df.IMDB_rating.astype('float64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [12]:

graph_df = new_df.groupby('year').mean().reset_index()

graph_df.head()

px.scatter(graph_df, x = "year", y = "IMDB_rating")