In [1]:
#pip install pdmongo

In [7]:
'''Data Acquisition and transformation pipeline'''
def movie_scraper():
    groups = ["top_100","top_250","top_1000","bottom_100","bottom_250","bottom_1000,oscar_winner",
     "emmy_winner","golden_globe_winner","oscar_nominee","emmy_nominee","golden_globe_nominee","best_picture_winner",
     "best_director_winner","oscar_best_picture_nominees","oscar_best_director_nominees",
     "national_film_preservation_board_winner","razzie_winner","razzie_nominee"]
    f = dict(zip(groups, list(range(0,len(groups)))))
    from pprint import pprint
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd 
    print('Possible groups\n\n', pprint(f))
    group_index = int(input('Enter the category to scrape: '))
    num = int(input('Number of movies to scrape: '))
    group = groups[group_index]
    url = 'https://www.imdb.com/search/title/?count='+str(num)+'&groups='+group+'&sort=user_rating'

    def get_html_data(url):
        page = requests.get(url, headers={"Accept-Language": "en-US"})
        return BeautifulSoup(page.text, "html.parser")
    try:
        soup = get_html_data(url)
        #extract data elements
        movies = soup.findAll('div', class_='lister-item-content')
        titles = []
        released = []
        ratings = []
        runtime = []
        genres = []
        imdbs = []
        votes = []
        revenue = []
        for movie in movies:
            #Get movie names
            title = movie.find('a').text
            titles.append(title)
            #Get release date
            rel = movie.find('span', class_='lister-item-year text-muted unbold').text 
            released.append(rel)
            #ratings
            rating = movie.find('span', class_='certificate')
            ratings.append(rating)
            #runtime
            run = movie.find('span', class_='runtime').text
            runtime.append(run)
            #Genre
            gen = movie.find('span', class_='genre').text.strip()
            genres.append(gen)
            #IMDB rating
            rat = movie.find('div', 'inline-block ratings-imdb-rating', text_attribute=False).text.strip()
            imdbs.append(rat)
            #Votes
            vote = movie.find('span', {'name' : 'nv'}, text_attribute=False, order=None).text
            votes.append(vote)
            #Revenue generated
            rev = movie.find('span', {'name' : 'nv'},[1], text_attribute=False).text
            revenue.append(rev)

        movies_dict = {'Title': titles, 'Release date': released, 'Audience Rating': ratings,
                   'Runtime': runtime, 'Genre': genres,
                   'Votes': votes, 'Box Office Earnings': revenue, 'IMDB Rating': imdbs}
        movie_data = pd.DataFrame(movies_dict)
        #Data transformation
        #convert rating to a float
        movie_data['IMDB Rating'] = [float(item) for item in movie_data['IMDB Rating']]
        #convert votes to a int
        movie_data['Votes'] = [int(item.replace(",", "")) for item in movie_data['Votes']]
        movie_data['Box Office Earnings'] = [int(item.replace(",", "")) for item in movie_data['Box Office Earnings']]
        #sort by rating
        movie_data = movie_data.sort_values('IMDB Rating', ascending = False)
        #Brief analysis
        print('===============================================================================================================================')
        print("Number of extracted movies:", len(movie_data))
        print('\n\nBest movie (highest rating) in the selected category:', movie_data['Title'].head(1)[0])
        print("Rating:", movie_data['IMDB Rating'].head(1)[0])
        print("Revenue:", movie_data['Box Office Earnings'].head(1)[0])
        print('===============================================================================================================================')

        print("\nTop 10 movies by revenue:")
        display(movie_data.sort_values('Box Office Earnings', ascending = False).head(10))
        print('===============================================================================================================================')
        print("\nTop 10 movies by ratings:")
        display(movie_data.sort_values('IMDB Rating', ascending = False).head(10))
        print('===============================================================================================================================')
        #wsave data to a mysql database
        #movie_data.to_sql(con=con, name=groups[group_index], if_exists='replace', flavor='mysql')
        movie_data.to_csv(group+".csv", index = False)
        print("Data saved successfully!")
        return movie_data
    except:
        print("The selected category does not have any movies")
        
        
data = movie_scraper()

{'best_director_winner': 12,
 'best_picture_winner': 11,
 'bottom_100': 3,
 'bottom_1000,oscar_winner': 5,
 'bottom_250': 4,
 'emmy_nominee': 9,
 'emmy_winner': 6,
 'golden_globe_nominee': 10,
 'golden_globe_winner': 7,
 'national_film_preservation_board_winner': 15,
 'oscar_best_director_nominees': 14,
 'oscar_best_picture_nominees': 13,
 'oscar_nominee': 8,
 'razzie_nominee': 17,
 'razzie_winner': 16,
 'top_100': 0,
 'top_1000': 2,
 'top_250': 1}
Possible groups

 None
Enter the category to scrape: 1
Number of movies to scrape: 100
Number of extracted movies: 100


Best movie (highest rating) in the selected category: The Shawshank Redemption
Rating: 9.3
Revenue: 2665639

Top 10 movies by revenue:


Unnamed: 0,Title,Release date,Audience Rating,Runtime,Genre,Votes,Box Office Earnings,IMDB Rating
0,The Shawshank Redemption,(1994),[R],142 min,Drama,2665639,2665639,9.3
2,The Dark Knight,(2008),[PG-13],152 min,"Action, Crime, Drama",2638614,2638614,9.0
9,Inception,(2010),[PG-13],148 min,"Action, Adventure, Sci-Fi",2338282,2338282,8.8
11,Fight Club,(1999),[R],139 min,Drama,2111066,2111066,8.8
13,Forrest Gump,(1994),[PG-13],142 min,"Drama, Romance",2066236,2066236,8.8
8,Pulp Fiction,(1994),[R],154 min,"Crime, Drama",2040409,2040409,8.9
15,The Matrix,(1999),[R],136 min,"Action, Sci-Fi",1904538,1904538,8.7
12,The Lord of the Rings: The Fellowship of the Ring,(2001),[PG-13],178 min,"Action, Adventure, Drama",1866790,1866790,8.8
1,The Godfather,(1972),[R],175 min,"Crime, Drama",1847225,1847225,9.2
3,The Lord of the Rings: The Return of the King,(2003),[PG-13],201 min,"Action, Adventure, Drama",1837663,1837663,9.0



Top 10 movies by ratings:


Unnamed: 0,Title,Release date,Audience Rating,Runtime,Genre,Votes,Box Office Earnings,IMDB Rating
0,The Shawshank Redemption,(1994),[R],142 min,Drama,2665639,2665639,9.3
1,The Godfather,(1972),[R],175 min,"Crime, Drama",1847225,1847225,9.2
2,The Dark Knight,(2008),[PG-13],152 min,"Action, Crime, Drama",2638614,2638614,9.0
3,The Lord of the Rings: The Return of the King,(2003),[PG-13],201 min,"Action, Adventure, Drama",1837663,1837663,9.0
4,Schindler's List,(1993),[R],195 min,"Biography, Drama, History",1349739,1349739,9.0
5,The Godfather Part II,(1974),[R],202 min,"Crime, Drama",1265123,1265123,9.0
6,12 Angry Men,(1957),[Approved],96 min,"Crime, Drama",787208,787208,9.0
7,Jai Bhim,(2021),[TV-MA],164 min,"Crime, Drama, Mystery",201443,201443,8.9
8,Pulp Fiction,(1994),[R],154 min,"Crime, Drama",2040409,2040409,8.9
10,The Lord of the Rings: The Two Towers,(2002),[PG-13],179 min,"Action, Adventure, Drama",1659324,1659324,8.8


Data saved successfully!
