<font color='green' size = 4> Web scrapping and creating dataset contaning information on movies from IMDB for the last 20 years using BeautifulSoup</font>

<font color='green' size = 3> STEP 01 : Creating a blank dataset with the required column headers</font>

In [1]:
import pandas as pd
column_names = ['movie_name', 'year','genre','rating','director','poster','votes']
movie_df = pd.DataFrame(columns = column_names)

In [2]:
movie_df.head()

Unnamed: 0,movie_name,year,genre,rating,director,poster,votes


<font color='green' size = 3> STEP 02 : Extracting the top 100 highest rated movies from IMDB from 1999 - 2019. </font>
<font color='green' size = 3>After scraping the data from IMDB, it was inserted as a row in the dataset created above.</font>

In [3]:
import numpy as np
import re
from requests import get
from bs4 import BeautifulSoup

from time import sleep
from random import randint
from time import time
from IPython.core.display import clear_output
from warnings import warn
warn("Warning Simulation")

#Preparing the monitoring of the loop
start_time = time()
requests = 0

for year in range(2000,2020):
    for page in range(1,11):
        url = 'https://www.imdb.com/search/title/?year='+str(year)+'&title_type=feature&sort=user_rating,desc&page='+str(page)
        #print(url)
        response = get(url)
        
        #Pause the loop
        sleep(randint(8,15))
        
        #Monitor the requests
        requests+=1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)
        
        #Throwing a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            warn('Number of requests was greater than expected.')
            break
            
        html_soup = BeautifulSoup(response.text, 'html.parser')
        movie_container = html_soup.find_all('div',class_ = 'lister-item mode-advanced')
        for i in range(0,49):
            if movie_container[i].h3.a is not None:
                name = movie_container[i].h3.a.text
            else:
                name = np.nan
            
            if movie_container[i].h3.find('span', class_='lister-item-year text-muted unbold') is not None:
                year = movie_container[i].h3.find('span', class_='lister-item-year text-muted unbold').text.strip('(').strip(')')
            else:
                year = np.nan
                
            if movie_container[i].find('span', class_='genre') is not None:
                genre = movie_container[i].find('span', class_='genre').text.strip()
            else:
                genre = np.nan
            
            if movie_container[i].strong is not None:
                rating = movie_container[i].strong.text
            else:
                rating = np.nan
                
            if movie_container[i].find('a', attrs = {'href':re.compile("^/name")}) is not None:
                director = movie_container[i].find('a', attrs = {'href':re.compile("^/name")}).text
            else:
                director = np.nan
                
            if movie_container[i].find('img', class_ = 'loadlate')['loadlate'] is not None:
                poster = movie_container[i].find('img', class_ = 'loadlate')['loadlate']
            else:
                poster = np.nan
                
            if movie_container[i].find('span', attrs = {'name':'nv'})['data-value'] is not None:
                votes = movie_container[i].find('span', attrs = {'name':'nv'})['data-value']
            else:
                votes = np.nan
            
            movie_df = movie_df.append({'movie_name' : name, 
                                        'year' : year,
                                        'genre' : genre,
                                        'rating' : rating,
                                        'director': director,
                                        'poster': poster,
                                        'votes': votes}, ignore_index = True)
        

Request:85; Frequency: 0.07256426238861602 requests/s


In [4]:
movie_df.head(10)

Unnamed: 0,movie_name,year,genre,rating,director,poster,votes
0,No Man's Love,2000,Drama,9.0,Nidhal Chatta,https://m.media-amazon.com/images/M/MV5BZDFiZT...,14
1,Dr. Babasaheb Ambedkar,2000,"Biography, History",8.8,Jabbar Patel,https://m.media-amazon.com/images/M/MV5BNGNjZD...,988
2,The Artist's Journey: Funk Blast,2000,,8.8,Ray Giarratana,https://m.media-amazon.com/images/M/MV5BNjlkND...,5
3,Nag-aapoy na laman,2000,Drama,8.6,Don Escudero,https://m.media-amazon.com/images/M/MV5BNmRjNW...,5
4,O Auto da Compadecida,2000,Comedy,8.6,Guel Arraes,https://m.media-amazon.com/images/M/MV5BMjdmND...,10071
5,Hlou w'Morr,2000,Drama,8.6,Naceur Ktari,https://m.media-amazon.com/images/M/MV5BMDBiMD...,10
6,Hiya Diya Niya,2000,"Drama, Romance",8.5,Munin Barua,https://m.media-amazon.com/images/M/MV5BMjM1OW...,17
7,Choo Lenge Akash,2000,Family,8.5,Virendra Saini,https://m.media-amazon.com/images/G/01/imdb/im...,6
8,Srabon Megher Din,2000,"Drama, Romance",8.5,Humayun Ahmed,https://m.media-amazon.com/images/M/MV5BMDU1Yj...,1933
9,El nadador inmóvil,2000,Drama,8.5,Fernán Rudnik,https://m.media-amazon.com/images/M/MV5BZTc0ZD...,6


<font color='green' size = 3> STEP 03 : Extracting the 100 lowest rated movies from IMDB from 1999 - 2019. </font>
<font color='green' size = 3>After scraping the data from IMDB, it was inserted as a row in the dataset created above.</font>

In [5]:
for year in range(2000,2020):
    for page in range(1,11):
        url = 'https://www.imdb.com/search/title/?year='+str(year)+'&title_type=feature&sort=user_rating,asc&page='+str(page)
        #print(url)
        response = get(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        movie_container = html_soup.find_all('div',class_ = 'lister-item mode-advanced')
        for i in range(0,49):
            if movie_container[i].h3.a is not None:
                name = movie_container[i].h3.a.text
            else:
                name = np.nan
            
            if movie_container[i].h3.find('span', class_='lister-item-year text-muted unbold') is not None:
                year = movie_container[i].h3.find('span', class_='lister-item-year text-muted unbold').text.strip('(').strip(')')
            else:
                year = np.nan
                
            if movie_container[i].find('span', class_='genre') is not None:
                genre = movie_container[i].find('span', class_='genre').text.strip()
            else:
                genre = np.nan
            
            if movie_container[i].strong is not None:
                rating = movie_container[i].strong.text
            else:
                rating = np.nan
                
            if movie_container[i].find('a', attrs = {'href':re.compile("^/name")}) is not None:
                director = movie_container[i].find('a', attrs = {'href':re.compile("^/name")}).text
            else:
                director = np.nan
                
            if movie_container[i].find('img', class_ = 'loadlate')['loadlate'] is not None:
                poster = movie_container[i].find('img', class_ = 'loadlate')['loadlate']
            else:
                poster = np.nan
                
            if movie_container[i].find('span', attrs = {'name':'nv'})['data-value'] is not None:
                votes = movie_container[i].find('span', attrs = {'name':'nv'})['data-value']
            else:
                votes = np.nan
            
            movie_df = movie_df.append({'movie_name' : name, 
                                        'year' : year,
                                        'genre' : genre,
                                        'rating' : rating,
                                        'director': director,
                                        'poster': poster,
                                        'votes': votes}, ignore_index = True)
        

In [6]:
#Checking the no. of rows extracted
movie_df.shape

(13328, 7)

In [7]:
movie_df.poster[0]

'https://m.media-amazon.com/images/M/MV5BZDFiZTdhYTUtNjM5MC00ODJiLTgxNGUtNzhlYmY0OTNkYmM5L2ltYWdlL2ltYWdlXkEyXkFqcGdeQXVyMzE3ODM3MTI@._V1_UY98_CR1,0,67,98_AL_.jpg'

In [8]:
movie_df.head()

Unnamed: 0,movie_name,year,genre,rating,director,poster,votes
0,No Man's Love,2000,Drama,9.0,Nidhal Chatta,https://m.media-amazon.com/images/M/MV5BZDFiZT...,14
1,Dr. Babasaheb Ambedkar,2000,"Biography, History",8.8,Jabbar Patel,https://m.media-amazon.com/images/M/MV5BNGNjZD...,988
2,The Artist's Journey: Funk Blast,2000,,8.8,Ray Giarratana,https://m.media-amazon.com/images/M/MV5BNjlkND...,5
3,Nag-aapoy na laman,2000,Drama,8.6,Don Escudero,https://m.media-amazon.com/images/M/MV5BNmRjNW...,5
4,O Auto da Compadecida,2000,Comedy,8.6,Guel Arraes,https://m.media-amazon.com/images/M/MV5BMjdmND...,10071


<font color='green' size = 3> STEP 04 : Converting the poster's link to html tags to insert the images into the dataset for multilabel classification of genre using movie poster</font>

In [9]:
from IPython.core.display import HTML

images = movie_df['poster']
movie_df['image'] = images

# convert your links to html tags 
def path_to_image_html(path):
    return '<img src="'+ path + '" width="60" >'

pd.set_option('display.max_colwidth', -1)

HTML(movie_df.to_html(escape=False ,formatters=dict(image=path_to_image_html)))

Unnamed: 0,movie_name,year,genre,rating,director,poster,votes,image
0,No Man's Love,2000,Drama,9.0,Nidhal Chatta,"https://m.media-amazon.com/images/M/MV5BZDFiZTdhYTUtNjM5MC00ODJiLTgxNGUtNzhlYmY0OTNkYmM5L2ltYWdlL2ltYWdlXkEyXkFqcGdeQXVyMzE3ODM3MTI@._V1_UY98_CR1,0,67,98_AL_.jpg",14,
1,Dr. Babasaheb Ambedkar,2000,"Biography, History",8.8,Jabbar Patel,"https://m.media-amazon.com/images/M/MV5BNGNjZDBmMjQtNWE2MC00NDRhLThkZDMtM2Q1YTU5NTA3MzJhXkEyXkFqcGdeQXVyMjUxMTY3ODM@._V1_UY98_CR2,0,67,98_AL_.jpg",988,
2,The Artist's Journey: Funk Blast,2000,,8.8,Ray Giarratana,"https://m.media-amazon.com/images/M/MV5BNjlkNDM4YzQtOWYwYi00NmZmLWIzYmQtZTU1NDFmYTU5NDY4XkEyXkFqcGdeQXVyNjEwNjA4NDE@._V1_UY98_CR20,0,67,98_AL_.jpg",5,
3,Nag-aapoy na laman,2000,Drama,8.6,Don Escudero,"https://m.media-amazon.com/images/M/MV5BNmRjNWFjNGEtNmNmZS00ZWRmLWE5NmItZWI1M2I0ZDQwMGNkXkEyXkFqcGdeQXVyNTM3MDMyMDQ@._V1_UY98_CR3,0,67,98_AL_.jpg",5,
4,O Auto da Compadecida,2000,Comedy,8.6,Guel Arraes,"https://m.media-amazon.com/images/M/MV5BMjdmNDA4ZGEtODcxZi00ODg5LThmYzctZDAxYjJjNjAyYjgwXkEyXkFqcGdeQXVyNDc2MTA2NDg@._V1_UY98_CR0,0,67,98_AL_.jpg",10071,
5,Hlou w'Morr,2000,Drama,8.6,Naceur Ktari,"https://m.media-amazon.com/images/M/MV5BMDBiMDM4OTMtYzE0My00YjkyLWE4NjAtMmUyNTYzY2ZhMmNkXkEyXkFqcGdeQXVyNjgzMjQ0MTA@._V1_UY98_CR53,0,67,98_AL_.jpg",10,
6,Hiya Diya Niya,2000,"Drama, Romance",8.5,Munin Barua,"https://m.media-amazon.com/images/M/MV5BMjM1OWFiNTQtMTk2OS00Mzk5LWEwNTMtNmQ2NzYyYWU0M2M4XkEyXkFqcGdeQXVyODAzNzAwOTU@._V1_UX67_CR0,0,67,98_AL_.jpg",17,
7,Choo Lenge Akash,2000,Family,8.5,Virendra Saini,https://m.media-amazon.com/images/G/01/imdb/images/nopicture/67x98/film-2500266839._CB470041825_.png,6,
8,Srabon Megher Din,2000,"Drama, Romance",8.5,Humayun Ahmed,"https://m.media-amazon.com/images/M/MV5BMDU1YjczYmYtNmU2NS00NDY1LWFhMDgtOTJiNmE1N2M1MDU2XkEyXkFqcGdeQXVyNjA3OTI5MjA@._V1_UY98_CR6,0,67,98_AL_.jpg",1933,
9,El nadador inmóvil,2000,Drama,8.5,Fernán Rudnik,"https://m.media-amazon.com/images/M/MV5BZTc0ZDljNTgtZDIyOS00NTgyLTg3N2ItZGY3MmMyY2UxZmUxXkEyXkFqcGdeQXVyMjQ0NzgwNzY@._V1_UY98_CR2,0,67,98_AL_.jpg",6,
