Skip to content
This repository was archived by the owner on May 25, 2022. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions projects/Movie Information Scraper/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Movie Information Scraper
This script fetches a movie's details (year, rating, runtime, release date, genres, directors, writers, cast and plot summary) by scraping the IMDb website.

### Prerequisites
* bs4
* requests
* Run `pip install -r requirements.txt` to install required external modules.

### How to run the script
Execute `python3 movieInfoScraper.py` and type in the movie name when prompted.

### Screenshot/GIF showing the sample use of the script
<!--Remove the below lines and add yours -->
![Screenshot of the Output](Screenshot.png)

## Author Name
Anandha Krishnan Aji
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
100 changes: 100 additions & 0 deletions projects/Movie Information Scraper/movieInfoScraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from bs4 import BeautifulSoup
import requests

# Function to get Movie Details


def getMovieDetails(movieName):
    """Scrape IMDb for the first feature film whose title matches ``movieName``.

    Three pages are requested: the title search results, the page of the
    first hit, and that title's plot-summary page. Every field that cannot
    be located in the returned markup falls back to a placeholder instead
    of raising.

    Args:
        movieName: Title to search for; surrounding whitespace is ignored.

    Returns:
        ``None`` when the search page contains no result entry. Otherwise a
        dict with the keys ``name``, ``year``, ``genres``, ``rating``,
        ``runtime``, ``release_date``, ``directors``, ``writers``, ``cast``
        and ``plot``. ``genres``, ``directors``, ``writers`` and ``cast``
        are always lists of strings (``writers`` is ``['Not found']`` when
        the page has no separate writers section); the other values are
        strings.

    Raises:
        requests.exceptions.RequestException: If a request fails or
            times out.
    """
    # Base URL of IMDB website
    url = 'https://www.imdb.com'
    # Seconds to wait for a response, so a stalled connection cannot hang
    # the script indefinitely.
    timeout = 10
    notAvailable = 'Not available'

    def peopleIn(creditItem):
        # Keep only anchors whose href mentions "name" (links to people),
        # dropping any other link that sits in the same credit container.
        return [a.text for a in creditItem.findAll('a')
                if 'name' in a.attrs.get('href', '')]

    # Empty dictionary to store movie Details
    movieDetails = {}

    # Let requests build the query string so characters such as '&' or '#'
    # in the title are percent-encoded instead of corrupting the URL.
    html = requests.get(
        url + '/search/title',
        params={'title': movieName.strip(), 'title_type': 'feature'},
        timeout=timeout)
    bs = BeautifulSoup(html.text, 'html.parser')

    # Gets the first movie that appears in title section
    result = bs.find('h3', {'class': 'lister-item-header'})
    if result is None or result.a is None:
        return None

    href = result.a.attrs['href']
    movielink = url + href
    movieDetails['name'] = result.a.text

    # Gets the page with movie details
    html = requests.get(movielink, timeout=timeout)
    bs = BeautifulSoup(html.text, 'html.parser')

    # Year
    try:
        movieDetails['year'] = bs.find('span', {'id': 'titleYear'}).a.text
    except AttributeError:
        movieDetails['year'] = notAvailable

    # Genres, runtime and release date all live in the "subtext" block;
    # each one is looked up independently so a single missing element does
    # not blank out the others.
    subtext = bs.find('div', {'class': 'subtext'})
    if subtext is None:
        genres = []
        runtimeTag = None
        releaseTag = None
    else:
        genres = [i.text for i in subtext.findAll('a', {'title': None})]
        runtimeTag = subtext.time
        releaseTag = subtext.find('a', {'title': 'See more release dates'})
    movieDetails['genres'] = genres

    # Rating
    try:
        movieDetails['rating'] = bs.find(
            'div', {'class': 'ratingValue'}).span.text
    except AttributeError:
        movieDetails['rating'] = 'Not yet rated'

    movieDetails['runtime'] = (
        runtimeTag.text.strip() if runtimeTag is not None else notAvailable)
    movieDetails['release_date'] = (
        releaseTag.text.strip() if releaseTag is not None else notAvailable)

    # Gets the credit section of the page
    creditSummary = bs.findAll('div', {'class': 'credit_summary_item'})
    creditNames = [peopleIn(item) for item in creditSummary]

    # Directors, Writers and Cast
    movieDetails['directors'] = creditNames[0] if creditNames else []
    if len(creditNames) >= 3:
        movieDetails['writers'] = creditNames[1]
        movieDetails['cast'] = creditNames[2]
    else:
        # For some films, writer details are not provided, so the second
        # credit item (if any) is the cast. The placeholder is kept in a
        # list so callers can join every credit field the same way.
        movieDetails['writers'] = ['Not found']
        movieDetails['cast'] = creditNames[1] if len(creditNames) == 2 else []

    # The plot is a separate AJAX call and does not come in the html page,
    # so one more request goes to the plotsummary page. Any query string on
    # the href is dropped first so 'plotsummary' extends the path rather
    # than the value of a query parameter.
    plotlink = url + href.partition('?')[0].rstrip('/') + '/plotsummary'
    html = requests.get(plotlink, timeout=timeout)
    bs = BeautifulSoup(html.text, 'html.parser')

    # Plot
    try:
        movieDetails['plot'] = bs.find(
            'li', {'class': 'ipl-zebra-list__item'}).p.text.strip()
    except AttributeError:
        movieDetails['plot'] = notAvailable

    # Returns the dictionary with movie details
    return movieDetails


# Script entry point: prompt for a title, fetch its details and print them.
if __name__ == "__main__":
    movieName = input('Enter the movie name whose details are to be fetched\n')
    movieDetails = getMovieDetails(movieName)
    if movieDetails is None:
        # Falling through to the end of the script replaces quit(), which
        # is only defined when the site module has been loaded.
        print('No movie of this name found !!!!!')
    else:
        print('\n{movie} ({year})'.format(
            movie=movieDetails['name'], year=movieDetails['year']))
        print('Rating:', movieDetails['rating'])
        print('Runtime:', movieDetails['runtime'])
        print('Release Date:', movieDetails['release_date'])
        print('Genres:', ', '.join(movieDetails['genres']))
        print('Director:', ', '.join(movieDetails['directors']))
        # Writers may arrive as a plain placeholder string rather than a
        # list; joining a string would interleave ', ' between its letters.
        writers = movieDetails['writers']
        print('Writer:',
              writers if isinstance(writers, str) else ', '.join(writers))
        print('Cast:', ', '.join(movieDetails['cast']))
        print('Plot Summary:\n', movieDetails['plot'])
2 changes: 2 additions & 0 deletions projects/Movie Information Scraper/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
bs4==0.0.1
requests==2.23.0