# Imported Libraries

In [8]:
import os
import re
import time
from html import unescape
from json import dump
from json import loads
from urllib.parse import urlsplit
from urllib.parse import urlunsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
from requests.exceptions import HTTPError as HTTPErrorRequests
from requests.exceptions import RequestException
from requests.exceptions import Timeout
from requests.exceptions import TooManyRedirects
from selenium import webdriver
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Class Declaration

In [9]:
class MovieIMDb:
    """Scrape one IMDb title page plus its user reviews and export both.

    Constructing an instance runs the whole pipeline: the movie page is
    downloaded with ``requests``, its JSON-LD block is parsed for metadata,
    the reviews page is fully expanded in a Selenium-driven Chrome, and the
    results are written to ``<title>/metadata.json`` and
    ``<title>/movieReviews.csv`` under the current working directory.

    Args:
        movieURL: URL of the movie's main IMDb page.
        parser: Name of the BeautifulSoup parser used for the movie page.

    Raises:
        RuntimeError: If the movie page could not be downloaded.
        ValueError: If the page has no JSON-LD block or its duration is
            malformed.
    """

    def __init__(self, movieURL, parser='html.parser'):
        self.movieURL = movieURL
        self.movieID = self._extractMovieID()

        self.movieHomepageHTML = self._simpleConnector(self.movieURL)
        if self.movieHomepageHTML is None:
            # _simpleConnector has already printed the cause; stop here with a
            # clear message instead of failing later inside BeautifulSoup.
            raise RuntimeError('Could not download the movie page: ' + str(self.movieURL))
        # The parser name is BeautifulSoup's second argument ("features");
        # the constructor has no ``parser`` keyword.
        self.movieSoup = BeautifulSoup(self.movieHomepageHTML, parser)
        self.movieContentJSON = self._extractContent()

        # --- Metadata ---
        # Unescape HTML entities first so that e.g. "&amp;" does not leave the
        # letters "amp" behind, then keep only letters, digits and spaces
        # because the title doubles as the export folder name.
        rawTitle = unescape(self.movieContentJSON['name'])
        self.movieTitle = ''.join(ch for ch in rawTitle if (ch.isalnum() or ch == ' '))
        print(self.movieTitle)
        aggregateRating = self.movieContentJSON.get('aggregateRating') or {}
        self.movieIMDbRating = aggregateRating.get('ratingValue')
        self.totalReviews = aggregateRating.get('ratingCount')
        self.totalUserReviews, self.totalCriticReviews, self.metaScore = self._extractMovieScores()
        self.movieDatePublished = self.movieContentJSON.get('datePublished')
        self.movieGenres = self.movieContentJSON.get('genre', [])
        self.movieDirectors = self._extractDirectors()
        self.movieWriters = self._extractWriters()
        self.movieStars = self._extractStars()
        self.movieDescription = self._extractDescription()
        self.movieDuration = self._extractDuration()

        # --- Reviews ---
        self.reviewsURL = self._buildReviewsURL()
        self.reviewsSoup = self._fullPageSoup()
        self.reviewsExtracted = self._extractReviewsInfo()

        # --- Compilation ---
        self.metaData = self._prepareMetadata()
        # From here on ``reviewsExtracted`` is a DataFrame, not a list of rows.
        self.reviewsExtracted = self._prepareReviewsDataframe()

        # --- Export ---
        self._exportData()

    # ------------------------------------------------------------------
    # Metadata
    # ------------------------------------------------------------------
    def _simpleConnector(self, url):
        """Download ``url`` and return the response body as text.

        Returns:
            The response text, or ``None`` when the request failed with an
            HTTP error, connection error, timeout or redirect loop (the
            reason is printed).

        Raises:
            SystemExit: On any other ``requests`` exception.
        """
        try:
            # Without a timeout a stalled connection would block forever.
            html = requests.get(url, timeout=30)
            html.raise_for_status()
        except HTTPErrorRequests as e:
            print('HTTP ERROR')
            print(e)
            return None
        except ConnectionError as e:
            print('CONNECTION ERROR')
            print(e)
            return None
        except Timeout as e:
            print('TIMEOUT')
            print(e)
            return None
        except TooManyRedirects as e:
            print('BAD URL: TOO MANY REDIRECTS')
            print(e)
            return None
        except RequestException as e:
            raise SystemExit(e)
        else:
            print('URL Connected')
            return html.text

    def _extractMovieID(self):
        """Return the last path segment of ``movieURL`` (the ``tt…`` title ID).

        Works with or without a trailing slash and ignores any query string
        or fragment.
        """
        path = urlsplit(self.movieURL).path
        movieID = path.rstrip('/').split('/')[-1]
        print('Movie ID Extraction done')
        return movieID

    def _buildReviewsURL(self):
        """Return the URL of the reviews page belonging to ``movieURL``.

        The query string and fragment are dropped and a trailing slash is
        ensured before ``reviews`` is appended.
        """
        parts = urlsplit(self.movieURL)
        path = parts.path if parts.path.endswith('/') else parts.path + '/'
        return urlunsplit((parts.scheme, parts.netloc, path, '', '')) + 'reviews'

    def _extractContent(self):
        """Parse and return the page's ``application/ld+json`` block.

        Raises:
            ValueError: If the page contains no such script tag.
        """
        scriptTag = self.movieSoup.find('script', {'type': 'application/ld+json'})
        if scriptTag is None:
            raise ValueError('No application/ld+json block found on the movie page')
        content = loads(scriptTag.text)
        print('Content Extraction done')
        return content

    def _extractMovieScores(self):
        """Return the three score strings shown in the page's review summary.

        The caller unpacks the result as (user reviews, critic reviews,
        metascore), so the list always has exactly three entries: missing
        values are padded with ``None`` and extras are dropped.
        """
        stats = []
        container = self.movieSoup.find('ul', {'data-testid': 'reviewContent-all-reviews'})
        if container is not None:
            for review in container.find_all('li'):
                scoreTag = review.find('span', {'class': 'score'})
                if scoreTag is None:
                    continue
                score = scoreTag.text
                print(score)
                stats.append(score)

        return (stats + [None] * 3)[:3]

    def _extractNames(self, key):
        """Return the ``name`` of every entry stored under ``key`` in the JSON-LD.

        A missing key yields an empty list; a single object is treated like a
        one-element list; entries without a ``name`` are skipped.
        """
        entries = self.movieContentJSON.get(key) or []
        if isinstance(entries, dict):
            entries = [entries]
        return [
            entry['name']
            for entry in entries
            if isinstance(entry, dict) and 'name' in entry
        ]

    def _extractDirectors(self):
        """Return the names listed under the JSON-LD ``director`` key."""
        directors_list = self._extractNames('director')
        print('Directors Extraction done!')
        print(directors_list)
        return directors_list

    def _extractWriters(self):
        """Return the names listed under the JSON-LD ``creator`` key."""
        names = self._extractNames('creator')
        print("Writers extracted!")
        print(names)
        return names

    def _extractStars(self):
        """Return the names listed under the JSON-LD ``actor`` key."""
        actors = self._extractNames('actor')
        print('Stars Extracted')
        print(actors)
        return actors

    def _extractDescription(self):
        """Return the HTML-unescaped description ('' when the key is absent)."""
        description = unescape(self.movieContentJSON.get('description') or '')
        print(description)
        return description

    def _extractDuration(self):
        """Return the runtime in whole minutes.

        Accepts duration strings of the form ``PT#H#M``, ``PT#H`` and
        ``PT#M`` (an optional trailing ``#S`` part is ignored).

        Returns:
            Minutes as ``int``, or ``None`` when no duration is present.

        Raises:
            ValueError: If the duration string has an unexpected format.
        """
        rawDuration = self.movieContentJSON.get('duration')
        if not rawDuration:
            return None

        # The look-ahead demands at least one component after "PT".
        match = re.fullmatch(r'PT(?=\d)(?:(\d+)H)?(?:(\d+)M)?(?:\d+S)?', rawDuration)
        if match is None:
            raise ValueError('Unrecognised duration: ' + repr(rawDuration))

        hours = int(match.group(1) or 0)
        minutes = int(match.group(2) or 0)
        return hours * 60 + minutes

    # ------------------------------------------------------------------
    # Reviews
    # ------------------------------------------------------------------
    def _fullPageSoup(self):
        """Open the reviews page in Chrome, expand it fully and return its soup.

        The "load more" button is clicked (with a 9 second pause after each
        click) until it is missing, hidden or no longer clickable. The
        browser is always closed, even when an error occurs.
        """
        # Path to the local chromedriver binary; edit it to match your machine.
        # NOTE(review): newer Selenium releases may expect the driver path via
        # a Service object rather than positionally - confirm against the
        # installed version.
        driver = webdriver.Chrome('C:\\chromedriver.exe')
        try:
            driver.get(self.reviewsURL)

            while True:
                try:
                    link = driver.find_element(By.CLASS_NAME, "ipl-load-more__button")
                except NoSuchElementException:
                    # No button at all: nothing further to load.
                    break
                if link.value_of_css_property('display') == 'none':
                    # Button hidden: nothing further to load.
                    break
                try:
                    link.click()
                except ElementNotInteractableException:
                    break
                # Give the next batch of reviews time to load.
                time.sleep(9)

            souper = BeautifulSoup(driver.page_source, 'html.parser')
        finally:
            driver.quit()
        print('Full Soup Extracted from User Reviews Page!')

        return souper

    def _extractReviewsInfo(self):
        """Return one row per rated user review found in ``reviewsSoup``.

        Each row is ``[date, user, useful votes, total votes, rating, title,
        text]``. Reviews without a rating are skipped; reviews whose markup
        cannot be parsed are reported with ``print`` and skipped.
        """
        def extractHelpfulness(footerText):
            # The first and fourth whitespace-separated words are the two vote
            # counts; thousands separators are removed before conversion.
            words = footerText.split()
            useful = words[0].replace(',', '')
            total = words[3].replace(',', '')
            return int(useful), int(total)

        spoilerReviews = self.reviewsSoup.find_all('div', {'class': 'lister-item mode-detail imdb-user-review with-spoiler'})
        collapsableReviews = self.reviewsSoup.find_all('div', {'class': 'lister-item mode-detail imdb-user-review collapsable'})

        reviews = spoilerReviews + collapsableReviews

        complete_data = []

        for review in reviews:
            try:
                # The rating is read from the first <span> without a class.
                reviewRating = review.find('span', {'class': None})
                if reviewRating is None:
                    continue
                reviewDate = review.find('span', {'class': 'review-date'}).get_text()
                reviewDesc = review.find('div', {'class': 'text'}).get_text()
                reviewUser = review.find('span', {'class': 'display-name-link'}).get_text()
                reviewTitle = review.find('a', {'class': 'title'}).get_text()
                reviewFooter = review.find('div', {'class': 'actions text-muted'}).get_text()
                reviewUseful, reviewTotal = extractHelpfulness(reviewFooter)
            except (AttributeError, IndexError, ValueError) as E:
                # Missing element (AttributeError) or unexpected footer text
                # (IndexError / ValueError): report and move on.
                print(E)
                continue

            complete_data.append([
                reviewDate,
                reviewUser,
                reviewUseful,
                reviewTotal,
                reviewRating.get_text(),
                reviewTitle.strip(),
                reviewDesc.strip(),
            ])

        print('User Reviews are extracted!')
        return complete_data

    # ------------------------------------------------------------------
    # Compilation
    # ------------------------------------------------------------------
    def _prepareMetadata(self):
        """Collect the scraped metadata attributes into one dictionary."""
        dictionary = {
            'title': self.movieTitle,
            'movieIMDbRating': self.movieIMDbRating,
            'totalRatingCount': self.totalReviews,
            'totalUserReviews': self.totalUserReviews,
            'totalCriticReviews': self.totalCriticReviews,
            'metaScore': self.metaScore,
            'movieGenres': self.movieGenres,
            'directors': self.movieDirectors,
            'datePublished': self.movieDatePublished,
            'creators': self.movieWriters,
            'mainStars': self.movieStars,
            'description': self.movieDescription,
            'duration': self.movieDuration
        }

        return dictionary

    def _prepareReviewsDataframe(self):
        """Return the extracted review rows as a labelled DataFrame."""
        df = pd.DataFrame(self.reviewsExtracted, columns=['Date of Review', 'User', 'Usefulness Vote', 'Total Votes', "User's Rating out of 10", 'Review Title', "Review"])
        return df

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------
    def _exportData(self):
        """Write the reviews CSV and metadata JSON into a folder named after the title."""
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(self.movieTitle, exist_ok=True)

        self.reviewsExtracted.to_csv(os.path.join(self.movieTitle, 'movieReviews.csv'), index=False)

        with open(os.path.join(self.movieTitle, 'metadata.json'), 'w', encoding='utf-8') as fp:
            dump(self.metaData, fp)

    # ------------------------------------------------------------------
    # Getters
    # ------------------------------------------------------------------
    def getMovieSoup(self):
        """Return the parsed movie page."""
        return self.movieSoup

    def getReviewsSoup(self):
        """Return the parsed, fully expanded reviews page."""
        return self.reviewsSoup

    def getReviews(self):
        """Return the reviews DataFrame."""
        return self.reviewsExtracted

    def getMetadata(self):
        """Return the metadata dictionary."""
        return self.metaData

# Test URLs

In [10]:
# Main IMDb page URL of every movie to scrape. Each URL ends with the
# title ID followed by a slash.
movieURLs = [
    "https://www.imdb.com/title/tt10872600/", # Spider-Man: No Way Home
    "https://www.imdb.com/title/tt4154796/", # Avengers: Endgame
    "https://www.imdb.com/title/tt7286456/", # Joker
    "https://www.imdb.com/title/tt0109830/", # Forrest Gump
    "https://www.imdb.com/title/tt0468569/" # The Dark Knight
]

# Testing

Steps to follow first: <br>
<ol>
    <li>Install the libraries listed in the "Imported Libraries" section.</li>
    <li>Download chromedriver.exe and specify its path in the function <code>_fullPageSoup(self)</code>.</li>
    <li>Paste the URL of each movie's main IMDb page into the <code>movieURLs</code> list.</li>
    <li>Run the code below to start the scraping process. The data is stored automatically in a folder named after the movie.</li>
</ol>

In [None]:
# Run the code below to start scraping. Constructing a MovieIMDb instance
# runs its __init__ for that URL; ``movie`` keeps only the most recent instance.
for url in movieURLs:
    movie = MovieIMDb(url)