In [3]:
# v1 - 8m 36s
#Importing required libraries
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import csv

#Starting the timer
start = datetime.now()

#Initializing browser's user agent
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}

#We are collecting movie details of five different streaming platforms on RottenTomatoes website
#RottenTomatoes is a movie rating website which has details of movies, which are streamed on different platforms
#Since each platform has different pages, we are storing all 5 URLs in a list
url = ["https://www.rottentomatoes.com/browse/movies_at_home/affiliates:netflix?page=5",
       "https://www.rottentomatoes.com/browse/movies_at_home/affiliates:amazon_prime?page=5",
       "https://www.rottentomatoes.com/browse/movies_at_home/affiliates:peacock?page=5",
       "https://www.rottentomatoes.com/browse/movies_at_home/affiliates:vudu?page=5"
       "https://www.rottentomatoes.com/browse/movies_at_home/affiliates:apple_tv?page=5"]

#To get a list of all movies on the RottenTomatoes from all 5 streaming platforms
movie_list_all = []

#Traversing through each URL(web page), collecting only the movie titles
for x in range(len(url)):
    
    #Getting the page using BeautifulSoup and storing it in "soup"
    response = requests.get(url[x], headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    
    #Movie titles are in tag="span" and class="p--small"
    movies = soup.find_all("span", class_ = "p--small")
    
    #Traversing through each element with same tag and class and appending to the movie name list
    for movie in movies:
        movie_list_all.append(movie.text.strip())

# print(len(movie_list_all))

#Same movies can be repeated on different platforms, so we remove duplicates in the list
def unique(not_unique_list):
 
    # initialize a null list
    unique_list = []
 
    # traverse for all elements
    for x in not_unique_list:
        # check if exists in unique_list or not
        if x not in unique_list:
            unique_list.append(x)
#     print(len(unique_list))
    return unique_list

#Calling unique function to remove duplicates in the movie list
movie_list = unique(movie_list_all)

#To get the details of each movie, we should open the individual web pages of each movies, and we need individual movie URLs
#Individual movie URLs are obtained by concatinating common URL with "movie name" string.
#NOTE: In individual URLs special characters are removed and spaces are replaced by underscores

for i in range(len(movie_list)):
    movie_list[i] = movie_list[i].replace(" ","_").replace("-","").replace(":","").replace("'","").replace("?","").replace(",","").replace(".","").replace("&","").replace("__","_").replace("(","").replace(")","")

# print(len(movie_list))

#Opening a CSV in write mode
file = open("RottenTomatoes.csv", "w", newline="")
writer = csv.writer(file)

#Writing the header row
writer.writerow(["NAME", "RATING", "GENRE", "ORIGINAL LANGUAGE", "DIRECTOR", "PRODUCER", "BOX OFFICE GROSS", "WRITER", "RELEASE DATE", "DURATION", "DISTRIBUTION", "PRODUCTION COMPANY", "DESCRIPTION"])

#Each loop through the outermost for loop means that, we are traversing through the whole movie list.
#For each movie list element, we open its unique URL and get the required details.
for movie in movie_list:
# for j in range(0,10):

    #Whenever there is an error (like missing web pages, missing parameters etc.) the for loop ends.
    #To bypass the error during fetching of individual webpages, we use exception handling.
    try:
        
        #Creating the unique movie URL
        movie_url = "https://www.rottentomatoes.com/m/" + movie
        
        #Getting the page using BeautifulSoup and storing it in "movie_soup"
        movie_response = requests.get(movie_url, headers=headers)    
        movie_soup = BeautifulSoup(movie_response.text, "html.parser")
    
        #Getting movie name
        movie_title = movie_soup.find("h1", class_ = "title").text.strip()
    
        #Getting into details section (All the below variables are under the same tag name and class name)
        movie_details_label = movie_soup.find_all("b", attrs={"class":"info-item-label", "data-qa":"movie-info-item-label"})
        movie_details_value = movie_soup.find_all("span", attrs={"class":"info-item-value", "data-qa":"movie-info-item-value"})
        
        #Initializing the variables with NA before scraping details (To avoid previous values getting repeated when there is a NA)
        movie_rating = "NA"
        movie_genre = "NA"
        movie_language = "NA"
        movie_director = "NA"
        movie_producer = "NA"
        movie_writer = "NA"
        movie_release_date = "NA"
        movie_runtime = "NA"
        movie_distributor = "NA"
        movie_production_co = "NA"
        movie_box_office_gross = "NA"
        
        #Going through variables one by one which have same tag name and same attributes
        for i in range(0,10):
            if movie_details_label[i].text == "Rating:":
                movie_rating = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Genre:":
                movie_genre = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Original Language:":
                movie_language = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Director:":
                movie_director = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Producer:":
                movie_producer = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Box Office (Gross USA):":
                movie_box_office_gross = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Writer:":
                movie_writer = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Release Date (Theaters):":
                movie_release_date = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Runtime:":
                movie_runtime = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Distributor:":
                movie_distributor = movie_details_value[i].text.strip()
            if movie_details_label[i].text == "Production Co:":
                movie_production_co = movie_details_value[i].text.strip()     

        print(str(movie) + " -> " + movie_title)
                
        #Getting Description of movie
        movie_description = movie_soup.find("p", attrs={"data-qa":"movie-info-synopsis", "slot":"content"}).text.strip()
        
        #Writing the collected details about individual movie to CSV
        writer.writerow([movie_title, movie_rating, movie_genre, movie_language, movie_director, movie_producer, movie_box_office_gross, movie_writer, movie_release_date, movie_runtime, movie_distributor, movie_production_co, movie_description])
    
    except (AttributeError, IndexError):
        pass

#Closing the csv file
file.close()

#Ending the timer and printing it
end = datetime.now()
print(f"Program ran since {start} to {end}")

Mission_Impossible_Dead_Reckoning_Part_One -> Mission: Impossible - Dead Reckoning, Part One
Joy_Ride -> Joy Ride
Biosphere -> Biosphere
Bird_Box_Barcelona -> Bird Box Barcelona
Leave_No_Trace -> Leave No Trace
Nimona -> Nimona
Extraction_2 -> Extraction 2
Annihilation -> Annihilation
A_Man_Called_Otto -> A Man Called Otto
Titanic -> Titanic
Emily_the_Criminal -> Emily the Criminal
The_Nice_Guys -> The Nice Guys
It_Follows -> It Follows
White_House_Down -> White House Down
The_Woman_King -> The Woman King
The_Huntsman_Winters_War -> The Huntsman: Winter's War
Under_the_Shadow -> Under the Shadow
Where_the_Crawdads_Sing -> Where the Crawdads Sing
The_Power_of_the_Dog -> The Power of the Dog
Luther_The_Fallen_Sun -> Luther: The Fallen Sun
To_Leslie -> To Leslie
White_Noise -> White Noise
Glass_Onion_A_Knives_Out_Mystery -> Glass Onion: A Knives Out Mystery
RRR -> RRR
Hunt_for_the_Wilderpeople -> Hunt for the Wilderpeople
The_Lost_Daughter -> The Lost Daughter
The_Perfect_Find -> The Perf

The_Hunger_Games_Catching_Fire -> The Hunger Games: Catching Fire
Moneyball -> Moneyball
Madagascar -> Madagascar
Glengarry_Glen_Ross -> Glengarry Glen Ross
Lone_Survivor -> Lone Survivor
Snatch -> Snatch
Blue_Valentine -> Blue Valentine
Scott_Pilgrim_vs_the_World -> Scott Pilgrim vs. the World
2_Fast_2_Furious -> 2 Fast 2 Furious
Lost_in_Translation -> Lost in Translation
Gangs_of_New_York -> Gangs of New York
10_Things_I_Hate_About_You -> 10 Things I Hate About You
How_to_Train_Your_Dragon_2 -> How to Train Your Dragon 2
Harry_Potter_and_the_Prisoner_of_Azkaban -> Harry Potter and the Prisoner of Azkaban
Hanna -> Hanna
True_Lies -> True Lies
Freaks -> Freaks
2012 -> 2012
The_Invitation -> The Invitation
The_Gangster_the_Cop_the_Devil -> The Gangster, the Cop, the Devil
The_Smurfs -> The Smurfs
God_Is_a_Bullet -> God Is a Bullet
The_Starling_Girl -> The Starling Girl
Program ran since 2023-07-11 09:31:49.592738 to 2023-07-11 09:39:10.897395
