The task is to understand audience preferences for different movie genres and how they have evolved over the last decade. 
Utilize the IMDb dataset to extract information on current top 100 popular movies on IMDb and the top 100 movies in various genres released over the past decade, all specific to the 8 popular production houses. 


# Collection 1

##Most Popular Movies on IMDB
## Import packages

In [None]:
# Import packages
from bs4 import BeautifulSoup
import requests
import time
import os
import warnings
import re
import pandas as pd
from pymongo import MongoClient

## Extract links and store pages

In [None]:
# Fetch the "Most Popular Movies" chart page.
# A browser-like User-Agent is sent with the request.
headers = {'User-Agent': 'Mozilla/5.0'}
url = 'https://www.imdb.com/chart/moviemeter/'

# timeout keeps the cell from hanging forever on a stalled connection
page = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error instead of silently parsing an error page
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
# Collect the anchor element of every movie title on the chart page
links = soup.select('a.ipc-title-link-wrapper')

# A bare `len(links)` in the middle of a cell is discarded, so print it explicitly
print(f"Found {len(links)} title links")

# Show the site-relative href of the first 100 titles
for link in links[:100]:
    print(link['href'])

In [None]:
# Turn every site-relative href into an absolute IMDb URL
top100_links = ['https://www.imdb.com' + anchor['href'] for anchor in links]

# Show the resulting URLs, one per line
for full_url in top100_links:
    print(full_url)

In [None]:
# Download each of the first 100 title pages and store it as "<imdb id>.html"
output_dir = "../Final Project/top100_popular"

# open() below fails if the target folder does not exist yet
os.makedirs(output_dir, exist_ok=True)

# The header is identical for every request, so build it once
headers = {'User-Agent': 'Mozilla/5.0'}

for link in top100_links[:100]:

    # The digits that follow "tt" in the URL become the file name
    match = re.search(r'tt(\d+)/', link)
    if match is None:
        print(f"Skipping link without an IMDb id: {link}")
        continue
    imdb_id = match.group(1)

    # Pause between two requests
    time.sleep(5)

    # A single failing page must not abort the whole download run
    try:
        response = requests.get(link, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Failed to fetch HTML content for {link} ({error})")
        continue

    # Read the content of the html
    soup = BeautifulSoup(response.content, 'html.parser')

    # Name the html file
    file_name = f"{output_dir}/{imdb_id}.html"

    # Write the content to html file
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())

## Process html pages to extract information

In [None]:
# Folder that holds the saved title pages
directory = "../Final Project/top100_popular"

In [None]:
# Parse every saved title page into one flat record (dict) per movie
movies_list = []


def _text_or_none(node):
    """Return the stripped text of a BeautifulSoup node, or None if the node is missing."""
    return node.text.strip() if node is not None else None


# sorted() makes the order of the records reproducible across file systems
for filename in sorted(os.listdir(directory)):

    # The IMDb id is the numeric part of the file name. Entries that do not look
    # like "<digits>.html" (hidden files, sub-folders, ...) are skipped instead of
    # crashing on `match.group`.
    match = re.search(r'(\d+)\.html', filename)
    if match is None:
        continue

    # Read file to string
    filepath = os.path.join(directory, filename)
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
        html = file.read()

    # Use BeautifulSoup to parse the file content
    soup = BeautifulSoup(html, 'html.parser')

    # ID and Title. When several nodes match, the last one wins. 'Title' is always
    # present (None when nothing matched) so that every record has the same keys.
    titles = soup.select('h1 > span.hero__primary-text')
    movie_info = {
        'id': match.group(1),
        'Title': _text_or_none(titles[-1]) if titles else None,
    }

    # Infos: year and rating are picked by position from the list of matching links;
    # which positions are used depends on how many links the page has
    info = soup.select('a.ipc-link.ipc-link--baseAlt.ipc-link--inherit-color')
    if len(info) > 18:
        year_node, rating_node = info[5], info[6]
    elif len(info) > 17:
        year_node, rating_node = None, info[5]
    else:
        year_node, rating_node = None, None
    movie_info['Year'] = _text_or_none(year_node)
    movie_info['Rating'] = _text_or_none(rating_node)

    # Rating score, number of raters and popularity rank (None when absent)
    movie_info['Rating score'] = _text_or_none(soup.select_one('span.sc-bde20123-1.cMEQkK'))
    movie_info['Number of raters'] = _text_or_none(soup.select_one('div.sc-bde20123-3.gPVQxL'))
    movie_info['Popularity rank'] = _text_or_none(soup.select_one('div.sc-5f7fb5b4-1.fTREEx'))

    # Genre: with 2-4 chips the final chip is not stored as a genre (presumably it
    # is a non-genre chip -- TODO confirm); any other chip count stores no genre
    chips = [chip.text.strip() for chip in soup.select('span.ipc-chip__text')]
    genres = chips[:-1] if 2 <= len(chips) <= 4 else []
    for slot in range(3):
        movie_info[f'Genre_{slot + 1}'] = genres[slot] if slot < len(genres) else None

    # Introduction (last match wins, None when absent)
    intro = soup.select('span.sc-466bb6c-2.chnFO')
    movie_info['Introduction'] = _text_or_none(intro[-1]) if intro else None

    # User reviews / critic reviews / metascore: only trusted when all three exist
    review_nodes = soup.select('span.three-Elements > span.score')
    if len(review_nodes) <= 2:
        review_nodes = [None, None, None]
    for key, node in zip(('User reviews', 'Critic reviews', 'Metascore'), review_nodes):
        movie_info[key] = _text_or_none(node)

    # Append movie_info dictionary to movies_list
    movies_list.append(movie_info)

In [None]:
# Show every parsed movie record, one per line
for record in movies_list:
    print(record)

## Create Table and Save the File

In [None]:
# Build a DataFrame with one row per movie record
df = pd.DataFrame(data=movies_list)

# Preview the first five rows
df.head(n=5)

In [None]:
# Persist the table as CSV, leaving out the index column
df.to_csv(path_or_buf='top100_popular.csv', index=False)

## Connect to MongoDB Database

In [None]:
# Connect to the local MongoDB instance.
# The original opened a second, unused default client; one connection is enough.
client = MongoClient('localhost', 27017)
mo_c = client  # old name kept as an alias so any later reference still resolves

# Use the database named 'IMDB'
db = client["IMDB"]

# Use the collection for the current top-100 popular movies
collection = db["top100_popular"]

# Insert all documents in a single round trip.
# insert_many rejects an empty list, hence the guard.
if movies_list:
    collection.insert_many(movies_list)

# Print the collection
for document in collection.find():
    print(document)

#collection 2

In [13]:
#start by importing the necessary libraries
import requests
import json
import time
import os
from bs4 import BeautifulSoup

#1_navigates to the url, a conscious choice was made to use the url that already has all but one filter set. 
#we kept the original url and proceeded to add the filters using selenium. 
#results suggested that using the url with the filters already set was the best choice as it would save time and resources

#the top 8 production houses for which analysis is to be done are: 20th century fox, DreamWorks, MGM, Paramount, sony, Universal, Warner Bros and walt disney

#analysing and formulating the url

#the original url with no filters 
#url_1 = https://www.imdb.com

#the url with that leads to the advanced search page for movies. 
#url_2 = https://www.imdb.com/search/title/?title_type=feature&sort=num_votes,desc
#The search results are sorted in descending order by the number of votes gained irrespective of them being good or bad, meaning the movies with the highest number of votes appear first in the list.

#the url with the filters set to show movies from 2014 to 2024
#url_3 = https://www.imdb.com/search/title/?title_type=feature&release_date=2014-01-01,2024-01-31&sort=num_votes,desc
# "&release_date=2014-01-01,2024-01-31" clause gets added to the url to filter the movies from 2014 to 2024

#the url with the filters set to the top 8 production houses
#url_4 = https://www.imdb.com/search/title/?title_type=feature&companies=fox&release_date=2014-01-01,2024-01-31&sort=num_votes,desc
# "&companies=fox" clause gets added to the url to filter the movies from 2014 to 2024 produced by 20th century fox
#by editing the companies element we can automate the scraping for multiple production houses

#the company keywords used in the url, one for each of the top 8 production houses
# fox
# dreamworks
# mgm
# paramount
# sony
# universal
# disney
# warner

#since all other filters except for the production houses are the same, we can use the same url and just change the production house name in the companies element
#we need to append these to a list so that these urls can be iterated over to get the data for all the production houses

# Company keywords of the top 8 production houses
production_houses = ['fox', 'dreamworks', 'mgm', 'paramount', 'sony', 'universal', 'disney', 'warner']

# Formulate the base URL; "{}" is the placeholder for the company keyword
base_url = 'https://www.imdb.com/search/title/?title_type=feature&companies={}&release_date=2014-01-01,2024-01-31&sort=num_votes,desc'

# One search URL per production house, in the same order as the list above
urls = [base_url.format(house) for house in production_houses]
print(urls)


['https://www.imdb.com/search/title/?title_type=feature&companies=fox&release_date=2014-01-01,2024-01-31&sort=num_votes,desc', 'https://www.imdb.com/search/title/?title_type=feature&companies=dreamworks&release_date=2014-01-01,2024-01-31&sort=num_votes,desc', 'https://www.imdb.com/search/title/?title_type=feature&companies=mgm&release_date=2014-01-01,2024-01-31&sort=num_votes,desc', 'https://www.imdb.com/search/title/?title_type=feature&companies=paramount&release_date=2014-01-01,2024-01-31&sort=num_votes,desc', 'https://www.imdb.com/search/title/?title_type=feature&companies=sony&release_date=2014-01-01,2024-01-31&sort=num_votes,desc', 'https://www.imdb.com/search/title/?title_type=feature&companies=universal&release_date=2014-01-01,2024-01-31&sort=num_votes,desc', 'https://www.imdb.com/search/title/?title_type=feature&companies=disney&release_date=2014-01-01,2024-01-31&sort=num_votes,desc', 'https://www.imdb.com/search/title/?title_type=feature&companies=warner&release_date=2014-01-0

#base code to save the pages as they are - with 50 results with a get request

# The header is identical for every request, so build it once
headers = {'User-Agent': 'Mozilla/5.0'}

# Save the first results page of every production house as "<house>.html"
for production_house in production_houses:
    url = base_url.format(production_house)

    try:
        # timeout keeps one stalled connection from blocking the whole loop
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as error:
        print(f"Failed to fetch HTML content for {url} ({error})")
    else:
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Save the HTML content of the page with the production house name
            with open('{}.html'.format(production_house), 'w', encoding='utf-8') as f:
                f.write(response.text)
        else:
            # Report the skipped house instead of dropping it silently
            print(f"Failed to fetch HTML content for {url} (status {response.status_code})")

    # Pause the loop for 5 seconds
    time.sleep(5)


In [None]:
  #50 movies sounded good but we wanted to check if we can get 100 movies for each of the production house. 
    #The next step was to use selenium to navigate to the url and save the html content of the page
    #preliminary search results show that the html content of the page is not static 
    #the page in the first go loads only the first 50 movies and the rest are loaded as the user scrolls down and presses the "50 more" option
    #but the url of the page does not change when we press the button, html content is saved only for those 50 movies unless the user scrolls down and presses the "50 more" option


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Base class of the Selenium driver errors (wait timeouts, failed clicks, ...)
from selenium.common.exceptions import WebDriverException

# Base URL for IMDb search
base_url = 'https://www.imdb.com/search/title/?title_type=feature&companies={}&release_date=2014-01-01,2024-01-31&sort=num_votes,desc'

# List of production houses
production_houses = ['fox', 'dreamworks', 'mgm', 'paramount', 'sony', 'universal', 'disney', 'warner']

# Define the directory path and create it if it doesn't exist
directory = "C:/UCD MSBA Coursework/Winter quarter/Data Design & Representation/Final Project/100links"
os.makedirs(directory, exist_ok=True)

# Number of titles wanted per production house. The page starts with 50 results and
# each click on "50 more" adds another batch, so stop clicking once this many
# titles are loaded instead of expanding the full result list.
titles_wanted = 100

# Initialize the chrome WebDriver instance
driver = webdriver.Chrome()

# try/finally guarantees that the browser is closed even if a page fails
try:
    for house in production_houses:
        # Navigate to the advanced search page of this production house
        url = base_url.format(house)
        driver.get(url)

        # Press the "50 more" button until enough titles are on the page
        while len(driver.find_elements(By.CSS_SELECTOR, 'a.ipc-title-link-wrapper')) < titles_wanted:
            try:
                # Wait for the "50 more" button to be shown
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.ipc-see-more__button'))
                )
                # Click the "50 more" button
                load_more_button.click()

                # Pause to allow the page to load more movies
                time.sleep(10)
            except WebDriverException:
                # No (clickable) button left, e.g. the house has fewer titles than wanted
                print(f"No more '50 more' button found for {house}.")
                break

        #it is important to give a long pause so that the page content gets loaded completely
        #leaving a shorter pause period like 5 seconds might lead to loss of data
        time.sleep(30)

        # Save the rendered HTML of the page as "<house>.html" in the target folder
        file_path = os.path.join(directory, f'{house}.html')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
finally:
    # Exit the webdriver
    driver.quit()


In [34]:
#import BeautifulSoup to parse the saved pages for individual links
from bs4 import BeautifulSoup
import os

# Define the directory path where HTML files are saved
directory = "C:/UCD MSBA Coursework/Winter quarter/Data Design & Representation/Final Project/100links"

# Full class string of the div that wraps each movie title in the saved pages
title_div_classes = 'ipc-title ipc-title--base ipc-title--title ipc-title-link-no-icon ipc-title--on-textPrimary sc-b0691f29-9 klOwFB dli-title'

# Loop over all saved pages and print every movie title with its house name
for filename in os.listdir(directory):
    # Only HTML files are of interest
    if not filename.endswith(".html"):
        continue

    # The house name is the file name up to the first dot
    house_name = filename.split('.')[0]

    # Open and parse the HTML file
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Find all divs containing movie titles
    movie_divs = soup.find_all('div', class_=title_div_classes)

    for div in movie_divs:
        heading = div.find('h3', class_='ipc-title__text')
        # A title div without the expected <h3> is skipped rather than raising
        # AttributeError on `None.text`
        if heading is None:
            continue
        movie_title = heading.text.strip()
        print(f"{house_name} - Movie Title: {movie_title}")


disney - Movie Title: 1. Guardians of the Galaxy
disney - Movie Title: 2. Avengers: Endgame
disney - Movie Title: 3. Avengers: Infinity War
disney - Movie Title: 4. Deadpool
disney - Movie Title: 5. Star Wars: Episode VII - The Force Awakens
disney - Movie Title: 6. Avengers: Age of Ultron
disney - Movie Title: 7. Captain America: The Winter Soldier
disney - Movie Title: 8. Spider-Man: No Way Home
disney - Movie Title: 9. Captain America: Civil War
disney - Movie Title: 10. Black Panther
disney - Movie Title: 11. Thor: Ragnarok
disney - Movie Title: 12. Doctor Strange
disney - Movie Title: 13. Inside Out
disney - Movie Title: 14. Guardians of the Galaxy Vol. 2
disney - Movie Title: 15. Ant-Man
disney - Movie Title: 16. Star Wars: Episode VIII - The Last Jedi
disney - Movie Title: 17. Captain Marvel
disney - Movie Title: 18. Baby Driver
disney - Movie Title: 19. Coco
disney - Movie Title: 20. Zootopia
disney - Movie Title: 21. Big Hero 6
disney - Movie Title: 22. Star Wars: Episode IX -

In [35]:
#check for the count of the links found in the file
#dreamworks is an exception as they produced only 20 movies in the last decade

# Report how many title links each saved search page contains
for filename in os.listdir(directory):
    # Anything that is not an HTML file is ignored
    if not filename.endswith(".html"):
        continue

    # The house name is the file name up to the first dot
    house_name = filename.split('.')[0]

    # Read and parse the saved page
    page_path = os.path.join(directory, filename)
    with open(page_path, 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    # One anchor per movie title
    links = soup.select('a.ipc-title-link-wrapper')

    # Print the number of links along with the house name
    print(f"{house_name} - Number of links: {len(links)}")


disney - Number of links: 100
dreamworks - Number of links: 20
fox - Number of links: 100
mgm - Number of links: 100
paramount - Number of links: 100
sony - Number of links: 100
universal - Number of links: 100
warner - Number of links: 100


In [None]:
# Print the first 50 hrefs in `links`; at this point `links` still holds the
# anchors of the last file handled by the counting loop
for anchor in links[:50]:
    print(anchor['href'])

/title/tt0816692/?ref_=sr_t_1
/title/tt1431045/?ref_=sr_t_2
/title/tt1392190/?ref_=sr_t_3
/title/tt2267998/?ref_=sr_t_4
/title/tt3659388/?ref_=sr_t_5
/title/tt2278388/?ref_=sr_t_6
/title/tt1663202/?ref_=sr_t_7
/title/tt7131622/?ref_=sr_t_8
/title/tt3315342/?ref_=sr_t_9
/title/tt1877832/?ref_=sr_t_10
/title/tt5013056/?ref_=sr_t_11
/title/tt1631867/?ref_=sr_t_12
/title/tt1386697/?ref_=sr_t_13
/title/tt2802144/?ref_=sr_t_14
/title/tt2562232/?ref_=sr_t_15
/title/tt3783958/?ref_=sr_t_16
/title/tt5463162/?ref_=sr_t_17
/title/tt4154664/?ref_=sr_t_18
/title/tt3890160/?ref_=sr_t_19
/title/tt1396484/?ref_=sr_t_20
/title/tt1727824/?ref_=sr_t_21
/title/tt6644200/?ref_=sr_t_22
/title/tt6966692/?ref_=sr_t_23
/title/tt5027774/?ref_=sr_t_24
/title/tt3183660/?ref_=sr_t_25
/title/tt1790864/?ref_=sr_t_26
/title/tt1677720/?ref_=sr_t_27
/title/tt0974015/?ref_=sr_t_28
/title/tt2103281/?ref_=sr_t_29
/title/tt1950186/?ref_=sr_t_30
/title/tt3385516/?ref_=sr_t_31
/title/tt5580390/?ref_=sr_t_32
/title/tt2584384/

In [41]:
#ensuring that the links are grouped by respective production houses
# Company keywords of the production houses; each has a saved "<house>.html" page
production_houses = ['fox', 'dreamworks', 'mgm', 'paramount', 'sony', 'universal', 'disney', 'warner']

for house in production_houses:
    # Read and parse the saved search page of this production house
    page_path = os.path.join(directory, f"{house}.html")
    with open(page_path, 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    # One anchor per movie title
    links = soup.select('a.ipc-title-link-wrapper')

    # Print up to the first 100 links for the current production house
    print(f"Links for {house}:")
    for anchor in links[:100]:
        print(anchor['href'])
    print()  # blank line between houses


Links for fox:
/title/tt0816692/?ref_=sr_t_1
/title/tt1431045/?ref_=sr_t_2
/title/tt1392190/?ref_=sr_t_3
/title/tt2267998/?ref_=sr_t_4
/title/tt3659388/?ref_=sr_t_5
/title/tt2278388/?ref_=sr_t_6
/title/tt1663202/?ref_=sr_t_7
/title/tt7131622/?ref_=sr_t_8
/title/tt3315342/?ref_=sr_t_9
/title/tt1877832/?ref_=sr_t_10
/title/tt5013056/?ref_=sr_t_11
/title/tt1631867/?ref_=sr_t_12
/title/tt1386697/?ref_=sr_t_13
/title/tt2802144/?ref_=sr_t_14
/title/tt3783958/?ref_=sr_t_15
/title/tt2562232/?ref_=sr_t_16
/title/tt5463162/?ref_=sr_t_17
/title/tt4154664/?ref_=sr_t_18
/title/tt3890160/?ref_=sr_t_19
/title/tt1396484/?ref_=sr_t_20
/title/tt1727824/?ref_=sr_t_21
/title/tt6644200/?ref_=sr_t_22
/title/tt6966692/?ref_=sr_t_23
/title/tt5027774/?ref_=sr_t_24
/title/tt3183660/?ref_=sr_t_25
/title/tt1790864/?ref_=sr_t_26
/title/tt1677720/?ref_=sr_t_27
/title/tt0974015/?ref_=sr_t_28
/title/tt2103281/?ref_=sr_t_29
/title/tt1950186/?ref_=sr_t_30
/title/tt3385516/?ref_=sr_t_31
/title/tt5580390/?ref_=sr_t_32
/t

In [42]:
#formulate the individual title links

# Maps each production house to the absolute URLs of its movie title pages
house_links = {}

for house in production_houses:
    # Read and parse the saved search page of this production house
    page_path = os.path.join(directory, f"{house}.html")
    with open(page_path, 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    # One anchor per movie title
    links = soup.select('a.ipc-title-link-wrapper')

    # Prefix the site-relative hrefs with the IMDb domain and file them under the house
    house_links.setdefault(house, []).extend(
        'https://www.imdb.com' + anchor['href'] for anchor in links
    )

# Print all links, grouped under their house name
for house, links in house_links.items():
    print(f"{house}:")
    for full_link in links:
        print(full_link)



fox:
https://www.imdb.com/title/tt0816692/?ref_=sr_t_1
https://www.imdb.com/title/tt1431045/?ref_=sr_t_2
https://www.imdb.com/title/tt1392190/?ref_=sr_t_3
https://www.imdb.com/title/tt2267998/?ref_=sr_t_4
https://www.imdb.com/title/tt3659388/?ref_=sr_t_5
https://www.imdb.com/title/tt2278388/?ref_=sr_t_6
https://www.imdb.com/title/tt1663202/?ref_=sr_t_7
https://www.imdb.com/title/tt7131622/?ref_=sr_t_8
https://www.imdb.com/title/tt3315342/?ref_=sr_t_9
https://www.imdb.com/title/tt1877832/?ref_=sr_t_10
https://www.imdb.com/title/tt5013056/?ref_=sr_t_11
https://www.imdb.com/title/tt1631867/?ref_=sr_t_12
https://www.imdb.com/title/tt1386697/?ref_=sr_t_13
https://www.imdb.com/title/tt2802144/?ref_=sr_t_14
https://www.imdb.com/title/tt3783958/?ref_=sr_t_15
https://www.imdb.com/title/tt2562232/?ref_=sr_t_16
https://www.imdb.com/title/tt5463162/?ref_=sr_t_17
https://www.imdb.com/title/tt4154664/?ref_=sr_t_18
https://www.imdb.com/title/tt3890160/?ref_=sr_t_19
https://www.imdb.com/title/tt139648

In [45]:


# Download every movie title page and store it under "<base_directory>/<house>/<imdb id>.html"
# NOTE(review): `base_directory` is not assigned in any cell shown here -- it must be
# defined before this cell runs; confirm the intended folder.

# Browser-like user agent; identical for every request, so build it once
headers = {'User-Agent': 'Mozilla/5.0'}

# Loop through each production house
for house in production_houses:
    # Create a directory for the current production house if it doesn't exist
    house_directory = os.path.join(base_directory, house)
    os.makedirs(house_directory, exist_ok=True)

    # Loop through each link of the current production house and save the HTML content
    for link in house_links[house]:
        try:
            # timeout keeps one stalled connection from blocking the crawl
            response = requests.get(link, headers=headers, timeout=30)
        except requests.RequestException as error:
            # A single unreachable page must not abort the remaining downloads
            print(f"Failed to fetch HTML content for {link} ({error})")
        else:
            if response.status_code == 200:
                # Parse the HTML content
                soup = BeautifulSoup(response.content, 'html.parser')
                # The IMDb id (e.g. "tt0816692") is the second-to-last "/"-separated
                # piece of the link
                imdb_id = link.split('/')[-2]
                # Save the page inside the production house directory
                file_path = os.path.join(house_directory, f"{imdb_id}.html")
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(soup.prettify())
                print(f"Saved HTML content for {imdb_id} in {house} directory.")
            else:
                print(f"Failed to fetch HTML content for {link}")

        # Pause between two requests
        time.sleep(5)

Saved HTML content for tt0816692 in fox directory.
Saved HTML content for tt1431045 in fox directory.
Saved HTML content for tt1392190 in fox directory.
Saved HTML content for tt2267998 in fox directory.
Saved HTML content for tt3659388 in fox directory.
Saved HTML content for tt2278388 in fox directory.
Saved HTML content for tt1663202 in fox directory.
Saved HTML content for tt7131622 in fox directory.
Saved HTML content for tt3315342 in fox directory.
Saved HTML content for tt1877832 in fox directory.
Saved HTML content for tt5013056 in fox directory.
Saved HTML content for tt1631867 in fox directory.
Saved HTML content for tt1386697 in fox directory.
Saved HTML content for tt2802144 in fox directory.
Saved HTML content for tt3783958 in fox directory.
Saved HTML content for tt2562232 in fox directory.
Saved HTML content for tt5463162 in fox directory.
Saved HTML content for tt4154664 in fox directory.
Saved HTML content for tt3890160 in fox directory.
Saved HTML content for tt139648

In [50]:
#Extract the required movie components for each of the extracted movie pages

#out of the 720 links, 5 were unresponsive so we will consider them as missing values and drop them

# Parse every saved title page into one flat record (dict) per movie,
# tagged with the production house whose folder it was found in
movies_list = []


def _text_or_none(node):
    """Return the stripped text of a BeautifulSoup node, or None if the node is missing."""
    return node.text.strip() if node is not None else None


# Loop through each production house
for house in production_houses:
    # Directory that holds the saved pages of the current production house
    house_directory = os.path.join(base_directory, house)

    # sorted() makes the order of the records reproducible across file systems
    for filename in sorted(os.listdir(house_directory)):

        # The IMDb id is the numeric part of the file name. Entries that do not look
        # like "<digits>.html" (hidden files, sub-folders, ...) are skipped instead
        # of crashing on `match.group`.
        match = re.search(r'(\d+)\.html', filename)
        if match is None:
            continue

        # Read file to string
        filepath = os.path.join(house_directory, filename)
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            html = file.read()

        # Use BeautifulSoup to parse the file content
        soup = BeautifulSoup(html, 'html.parser')

        # ID and Title. When several nodes match, the last one wins. 'Title' is always
        # present (None when nothing matched) so that every record has the same keys.
        titles = soup.select('h1 > span.hero__primary-text')
        movie_info = {
            'id': match.group(1),
            'Title': _text_or_none(titles[-1]) if titles else None,
        }

        # Infos: year and rating are picked by position from the list of matching
        # links; which positions are used depends on how many links the page has
        info = soup.select('a.ipc-link.ipc-link--baseAlt.ipc-link--inherit-color')
        if len(info) > 18:
            year_node, rating_node = info[5], info[6]
        elif len(info) > 17:
            year_node, rating_node = None, info[5]
        else:
            year_node, rating_node = None, None
        movie_info['Year'] = _text_or_none(year_node)
        movie_info['Rating'] = _text_or_none(rating_node)

        # Add house name (kept right after 'Rating' so the column order is unchanged)
        movie_info['House'] = house

        # Rating score, number of raters and popularity rank (None when absent)
        movie_info['Rating score'] = _text_or_none(soup.select_one('span.sc-bde20123-1.cMEQkK'))
        movie_info['Number of raters'] = _text_or_none(soup.select_one('div.sc-bde20123-3.gPVQxL'))
        movie_info['Popularity rank'] = _text_or_none(soup.select_one('div.sc-5f7fb5b4-1.fTREEx'))

        # Genre: with 2-4 chips the final chip is not stored as a genre (presumably
        # it is a non-genre chip -- TODO confirm); any other chip count stores no genre
        chips = [chip.text.strip() for chip in soup.select('span.ipc-chip__text')]
        genres = chips[:-1] if 2 <= len(chips) <= 4 else []
        for slot in range(3):
            movie_info[f'Genre_{slot + 1}'] = genres[slot] if slot < len(genres) else None

        # Introduction (last match wins, None when absent)
        intro = soup.select('span.sc-466bb6c-2.chnFO')
        movie_info['Introduction'] = _text_or_none(intro[-1]) if intro else None

        # User reviews / critic reviews / metascore: only trusted when all three exist
        review_nodes = soup.select('span.three-Elements > span.score')
        if len(review_nodes) <= 2:
            review_nodes = [None, None, None]
        for key, node in zip(('User reviews', 'Critic reviews', 'Metascore'), review_nodes):
            movie_info[key] = _text_or_none(node)

        # Append movie_info dictionary to movies_list
        movies_list.append(movie_info)



In [51]:
# Show every parsed movie record, one per line
for record in movies_list:
    print(record)

{'id': '0437086', 'Title': 'Alita: Battle Angel', 'Year': '2019', 'Rating': 'PG-13', 'House': 'fox', 'Rating score': '7.3', 'Number of raters': '292K', 'Popularity rank': '1,200', 'Genre_1': 'Action', 'Genre_2': 'Adventure', 'Genre_3': 'Sci-Fi', 'Introduction': "A deactivated cyborg's revived, but can't remember anything of her past and goes on a quest to find out who she is.", 'User reviews': '2.9K', 'Critic reviews': '398', 'Metascore': '53'}
{'id': '0816692', 'Title': 'Interstellar', 'Year': '2014', 'Rating': 'PG-13', 'House': 'fox', 'Rating score': '8.7', 'Number of raters': '2.1M', 'Popularity rank': '48', 'Genre_1': 'Adventure', 'Genre_2': 'Drama', 'Genre_3': 'Sci-Fi', 'Introduction': 'When Earth becomes uninhabitable in the future, a farmer and ex-NASA pilot, Joseph Cooper, is tasked to pilot a spacecraft, along with a team of researchers, to find a new planet for humans.', 'User reviews': '5.6K', 'Critic reviews': '487', 'Metascore': '74'}
{'id': '0831387', 'Title': 'Godzilla',

In [53]:
#import the panda and numpy libraries
import pandas as pd
import numpy as ny

# Build a DataFrame with one row per movie record
df_decade = pd.DataFrame(data=movies_list)

# Preview the first five rows
df_decade.head(n=5)

Unnamed: 0,id,Title,Year,Rating,House,Rating score,Number of raters,Popularity rank,Genre_1,Genre_2,Genre_3,Introduction,User reviews,Critic reviews,Metascore
0,437086,Alita: Battle Angel,2019,PG-13,fox,7.3,292K,1200,Action,Adventure,Sci-Fi,"A deactivated cyborg's revived, but can't reme...",2.9K,398,53
1,816692,Interstellar,2014,PG-13,fox,8.7,2.1M,48,Adventure,Drama,Sci-Fi,When Earth becomes uninhabitable in the future...,5.6K,487,74
2,831387,Godzilla,2014,PG-13,fox,6.4,437K,425,Action,Adventure,Sci-Fi,The world is beset by the appearance of monstr...,1.7K,666,62
3,974015,Justice League,2017,PG-13,fox,6.1,477K,1227,Action,Adventure,Fantasy,Fueled by his restored faith in humanity and i...,2.3K,466,45
4,10640346,Babylon,2022,R,fox,7.1,165K,215,Comedy,Drama,History,A tale of outsized ambition and outrageous exc...,941,310,61


In [54]:
# Persist the table as CSV, leaving out the index column
df_decade.to_csv(path_or_buf='last_decade_popular.csv', index=False)

In [56]:
#import the module
import pymongo
from pymongo import MongoClient

# Connect to the local MongoDB instance.
# The original opened a second, unused default client; one connection is enough.
client = MongoClient('localhost', 27017)
mo_c = client  # old name kept as an alias so any later reference still resolves

In [59]:
# Use the database named 'IMDB'
db = client["IMDB"]

# Use the collection for the last decade's popular movies per production house
collection = db["Last_decade_popular"]

# Insert all documents in a single round trip.
# insert_many rejects an empty list, hence the guard.
if movies_list:
    collection.insert_many(movies_list)

# Print the collection
for document in collection.find():
    print(document)

{'_id': ObjectId('65ff98718cd911a9acd7c432'), 'id': '0437086', 'Title': 'Alita: Battle Angel', 'Year': '2019', 'Rating': 'PG-13', 'House': 'fox', 'Rating score': '7.3', 'Number of raters': '292K', 'Popularity rank': '1,200', 'Genre_1': 'Action', 'Genre_2': 'Adventure', 'Genre_3': 'Sci-Fi', 'Introduction': "A deactivated cyborg's revived, but can't remember anything of her past and goes on a quest to find out who she is.", 'User reviews': '2.9K', 'Critic reviews': '398', 'Metascore': '53'}
{'_id': ObjectId('65ff98718cd911a9acd7c433'), 'id': '0816692', 'Title': 'Interstellar', 'Year': '2014', 'Rating': 'PG-13', 'House': 'fox', 'Rating score': '8.7', 'Number of raters': '2.1M', 'Popularity rank': '48', 'Genre_1': 'Adventure', 'Genre_2': 'Drama', 'Genre_3': 'Sci-Fi', 'Introduction': 'When Earth becomes uninhabitable in the future, a farmer and ex-NASA pilot, Joseph Cooper, is tasked to pilot a spacecraft, along with a team of researchers, to find a new planet for humans.', 'User reviews': 

#check querying with an example

In [63]:
# Keep only the movies that have a Metascore and whose Metascore is below 20
filtered_movies = [
    movie
    for movie in movies_list
    if movie.get('Metascore') is not None and int(movie['Metascore']) < 20
]

# Show the matching movies, one per line
for low_scored in filtered_movies:
    print(low_scored)


{'id': '13929998', 'Title': 'On a Wing and a Prayer', 'Year': '2023', 'Rating': 'PG', 'House': 'mgm', 'Rating score': '5.5', 'Number of raters': '5.2K', 'Popularity rank': None, 'Genre_1': 'Drama', 'Genre_2': None, 'Genre_3': None, 'Introduction': 'After their pilot dies unexpectedly mid-flight, Doug White (Dennis Quaid) has to safely land the plane and save his entire family from insurmountable danger.', 'User reviews': '165', 'Critic reviews': '30', 'Metascore': '19', '_id': ObjectId('65ff98718cd911a9acd7c4c1')}


In [None]:
#DB and collections have been set up right