# install dependencies

In [2]:
!pip install pymongo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymongo
  Downloading pymongo-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.1/492.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.3.0 pymongo-4.3.3


# import libraries

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pymongo
from getpass import getpass
# import sqlite3
from sqlalchemy import create_engine
from sqlalchemy import text as sql_text

# SQLite Engine

In [4]:
SQL_ENGINE = create_engine("sqlite:///foo.db")

# Functions

## Fetch data from imdb

In [5]:

def get_data(link: str, pages: int):
    # link = "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"
    # pages = 2
    data = []
    for page in range(pages):
        url = f"{link}&start={page*50}"
        ack = requests.get(url)
        getdata = BeautifulSoup(ack.content, "html.parser")

        movies = getdata.find_all("div", {"class": "lister-item mode-advanced"})
        for movie in movies:
            rank = (
                movie.find_all("span", {"name": "nv"})[-1].text.strip("#")
                if len(movie.find_all("span", {"name": "nv"})) > 1
                else ""
            )
            name = movie.find("h3", {"class": "lister-item-header"}).a.text.strip()
            year = movie.find("span", {"class": "lister-item-year"}).text.strip("()")
            rating = movie.find("div", {"class": "inline-block ratings-imdb-rating"})[
                "data-value"
            ]
            duration = movie.find("span", {"class": "runtime"}).text.strip(" min")
            metascore = (
                movie.find("span", {"class": "metascore"}).text.strip()
                if movie.find("span", {"class": "metascore"})
                else ""
            )
            directors = [
                director.text
                for director in movie.find_all(
                    "a", {"href": lambda href: href and "/name/" in href}
                )
            ]
            director = directors[0] if directors else ""

            data.append([rank, name, year, rating, duration, metascore, director])
    return pd.DataFrame(
        data,
        columns=[
            "Rank",
            "Name",
            "Year",
            "Rating",
            "Duration (mins)",
            "Metascore",
            "Director",
        ],
    )



## insert data into mongo

In [6]:
def insert_into_db(uri: str, data_frame: pd.DataFrame):
    client = pymongo.MongoClient(uri)
    db = client["Jisons"]
    collection = db["movies"]
    data_dict = data_frame.to_dict(orient="records")
    # drop collection if already exists
    if db.drop_collection("movies"):
        print(f"Dropped already existing collection!")
    collection.insert_many(data_dict)
    client.close()


## Test mongodb Query 

In [7]:
def test_query(uri: str):
    client = pymongo.MongoClient(uri)
    db = client["Jisons"]
    collection = db["movies"]

    documents = collection.find().limit(10)
    document_list = list(documents)
    df2 = pd.DataFrame(document_list)
    return df2
    # print(df2.head(10))


# Main Code

## set up mongodb connections

In [8]:
mongo_db_password = getpass() # 1WgLZ0xLWIAXPvJe
db_uri = f"mongodb+srv://tulikayadav16:{mongo_db_password}@cluster0.ykvdygv.mongodb.net/?retryWrites=true&w=majority"
link = "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"
pages = 2



··········


In [9]:
df = get_data(link=link, pages=pages)
insert_into_db(uri=db_uri, data_frame=df)

Dropped already existing collection!


In [10]:
query_df = test_query(db_uri)
display(query_df)

Unnamed: 0,_id,Rank,Name,Year,Rating,Duration (mins),Metascore,Director
0,644111969d8dfe81659a1cf7,1,The Shawshank Redemption,1994,9.3,142,82,Frank Darabont
1,644111969d8dfe81659a1cf8,2,The Godfather,1972,9.2,175,100,Francis Ford Coppola
2,644111969d8dfe81659a1cf9,3,The Dark Knight,2008,9.0,152,84,Christopher Nolan
3,644111969d8dfe81659a1cfa,6,Schindler's List,1993,9.0,195,95,Oskar Schindler
4,644111969d8dfe81659a1cfb,7,The Lord of the Rings: The Return of the King,2003,9.0,201,94,Peter Jackson
5,644111969d8dfe81659a1cfc,5,12 Angry Men,1957,9.0,96,97,Sidney Lumet
6,644111969d8dfe81659a1cfd,4,The Godfather Part II,1974,9.0,202,90,Francis Ford Coppola
7,644111969d8dfe81659a1cfe,8,Pulp Fiction,1994,8.9,154,95,Quentin Tarantino
8,644111969d8dfe81659a1cff,14,Inception,2010,8.8,148,74,Christopher Nolan
9,644111969d8dfe81659a1d00,9,The Lord of the Rings: The Fellowship of the Ring,2001,8.8,178,92,Peter Jackson


In [11]:
query_df.drop("_id", inplace=True, axis=1)

In [12]:
query_df.columns = [
    i.lower().replace(" ", "_").replace("(", "").replace(")", "")
    for i in query_df.columns
]

In [13]:
query_df.head()

Unnamed: 0,rank,name,year,rating,duration_mins,metascore,director
0,1,The Shawshank Redemption,1994,9.3,142,82,Frank Darabont
1,2,The Godfather,1972,9.2,175,100,Francis Ford Coppola
2,3,The Dark Knight,2008,9.0,152,84,Christopher Nolan
3,6,Schindler's List,1993,9.0,195,95,Oskar Schindler
4,7,The Lord of the Rings: The Return of the King,2003,9.0,201,94,Peter Jackson


## insert into SQLite

## Execute Queries