In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = "https://www.imdb.com/title/tt6836936/reviews/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/115.0 Safari/537.36"
}

def scrape_reviews(url):
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    reviews_data = []

    for article in soup.select("article.user-review-item"):
        # rating (some reviews don't have one)
        rating_tag = article.select_one("span.ipc-rating-star--rating")
        rating = rating_tag.get_text(strip=True) if rating_tag else None

        # title
        title_tag = article.select_one("a[data-testid='review-title']")
        title = title_tag.get_text(strip=True) if title_tag else None

        # text
        text_tag = article.select_one("div[data-testid='review-text']") \
                   or article.select_one("div.ipc-html-content-inner-div")
        text = text_tag.get_text(strip=True) if text_tag else None

        # date
        date_tag = article.select_one("li.review-date")
        date = date_tag.get_text(strip=True) if date_tag else None

        reviews_data.append({
            "rating": rating,
            "text": text,
            "date": date
        })

    return reviews_data

print("Scraping first page of reviews…")
reviews = scrape_reviews(URL)

Scraping first page of reviews…


In [13]:
df=pd.DataFrame(reviews)
ls=[]
for i in range(len(df)):
    s=df['date'][i]
    if len(s)==11:
        d=int(s[4:5])
        d-=1
        s=s[:4]+str(d)+s[5:]
    elif len(s)==12:
        d=int(s[4:6])
        d-=1
        s=s[:4]+str(d)+s[6:]
    ls.append(s)
df['date']=ls
df

Unnamed: 0,rating,text,date
0,6.0,Saaho is entertaining. It is a proper Bollywoo...,"Jan 29, 2022"
1,6.0,The base story is good. Audience are left in c...,"Sep 0, 2019"
2,4.0,Looks like the story was inspired from Largo w...,"Sep 0, 2019"
3,1.0,Saaho...! There are so many problems with this...,"Aug 29, 2019"
4,2.0,Dont believe all the 10 star ratings and revie...,"Sep 7, 2019"
5,,I have seen several flop or below average film...,"Aug 30, 2019"
6,6.0,1) Prabhas Hindi sucks\n2) Imaginary UnderWorl...,"Aug 29, 2019"
7,2.0,"Watched it in Hindi, god why didn't they hire ...","Aug 29, 2019"
8,9.0,Good movie with action scenes but songs placem...,"Aug 28, 2019"
9,6.0,Bad:\nBoring 1st half\nVery slow and confusing...,"Aug 28, 2019"


In [19]:
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df


Unnamed: 0,rating,text,date
0,6.0,Saaho is entertaining. It is a proper Bollywoo...,2022-01-30
1,6.0,The base story is good. Audience are left in c...,2019-09-01
2,4.0,Looks like the story was inspired from Largo w...,2019-09-01
3,1.0,Saaho...! There are so many problems with this...,2019-08-30
4,2.0,Dont believe all the 10 star ratings and revie...,2019-09-08
5,,I have seen several flop or below average film...,2019-08-31
6,6.0,1) Prabhas Hindi sucks\n2) Imaginary UnderWorl...,2019-08-30
7,2.0,"Watched it in Hindi, god why didn't they hire ...",2019-08-30
8,9.0,Good movie with action scenes but songs placem...,2019-08-29
9,6.0,Bad:\nBoring 1st half\nVery slow and confusing...,2019-08-29


In [21]:
df = df.sort_values(by="date").reset_index()

df

Unnamed: 0,index,rating,text,date
0,21,8.0,This movie isn't favorable for Sentiment Movie...,2019-08-29
1,18,10.0,If you're the fan of action thrillers that you...,2019-08-29
2,8,9.0,Good movie with action scenes but songs placem...,2019-08-29
3,9,6.0,Bad:\nBoring 1st half\nVery slow and confusing...,2019-08-29
4,11,9.0,SAAHO is a fantastic experience. Never seen ac...,2019-08-29
5,20,1.0,Would rate it 0.. if i had the option...\nThe ...,2019-08-30
6,7,2.0,"Watched it in Hindi, god why didn't they hire ...",2019-08-30
7,6,6.0,1) Prabhas Hindi sucks\n2) Imaginary UnderWorl...,2019-08-30
8,23,1.0,No common sense. Hopeless acting. Worst direct...,2019-08-30
9,22,6.0,Watch on your risk\nTime and money wasted\nPra...,2019-08-30


In [22]:
df = pd.DataFrame(reviews)
df.to_csv('imdb_sample.csv')

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rating  24 non-null     object
 1   text    23 non-null     object
 2   date    25 non-null     object
dtypes: object(3)
memory usage: 732.0+ bytes
