<a href="https://colab.research.google.com/github/Prakhar021-hub/Deep-Learning-Notebooks/blob/main/Text_preprocessing_on_TMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
from tqdm import tqdm

In [2]:


API_KEY = "8265bd1679663a7ea12ac168da84d2e8"
BASE_URL = "https://api.themoviedb.org/3/movie/top_rated"
GENRE_URL = "https://api.themoviedb.org/3/genre/movie/list"

# 1. Get genre mapping
genre_response = requests.get(GENRE_URL, params={"api_key": API_KEY, "language": "en-US"})
genre_data = genre_response.json()
genre_map = {g["id"]: g["name"] for g in genre_data["genres"]}

# 2. Collect movie data from all 471 pages
all_movies = []

for page in tqdm(range(1, 472)):  # 1 to 471
    response = requests.get(BASE_URL, params={"api_key": API_KEY, "language": "en-US", "page": page})
    data = response.json()

    if "results" not in data:
        continue

    for movie in data["results"]:
        name = movie.get("title")
        review = movie.get("overview")  # or movie.get("vote_average") if tutor means rating
        genre_ids = movie.get("genre_ids", [])

        # map IDs to names
        genres = [genre_map.get(gid, "Unknown") for gid in genre_ids]

        all_movies.append({
            "name": name,
            "review": review,
            "genre": ", ".join(genres)
        })

# 3. Convert to DataFrame
df = pd.DataFrame(all_movies)

# Save
df.to_csv("tmdb_top_rated_movies.csv", index=False)

print(df.head())


100%|██████████| 471/471 [01:08<00:00,  6.89it/s]


                       name  \
0  The Shawshank Redemption   
1             The Godfather   
2     The Godfather Part II   
3          Schindler's List   
4              12 Angry Men   

                                              review                genre  
0  Imprisoned in the 1940s for the double murder ...         Drama, Crime  
1  Spanning the years 1945 to 1955, a chronicle o...         Drama, Crime  
2  In the continuing saga of the Corleone crime f...         Drama, Crime  
3  The true story of how businessman Oskar Schind...  Drama, History, War  
4  The defense and the prosecution have rested an...                Drama  


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9420 entries, 0 to 9419
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    9420 non-null   object
 1   review  9420 non-null   object
 2   genre   9420 non-null   object
dtypes: object(3)
memory usage: 220.9+ KB


In [5]:
## Lower casing

df = df.apply(lambda x: x.str.lower())


In [7]:
df.head(20)

Unnamed: 0,name,review,genre
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,"drama, crime"
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime"
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime"
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war"
4,12 angry men,the defense and the prosecution have rested an...,drama
5,spirited away,"a young girl, chihiro, becomes trapped in a st...","animation, family, fantasy"
6,the dark knight,batman raises the stakes in his war on crime. ...,"drama, action, crime, thriller"
7,dilwale dulhania le jayenge,"raj is a rich, carefree, happy-go-lucky second...","comedy, drama, romance"
8,the green mile,a supernatural tale set on death row in a sout...,"fantasy, drama, crime"
9,parasite,"all unemployed, ki-taek's family takes peculia...","comedy, thriller, drama"


In [13]:
## Removing punctuations from the reviews column

import string

def remove_punctuation1(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['review_clean'] = df['review'].apply(remove_punctuation1)



In [14]:
df.head()

Unnamed: 0,name,review,genre,review_clean
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,"drama, crime",imprisoned in the 1940s for the double murder ...
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime",spanning the years 1945 to 1955 a chronicle of...
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime",in the continuing saga of the corleone crime f...
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war",the true story of how businessman oskar schind...
4,12 angry men,the defense and the prosecution have rested an...,drama,the defense and the prosecution have rested an...


In [20]:
## removing the stop words

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word.lower() not in stop_words])

# Apply to your reviews column
df['review_clean'] = df['review'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
df.head()

Unnamed: 0,name,review,genre,review_clean
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,"drama, crime","imprisoned 1940s double murder wife lover, ups..."
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime","spanning years 1945 1955, chronicle fictional ..."
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime","continuing saga corleone crime family, young v..."
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war",true story businessman oskar schindler saved t...
4,12 angry men,the defense and the prosecution have rested an...,drama,defense prosecution rested jury filing jury ro...


In [24]:
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [25]:
## Tokenisation

from nltk.tokenize import word_tokenize

# make sure you have downloaded punkt tokenizer once
# nltk.download('punkt')

df['review_tokens'] = df['review_clean'].apply(word_tokenize)


In [26]:
df.head()

Unnamed: 0,name,review,genre,review_clean,review_tokens
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,"drama, crime","imprisoned 1940s double murder wife lover, ups...","[imprisoned, 1940s, double, murder, wife, love..."
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime","spanning years 1945 1955, chronicle fictional ...","[spanning, years, 1945, 1955, ,, chronicle, fi..."
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime","continuing saga corleone crime family, young v...","[continuing, saga, corleone, crime, family, ,,..."
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war",true story businessman oskar schindler saved t...,"[true, story, businessman, oskar, schindler, s..."
4,12 angry men,the defense and the prosecution have rested an...,drama,defense prosecution rested jury filing jury ro...,"[defense, prosecution, rested, jury, filing, j..."


In [27]:
## Stemming

from nltk.stem import PorterStemmer

# Initialize stemmer
ps = PorterStemmer()

# Apply stemming on tokens
df['review_stemmed'] = df['review_tokens'].apply(lambda tokens: [ps.stem(word) for word in tokens])

print(df[['review_clean', 'review_tokens', 'review_stemmed']])


                                           review_clean  \
0     imprisoned 1940s double murder wife lover, ups...   
1     spanning years 1945 1955, chronicle fictional ...   
2     continuing saga corleone crime family, young v...   
3     true story businessman oskar schindler saved t...   
4     defense prosecution rested jury filing jury ro...   
...                                                 ...   
9415  minnesota, 1990. detective bruce kenner invest...   
9416  eruption icelandic volcano eyjafjallajökull ca...   
9417  small-town pennsylvania schoolteacher linda si...   
9418  musical comedy, valerie dealing philandering f...   
9419  young nurse kidnapped group violent teens esca...   

                                          review_tokens  \
0     [imprisoned, 1940s, double, murder, wife, love...   
1     [spanning, years, 1945, 1955, ,, chronicle, fi...   
2     [continuing, saga, corleone, crime, family, ,,...   
3     [true, story, businessman, oskar, schindler, s...

In [28]:
df.head()

Unnamed: 0,name,review,genre,review_clean,review_tokens,review_stemmed
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,"drama, crime","imprisoned 1940s double murder wife lover, ups...","[imprisoned, 1940s, double, murder, wife, love...","[imprison, 1940, doubl, murder, wife, lover, ,..."
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime","spanning years 1945 1955, chronicle fictional ...","[spanning, years, 1945, 1955, ,, chronicle, fi...","[span, year, 1945, 1955, ,, chronicl, fiction,..."
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime","continuing saga corleone crime family, young v...","[continuing, saga, corleone, crime, family, ,,...","[continu, saga, corleon, crime, famili, ,, you..."
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war",true story businessman oskar schindler saved t...,"[true, story, businessman, oskar, schindler, s...","[true, stori, businessman, oskar, schindler, s..."
4,12 angry men,the defense and the prosecution have rested an...,drama,defense prosecution rested jury filing jury ro...,"[defense, prosecution, rested, jury, filing, j...","[defens, prosecut, rest, juri, file, juri, roo..."
