In [1]:
import kagglehub
import pandas as pd
import os


In [2]:
dataset_path = kagglehub.dataset_download("raedaddala/top-500-600-movies-of-each-year-from-1960-to-2024")

csv_filename = os.path.join(dataset_path, "final_dataset.csv")
df = pd.read_csv(csv_filename)

print("Loaded dataset from:", csv_filename)
print(df.head())


Loaded dataset from: C:\Users\rohan\.cache\kagglehub\datasets\raedaddala\top-500-600-movies-of-each-year-from-1960-to-2024\versions\9\final_dataset.csv
          id                       title duration        mpa  rating votes  \
0  tt0027483          The Crimson Circle   1h 16m        NaN     6.4    30   
1  tt0058131  The Mystery of Thug Island   1h 36m        NaN     5.0   114   
2  tt0042760   Las mujeres de mi general   1h 52m  Not Rated     6.8    74   
3  tt0027667                Gentle Julia    1h 2m   Approved     6.8    38   
4  tt0055747              Love at Twenty   1h 50m        NaN     7.2  2.5K   

   méta_score                                        description  \
0         NaN  An extortion ring murders anyone who refuses t...   
1         NaN  Three year old Ada, daughter of the British ca...   
2         NaN  Infante stars as a rebel general caught up in ...   
3         NaN  A shy newspaperman (Brown) nearly gives up whe...   
4         NaN  "Love at Twenty" unites 

In [3]:
def convert_duration_to_minutes(duration):
    if pd.isna(duration):
        return 0  
    if isinstance(duration, int):
        return duration  

    total_minutes = 0
    parts = duration.split()
    for part in parts:
        if 'h' in part:
            total_minutes += int(part[:-1]) * 60 
        elif 'm' in part:
            total_minutes += int(part[:-1])  
    return total_minutes

df['duration'] = df['duration'].apply(convert_duration_to_minutes)




In [4]:
for col in df.columns:
    if df[col].dtype in ['int', 'int64', 'float64']: 
        df[col].fillna(0, inplace=True)
    else:
        df[col].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


In [5]:
import ast
import pandas as pd

In [6]:
def parse_list(cell):
    if pd.isnull(cell) or cell == "":
        return []
    
    # Handle numeric values (int, float)
    if isinstance(cell, (int, float)):
        return [str(cell)]
    
    # Handle string values
    if isinstance(cell, str):
        try:
            return ast.literal_eval(cell)  
        except (ValueError, SyntaxError):
            return cell.split(", ")
    
    # Handle any other type
    return [str(cell)]

In [7]:
import re
import ast

def get_cleaned_name_string(name_list):
    cleaned_names = []
    if isinstance(name_list, str):
        if name_list != "":
            name_list = ast.literal_eval(name_list)
    if not isinstance(name_list, list):
        name_list = []
    for name in name_list:
        cleaned_name = re.sub(r'[^a-zA-Z]', '', name).lower()
        cleaned_names.append(cleaned_name)
    return ' '.join(cleaned_names)


In [8]:
def get_cleaned_locations(locations):
    cleaned_locations = []
    if isinstance(locations, str):
        if locations != "":
            locations = ast.literal_eval(locations)
    if not isinstance(locations, list):
        locations = []
    for location in locations:
        cleaned_location = desc_cleaning(location)
        cleaned_locations.append(cleaned_location)
    return ' '.join(cleaned_locations)

In [9]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

In [10]:
stop_words = set(stopwords.words('english'))
tokenizer = TreebankWordTokenizer()

In [11]:
def name_cleaning(name):
    name = (name or "").lower()
    words = tokenizer.tokenize(name)
    filtered_words = [w for w in words if w not in stop_words]
    return " ".join(filtered_words)

def desc_cleaning(desc):
    desc = (desc or "").lower()
    desc = re.sub(r"[^a-z\s]", "", desc)
    words = tokenizer.tokenize(desc)
    filtered_words = [w for w in words if w not in stop_words]
    return " ".join(filtered_words)

def get_cleaned_doc(row):
    new_name = name_cleaning(row["title"])
    new_desc = desc_cleaning(row["description"])
    new_stars = get_cleaned_name_string(row["stars"])
    new_directors = get_cleaned_name_string(row["directors"])
    new_genres = get_cleaned_locations(row["genres"])
    new_production = get_cleaned_locations(row["production_companies"])
    new_filming_locations = get_cleaned_locations(row["filming_locations"])
    new_language = get_cleaned_name_string(row["languages"])
    new_countries = get_cleaned_name_string(row["countries_origin"])
    doc_list = [
        new_name,
        new_desc,
        new_stars,
        new_directors,
        new_genres,
        new_production,
        new_filming_locations,
        new_language,
        new_countries,
    ]
    return " ".join(doc_list)

In [12]:


# --- Build movie_data_dict safely ---
movie_data_dict = {}

# First iteration to store cleaned document data
for _, row in df.iterrows():
    movie_id = row["id"]
    movie_data_dict[movie_id] = {
        "docs": get_cleaned_doc(row)
    }



In [13]:
def ensure_year_column(df):
    """
    Ensures the DataFrame has a 'year' column.
    If missing, it extracts the year from 'release_date'.
    If 'release_date' is missing or invalid, fills with 0.
    """
    if 'year' not in df.columns:
        # Extract year from release_date (format YYYY-MM-DD)
        df['year'] = df.get('release_date', '').astype(str).str.extract(r'(\d{4})')
        # Convert to integer, fill NaN with 0
        df['year'] = df['year'].fillna(0).astype(int)
    return df

# Usage:
df = ensure_year_column(df)


In [14]:
for index, row in df.iterrows():
    print(row)
    break  # remove break to see all rows



id                                                               tt0027483
title                                                   The Crimson Circle
duration                                                                76
mpa                                                                       
rating                                                                 6.4
votes                                                                   30
méta_score                                                             0.0
description              An extortion ring murders anyone who refuses t...
movie_link                           https://www.imdb.com/title/tt0027483/
writers                  ['Reginald Denham', 'Edgar Wallace', 'Howard I...
directors                                              ['Reginald Denham']
stars                    ['Hugh Wakefield', 'Alfred Drayton', 'Niall Ma...
budget                                                                    
opening_weekend_gross    

In [15]:
# Second iteration to add other movie data
for _, row in df.iterrows():
    movie_id = row["id"]
    if movie_id in movie_data_dict:
        movie_data_dict[movie_id].update({
            "title": row["title"],
            "year": row["year"],
            "duration": row["duration"],
            "MPA": row["mpa"],
            "rating": row["rating"],
            "votes": row["votes"],
            "meta_score": row["méta_score"],
            "description": row["description"],
            "Movie_Link": row["movie_link"],
            "writers": parse_list(row["writers"]),
            "directors": parse_list(row["directors"]),
            "stars": parse_list(row["stars"]),
            "budget": row["budget"],
            "opening_weekend_gross": row["opening_weekend_gross"],
            "gross_worldwide": row["gross_worldwide"],
            "gross_us_canada": row["gross_us_canada"],
            "release_date": row["release_date"],
            "countries_origin": parse_list(row["countries_origin"]),
            "filming_locations": parse_list(row["filming_locations"]),
            "production_companies": parse_list(row["production_companies"]),
            "awards_content": parse_list(row["awards_content"]),
            "genres": parse_list(row["genres"]),
            "languages": parse_list(row["languages"])
        })

# Print first 5 movie IDs and their data
first_5_keys = list(movie_data_dict.keys())[:5]
for key in first_5_keys:
    print(f"Movie ID: {key}, Data: {movie_data_dict[key]}")


Movie ID: tt0027483, Data: {'docs': 'crimson circle extortion ring murders anyone refuses pay protection money new scotland yard case victim killed protection sexy secretary falls suspicion bodies pile hughwakefield alfreddrayton niallmacginnis juneduprez paulblake noahbeery basilgill gordonmcleod reneegadd ralphtruman reginalddenham drama richard wainwright productions  english unitedkingdom', 'title': 'The Crimson Circle', 'year': 1936, 'duration': 76, 'MPA': '', 'rating': 6.4, 'votes': '30', 'meta_score': 0.0, 'description': 'An extortion ring murders anyone who refuses to pay protection money. New Scotland Yard is on the case, but a victim is killed while under their protection. A sexy secretary falls under suspicion as the bodies pile up.', 'Movie_Link': 'https://www.imdb.com/title/tt0027483/', 'writers': ['Reginald Denham', 'Edgar Wallace', 'Howard Irving Young'], 'directors': ['Reginald Denham'], 'stars': ['Hugh Wakefield', 'Alfred Drayton', 'Niall MacGinnis', 'June Duprez', 'Pa

In [16]:
df_cleaned = pd.DataFrame.from_dict(movie_data_dict, orient="index")
df_cleaned.to_csv("./../cleaned_database/cleaned_final_dataset3.csv", index_label="id")
print("Dictionary saved to ./../cleaned_database/cleaned_final_dataset3.csv")


Dictionary saved to ./../cleaned_database/cleaned_final_dataset3.csv
