In [1]:
#Fuzzy Matching 
#26th July 2025 

import pandas as pd
import os
from rapidfuzz import process, fuzz
import re

# === Load datasets ===
# NOTE: both input files, movies_random_sample.csv and
# bollywood_box_office_2010_2024.csv, must be present in ~/Desktop/ra_app.
box_office_path, movies_sample_path = (
    os.path.expanduser("~/Desktop/ra_app/bollywood_box_office_2010_2024.csv"),
    os.path.expanduser("~/Desktop/ra_app/movies_random_sample.csv"),
)

# Box-office file is read first, then the movie sample.
box_office_df, movies_df = (
    pd.read_csv(box_office_path),
    pd.read_csv(movies_sample_path),
)

# === Clean and normalize movie names ===
def clean_title(title):
    """Return a normalized version of a movie title for fuzzy comparison.

    Steps, in order: lowercase, fold accented/compatibility characters to
    their base form, drop any parenthesised text, drop every character that
    is not ``a-z``, ``0-9`` or whitespace, and collapse runs of whitespace.

    Args:
        title: The raw title. Missing values (``None``/NaN, as detected by
            ``pd.isna``) are accepted; any other value is converted with
            ``str()``.

    Returns:
        The cleaned title, or ``""`` when the input is missing.
    """
    # Local import so this function does not depend on a new file-level import.
    import unicodedata

    if pd.isna(title):
        return ""

    text = str(title).lower()
    # Decompose accented letters ("é" -> "e" + combining mark) so the base
    # letter survives the ASCII-only filter below; without this step the
    # whole letter would be deleted ("café" -> "caf").
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r'\([^)]*\)', '', text)  # Remove text in parentheses
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove punctuation and combining marks
    return re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

# Add cleaned columns: each entry is (DataFrame, raw title column, cleaned column to create).
for frame, source_col, cleaned_col in (
    (movies_df, "original_title", "clean_original_title"),
    (movies_df, "title_kaggle", "clean_title_kaggle"),
    (box_office_df, "movie_name", "clean_movie_name"),
):
    frame[cleaned_col] = frame[source_col].apply(clean_title)

# === Match function ===
def match_movie(clean_title1, clean_title2, choices, threshold=70):
    """Find the best fuzzy match for either of two titles among ``choices``.

    Each non-empty title is looked up with ``process.extractOne`` using
    ``fuzz.ratio``; the higher-scoring hit is kept (the first title wins a
    tie) and returned only if its score reaches ``threshold``.

    Args:
        clean_title1: First cleaned title; skipped when empty/falsy.
        clean_title2: Second cleaned title; skipped when empty/falsy.
        choices: Candidate titles passed straight to ``process.extractOne``.
        threshold: Minimum score (inclusive) for a hit to be accepted.

    Returns:
        The winning ``process.extractOne`` result (its element ``[1]`` is the
        score), or ``None`` when there is no hit at or above ``threshold``.
    """
    hits = []
    for query in (clean_title1, clean_title2):
        # An empty title is never sent to the matcher.
        hit = process.extractOne(query, choices, scorer=fuzz.ratio) if query else None
        if hit:
            hits.append(hit)

    if not hits:
        return None

    # max() keeps the first of equally scored hits, so the first title wins ties.
    winner = max(hits, key=lambda hit: hit[1])
    return winner if winner[1] >= threshold else None

# === Full Run: match every movie once, print the result, and store it ===
# Matching is done in a single pass: each row's result is printed and its
# lifetime collection recorded at the same time, so the (comparatively
# expensive) fuzzy lookup is not repeated for the save step.
print("\n=== Full Fuzzy Matching Results (All 100 Movies) ===\n")

# Special case override: titles (stripped, lowercased) that are forced to NA.
forced_na_titles = {"humsafar"}

# A plain list makes the third element of the extractOne result a positional
# index, which is exactly what .iloc below expects.
box_office_choices = box_office_df["clean_movie_name"].tolist()

# One entry per row of movies_df, in row order; None where there is no match.
lifetime_collections = []

for _idx, row in movies_df.iterrows():
    print(f"🎬 {row['title_kaggle']}")

    # str() keeps a missing (NaN) title from crashing on .strip().
    if str(row['title_kaggle']).strip().lower() in forced_na_titles:
        match_result = None
    else:
        match_result = match_movie(
            row["clean_original_title"],
            row["clean_title_kaggle"],
            box_office_choices,
        )

    if match_result:
        _matched_title_clean, score, match_idx = match_result
        matched_row = box_office_df.iloc[match_idx]
        print(f"🔎 Matched With: {matched_row['movie_name']} (Score: {score})")
        print(f"🏆 Lifetime Collection: {matched_row['lifetime_collection']}")
        lifetime_collections.append(matched_row['lifetime_collection'])
    else:
        print("🔎 Matched With: NA")
        print("🏆 Lifetime Collection: NA")
        lifetime_collections.append(None)

    print("-" * 60)

# Add matched results to original DataFrame and save.
# NOTE: the output path is the same file the movie sample was read from, so
# the input CSV is overwritten (including the helper clean_* columns).
movies_df["lifetime_collection_matched"] = lifetime_collections
output_path = os.path.expanduser("~/Desktop/ra_app/movies_random_sample.csv")
movies_df.to_csv(output_path, index=False)
print(f"\n✅ Lifetime collections appended and saved to: {output_path}")


=== Full Fuzzy Matching Results (All 100 Movies) ===

🎬 Mere Pyare Prime Minister
🔎 Matched With: Merey Pyarey Prime Minister (Score: 96.15384615384616)
🏆 Lifetime Collection: 0.39 Cr
------------------------------------------------------------
🎬 Manikarnika: The Queen of Jhansi
🔎 Matched With: Manikarnika – The Queen Of Jhansi (Score: 100.0)
🏆 Lifetime Collection: 92.19 Cr
------------------------------------------------------------
🎬 Viceroy's House (film)
🔎 Matched With: NA
🏆 Lifetime Collection: NA
------------------------------------------------------------
🎬 3 Storeys
🔎 Matched With: 3 Storeys (Score: 100.0)
🏆 Lifetime Collection: 3.92 Cr
------------------------------------------------------------
🎬 Fatso!
🔎 Matched With: Fatso (Score: 100.0)
🏆 Lifetime Collection: 0.15 Cr
------------------------------------------------------------
🎬 Jigariyaa
🔎 Matched With: Jigariyaa (Score: 100.0)
🏆 Lifetime Collection: 0.16 Cr
------------------------------------------------------------
🎬 