In [1]:
#AI-Based Gender Inference 
#Director Name and Bio using TMDb 
#25th July 2025 

import os
import pandas as pd
import requests
import json
import time
from tqdm import tqdm
from openai import OpenAI

# --- 🔐 API Keys ---
TMDB_API_KEY = "ef74866feee9084817794614ffbae21d"

#NOTE: PLEASE REPLACE KEY WITH THE ONE PROVIDED OVER EMAIL
OPENAI_API_KEY = ""  

#PLEASE ENDURE movies_random_sample.csv is in Desktop/ra_app
# --- 📁 Load Movie Dataset ---
csv_path = os.path.expanduser("~/Desktop/ra_app/movies_random_sample.csv")
df = pd.read_csv(csv_path)

# --- 📦 Output Containers ---
output_rows = []
failed_imdb_ids = []

# --- 🔄 TMDb Helper Functions ---
def get_tmdb_id(imdb_id):
    try:
        url = f"https://api.themoviedb.org/3/find/{imdb_id}"
        params = {"api_key": TMDB_API_KEY, "external_source": "imdb_id"}
        r = requests.get(url, params=params)
        data = r.json()
        return data['movie_results'][0]['id'] if data['movie_results'] else None
    except:
        return None

def get_movie_title(tmdb_id):
    try:
        url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
        params = {"api_key": TMDB_API_KEY}
        r = requests.get(url, params=params)
        return r.json().get("title", "Unknown Title")
    except:
        return "Unknown Title"

def get_directors(tmdb_id):
    try:
        url = f"https://api.themoviedb.org/3/movie/{tmdb_id}/credits"
        params = {"api_key": TMDB_API_KEY}
        r = requests.get(url, params=params)
        data = r.json()
        return [(p['id'], p['name']) for p in data.get("crew", []) if p.get("job") == "Director"]
    except:
        return []

def get_bio(person_id):
    try:
        url = f"https://api.themoviedb.org/3/person/{person_id}"
        params = {"api_key": TMDB_API_KEY}
        r = requests.get(url, params=params)
        return r.json().get("biography", "")
    except:
        return ""

# --- 🤖 GPT Inference Function ---
def infer_gender(name, bio, max_retries=3):
    client = OpenAI(api_key=OPENAI_API_KEY)  # ✅ Correctly define client inside the function

    prompt = f"""
Based on the following name and biography of a film director associated with Indian cinema (especially Bollywood), what is the most likely gender of this person?

Name: {name}
Biography: {bio}

Respond in JSON format with two fields:
{{
  "gender": "male" or "female",
  "confidence": a number between 0.0 and 1.0 indicating how confident you are in this classification
}}
"""
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                timeout=60
            )
            raw = response.choices[0].message.content.strip()
            result = json.loads(raw)
            return result["gender"].lower(), float(result["confidence"])
        except Exception as e:
            print(f"⚠️ GPT error (attempt {attempt+1}) for {name}: {e}")
            time.sleep(2 + attempt * 2)

    return "unknown", 0.0

# --- 🚀 Main Loop Over Movies ---
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    imdb_id = row['imdb_id']

    tmdb_id = get_tmdb_id(imdb_id)
    if not tmdb_id:
        print(f"❌ TMDb ID not found for IMDb ID {imdb_id}")
        failed_imdb_ids.append({"imdb_id": imdb_id, "reason": "tmdb_id_not_found"})
        continue

    movie_title = get_movie_title(tmdb_id)
    directors = get_directors(tmdb_id)

    if not directors:
        print(f"⚠️ No directors found for: {movie_title} ({imdb_id})")
        failed_imdb_ids.append({"imdb_id": imdb_id, "reason": "no_director_found"})
        continue

    for person_id, director_name in directors:
        bio = get_bio(person_id)
        gender, confidence = infer_gender(director_name, bio)

        output_rows.append({
            "imdb_id": imdb_id,
            "movie_title": movie_title,
            "director_name": director_name,
            "director_bio": bio,
            "inferred_gender": gender,
            "confidence_score": confidence
        })

        print(f"✅ {movie_title} → {director_name} → {gender} ({confidence:.2f})")

        time.sleep(1)  # polite rate limit delay

# --- 💾 Save Output Files ---
output_df = pd.DataFrame(output_rows)
output_path = os.path.expanduser("~/Desktop/ra_app/director_gender_full_output.csv")
output_df.to_csv(output_path, index=False)

if failed_imdb_ids:
    fail_df = pd.DataFrame(failed_imdb_ids)
    fail_path = os.path.expanduser("~/Desktop/ra_app/director_gender_failed_cases.csv")
    fail_df.to_csv(fail_path, index=False)
    print(f"\n⚠️ Logged {len(failed_imdb_ids)} failed cases to: {fail_path}")

print(f"\n✅ Saved final output to: {output_path}")

  0%|                                                   | 0/100 [00:00<?, ?it/s]

✅ My Dear Prime Minister → Rakeysh Omprakash Mehra → male (1.00)


  1%|▍                                          | 1/100 [00:07<13:05,  7.94s/it]

✅ Manikarnika: The Queen of Jhansi → Kangana Ranaut → female (1.00)
✅ Manikarnika: The Queen of Jhansi → Radha Krishna Jagarlamudi → male (1.00)


  2%|▊                                          | 2/100 [00:21<18:29, 11.32s/it]

✅ Viceroy's House → Gurinder Chadha → female (1.00)


  3%|█▎                                         | 3/100 [00:29<15:27,  9.56s/it]

✅ 3 Storeys → Arjun Mukerjee → male (1.00)


  4%|█▋                                         | 4/100 [00:36<13:51,  8.66s/it]

✅ Fatso! → Rajat Kapoor → male (1.00)


  5%|██▏                                        | 5/100 [00:44<13:35,  8.59s/it]

✅ Jigariyaa → Raj Purohit → male (1.00)


  6%|██▌                                        | 6/100 [00:52<12:43,  8.12s/it]

✅ 7 Khoon Maaf → Vishal Bhardwaj → male (1.00)


  7%|███                                        | 7/100 [01:00<12:31,  8.08s/it]

✅ October → Shoojit Sircar → male (1.00)


  8%|███▍                                       | 8/100 [01:07<12:10,  7.94s/it]

✅ The Ghazi Attack → Sankalp Reddy → male (1.00)


  9%|███▊                                       | 9/100 [01:16<12:20,  8.14s/it]

✅ Khwaabb → Zaid Ali Khan → male (1.00)


 11%|████▌                                     | 11/100 [01:25<08:59,  6.06s/it]

❌ TMDb ID not found for IMDb ID tt3166542
✅ #Yaaram → Ovais Khan → male (1.00)


 12%|█████                                     | 12/100 [01:32<09:23,  6.40s/it]

✅ Happy Bhag Jayegi → Mudassar Aziz → male (1.00)


 13%|█████▍                                    | 13/100 [01:41<10:01,  6.91s/it]

✅ Newton → Amit Masurkar → male (1.00)


 14%|█████▉                                    | 14/100 [01:49<10:28,  7.31s/it]

✅ Zero → Aanand L. Rai → male (1.00)


 15%|██████▎                                   | 15/100 [01:56<10:30,  7.41s/it]

✅ M Cream → Agneya Singh → male (1.00)
✅ M Cream → Aban Raza → female (0.60)


 16%|██████▋                                   | 16/100 [02:08<12:12,  8.72s/it]

✅ Chennai Express → Rohit Shetty → male (1.00)


 17%|███████▏                                  | 17/100 [02:16<11:48,  8.54s/it]

✅ Student Of The Year → Karan Johar → male (1.00)


 18%|███████▌                                  | 18/100 [02:24<11:29,  8.41s/it]

✅ Bewakoofiyaan → Nupur Asthana → female (1.00)


 19%|███████▉                                  | 19/100 [02:31<10:45,  7.97s/it]

✅ Raanjhanaa → Aanand L. Rai → male (1.00)


 20%|████████▍                                 | 20/100 [02:39<10:18,  7.73s/it]

✅ Kismat Love Paisa Dilli → Sanjay Khanduri → male (1.00)


 21%|████████▊                                 | 21/100 [02:46<10:10,  7.73s/it]

✅ Ek Tha Tiger → Kabir Khan → male (1.00)


 22%|█████████▏                                | 22/100 [02:55<10:27,  8.04s/it]

✅ Crazy Cukkad Family → Ritesh Menon → male (1.00)


 23%|█████████▋                                | 23/100 [03:02<09:55,  7.73s/it]

✅ Mary Kom → Omung Kumar → male (1.00)


 25%|██████████▌                               | 25/100 [03:12<07:21,  5.89s/it]

❌ TMDb ID not found for IMDb ID tt2403201
✅ Coffee Bloom → Manu Warrier → male (1.00)


 26%|██████████▉                               | 26/100 [03:19<07:49,  6.34s/it]

✅ Jai Gangaajal → Prakash Jha → male (1.00)
✅ Jai Gangaajal → Sham Kaushal → male (1.00)


 27%|███████████▎                              | 27/100 [03:31<09:42,  7.98s/it]

✅ Aiyyaa → Sachin Kundalkar → male (1.00)


 28%|███████████▊                              | 28/100 [03:39<09:47,  8.16s/it]

✅ Arjun Patiala → Rohit Jugraj → male (1.00)


 29%|████████████▏                             | 29/100 [03:47<09:32,  8.07s/it]

✅ 2 States → Abhishek Varman → male (1.00)


 30%|████████████▌                             | 30/100 [03:54<08:59,  7.71s/it]

✅ Zila Ghaziabad → Anand Kumar → male (1.00)


 31%|█████████████                             | 31/100 [04:02<08:54,  7.75s/it]

✅ Ek Paheli Leela → Bobby Khan → male (1.00)


 32%|█████████████▍                            | 32/100 [04:10<08:55,  7.88s/it]

✅ Brahman Naman → Qaushiq Mukherjee → male (1.00)


 33%|█████████████▊                            | 33/100 [04:24<10:38,  9.54s/it]

✅ Holiday → A.R. Murugadoss → male (1.00)


 34%|██████████████▎                           | 34/100 [04:32<09:58,  9.07s/it]

✅ Sixteen → Raj Purohit → male (1.00)


 35%|██████████████▋                           | 35/100 [04:40<09:32,  8.80s/it]

✅ Dum Laga Ke Haisha → Sharat Katariya → male (1.00)


 36%|███████████████                           | 36/100 [04:48<09:09,  8.59s/it]

✅ Lootera → Vikramaditya Motwane → male (1.00)


 37%|███████████████▌                          | 37/100 [04:56<08:54,  8.49s/it]

✅ Shaadi Mein Zaroor Aana → Ratnaa Sinha → female (1.00)


 39%|████████████████▍                         | 39/100 [05:08<07:02,  6.93s/it]

⚠️ No directors found for: Mumbhai Connection (tt1826763)
✅ Golmaal Again → Rohit Shetty → male (1.00)


 40%|████████████████▊                         | 40/100 [05:15<07:05,  7.10s/it]

✅ Kaashi in Search of Ganga → Dhiraj Kumar → male (1.00)


 41%|█████████████████▏                        | 41/100 [05:23<07:12,  7.34s/it]

✅ Aashiqui 2 → Mohit Suri → male (1.00)


 42%|█████████████████▋                        | 42/100 [05:30<07:04,  7.32s/it]

✅ Ladies VS. Ricky Bahl → Maneesh Sharma → male (1.00)


 43%|██████████████████                        | 43/100 [05:38<07:04,  7.45s/it]

✅ Jhalki → Brahmanand S. Singh → male (1.00)


 44%|██████████████████▍                       | 44/100 [05:46<07:01,  7.53s/it]

✅ Dil Bechara → Mukesh Chhabra → male (1.00)


 45%|██████████████████▉                       | 45/100 [05:54<07:12,  7.87s/it]

✅ Article 15 → Anubhav Sinha → male (1.00)


 46%|███████████████████▎                      | 46/100 [06:02<06:56,  7.71s/it]

✅ Total Siyapaa → Eeshwar Nivas → male (1.00)


 47%|███████████████████▋                      | 47/100 [06:09<06:41,  7.58s/it]

✅ Ramaiya Vastavaiya → Prabhu Deva → male (1.00)


 48%|████████████████████▏                     | 48/100 [06:19<07:18,  8.43s/it]

✅ Photograph → Ritesh Batra → male (1.00)


 49%|████████████████████▌                     | 49/100 [06:28<07:05,  8.35s/it]

✅ Kahaani → Sujoy Ghosh → male (1.00)


 50%|█████████████████████                     | 50/100 [06:36<07:02,  8.44s/it]

✅ Ankhon Dekhi → Rajat Kapoor → male (1.00)


 51%|█████████████████████▍                    | 51/100 [06:44<06:47,  8.32s/it]

✅ Night → Onir → male (1.00)


 52%|█████████████████████▊                    | 52/100 [06:52<06:34,  8.22s/it]

✅ Omertà → Hansal Mehta → male (1.00)


 53%|██████████████████████▎                   | 53/100 [07:00<06:20,  8.11s/it]

✅ Chittagong → Bedabrata Pain → male (1.00)


 54%|██████████████████████▋                   | 54/100 [07:07<05:58,  7.80s/it]

✅ Phata Poster Nikhla Hero → Rajkumar Santoshi → male (1.00)


 55%|███████████████████████                   | 55/100 [07:15<05:51,  7.82s/it]

✅ Ab Tak Chhappan 2 → Ejaz Gulab → male (1.00)


 56%|███████████████████████▌                  | 56/100 [07:24<06:01,  8.21s/it]

✅ F.A.L.T.U. → Remo D'Souza → male (1.00)


 57%|███████████████████████▉                  | 57/100 [07:33<06:03,  8.46s/it]

✅ Bajirao Mastani → Sanjay Leela Bhansali → male (1.00)


 59%|████████████████████████▊                 | 59/100 [07:45<04:46,  6.99s/it]

⚠️ No directors found for: Gollu Aur Pappu (tt4219300)
✅ Tanu Weds Manu: Returns → Aanand L. Rai → male (1.00)


 60%|█████████████████████████▏                | 60/100 [07:52<04:35,  6.89s/it]

✅ Ungli → Renzil D'Silva → male (1.00)


 61%|█████████████████████████▌                | 61/100 [07:59<04:31,  6.97s/it]

✅ Queen → Vikas Bahl → male (1.00)


 62%|██████████████████████████                | 62/100 [08:06<04:31,  7.15s/it]

✅ Parmanu: The Story of Pokhran → Abhishek Sharma → male (1.00)


 63%|██████████████████████████▍               | 63/100 [09:16<15:53, 25.78s/it]

✅ Azhar → Anthony D'Souza → male (1.00)


 64%|██████████████████████████▉               | 64/100 [09:24<12:20, 20.56s/it]

✅ Sanju → Rajkumar Hirani → male (1.00)


 65%|███████████████████████████▎              | 65/100 [09:32<09:50, 16.88s/it]

✅ Ae Dil Hai Mushkil → Karan Johar → male (1.00)


 66%|███████████████████████████▋              | 66/100 [09:40<07:58, 14.07s/it]

✅ Satyameva Jayate → Milap Zaveri → male (1.00)


 67%|████████████████████████████▏             | 67/100 [09:48<06:49, 12.42s/it]

✅ Agent Vinod → Sriram Raghavan → male (1.00)


 68%|████████████████████████████▌             | 68/100 [09:57<05:58, 11.21s/it]

✅ Phillauri → Anshai Lal → male (1.00)


 69%|████████████████████████████▉             | 69/100 [10:05<05:19, 10.32s/it]

✅ Tumhari Sulu → Suresh Triveni → male (1.00)


 71%|█████████████████████████████▊            | 71/100 [10:14<03:26,  7.12s/it]

❌ TMDb ID not found for IMDb ID tt4718678
✅ Dhoom 3 → Vijay Krishna Acharya → male (1.00)


 72%|██████████████████████████████▏           | 72/100 [10:22<03:24,  7.30s/it]

✅ M.S. Dhoni: The Untold Story → Neeraj Pandey → male (1.00)


 73%|██████████████████████████████▋           | 73/100 [10:32<03:37,  8.04s/it]

✅ Sons of Ram → Kushal Ruia → male (1.00)


 74%|███████████████████████████████           | 74/100 [10:39<03:26,  7.93s/it]

✅ Halkaa → Nila Madhab Panda → male (1.00)


 75%|███████████████████████████████▌          | 75/100 [10:46<03:11,  7.66s/it]

✅ Bhaiaji Superhitt → Neeraj Pathak → male (1.00)


 76%|███████████████████████████████▉          | 76/100 [10:55<03:08,  7.85s/it]

✅ Jagga Jasoos → Anurag Basu → male (1.00)


 77%|████████████████████████████████▎         | 77/100 [11:03<03:02,  7.92s/it]

✅ ?: A Question Mark → Allyson Patel → female (0.90)
✅ ?: A Question Mark → Yash Dave → male (1.00)


 78%|████████████████████████████████▊         | 78/100 [11:14<03:14,  8.83s/it]

✅ Good Newwz → Raj Mehta → male (1.00)


 79%|█████████████████████████████████▏        | 79/100 [11:22<02:58,  8.50s/it]

✅ Ghayal Once Again → Sunny Deol → male (1.00)


 81%|██████████████████████████████████        | 81/100 [11:32<02:05,  6.62s/it]

⚠️ No directors found for: Mummy Punjabi (tt2066925)
✅ Chakravyuh → Prakash Jha → male (1.00)


 82%|██████████████████████████████████▍       | 82/100 [11:39<02:00,  6.69s/it]

✅ Mirzya → Rakeysh Omprakash Mehra → male (1.00)


 83%|██████████████████████████████████▊       | 83/100 [11:47<02:01,  7.12s/it]

✅ Dabangg 2 → Arbaaz Khan → male (1.00)


 84%|███████████████████████████████████▎      | 84/100 [11:54<01:55,  7.19s/it]

✅ The Accidental Prime Minister → Vijay Ratnakar Gutte → male (1.00)


 85%|███████████████████████████████████▋      | 85/100 [12:02<01:52,  7.53s/it]

✅ Desi Kattey → Anand Kumar → male (1.00)


 87%|████████████████████████████████████▌     | 87/100 [12:11<01:13,  5.62s/it]

❌ TMDb ID not found for IMDb ID tt4384270
✅ Bombairiya → Pia Sukanya → female (1.00)


 88%|████████████████████████████████████▉     | 88/100 [12:19<01:14,  6.24s/it]

✅ Manjhi: The Mountain Man → Ketan Mehta → male (1.00)


 89%|█████████████████████████████████████▍    | 89/100 [12:26<01:13,  6.64s/it]

✅ Sanam Re → Divya Khosla Kumar → female (1.00)


 90%|█████████████████████████████████████▊    | 90/100 [12:33<01:07,  6.78s/it]

✅ Luckhnowi Ishq → Aanand Raut → male (1.00)


 91%|██████████████████████████████████████▏   | 91/100 [12:41<01:03,  7.02s/it]

✅ Baat Bann Gayi → Shuja Ali → male (1.00)


 93%|███████████████████████████████████████   | 93/100 [12:53<00:43,  6.17s/it]

⚠️ No directors found for: Jaane kyun de yaaron (tt7720254)
✅ Maatr → Ashtar Sayed → male (1.00)


 94%|███████████████████████████████████████▍  | 94/100 [13:02<00:42,  7.10s/it]

✅ Hum Chaar → Abhishek Dixit → male (1.00)


 95%|███████████████████████████████████████▉  | 95/100 [13:10<00:36,  7.26s/it]

✅ Padmaavat → Sanjay Leela Bhansali → male (1.00)


 96%|████████████████████████████████████████▎ | 96/100 [13:18<00:30,  7.69s/it]

✅ Badlapur → Sriram Raghavan → male (1.00)


 97%|████████████████████████████████████████▋ | 97/100 [13:26<00:23,  7.72s/it]

✅ Baby → Neeraj Pandey → male (1.00)


 99%|█████████████████████████████████████████▌| 99/100 [13:37<00:06,  6.47s/it]

⚠️ No directors found for: The Silent Heroes (tt4309284)
✅ Coffee with D → Vishal Mishra → male (1.00)


100%|█████████████████████████████████████████| 100/100 [13:45<00:00,  8.26s/it]


⚠️ Logged 9 failed cases to: /Users/rishabhbijani/Desktop/ra_app/director_gender_failed_cases.csv

✅ Saved final output to: /Users/rishabhbijani/Desktop/ra_app/director_gender_full_output.csv



