In [1]:
# Bollywood Sentiment Analysis – Raw Output to Console and .txt
# 26 July 2025

import os
import openai
import pandas as pd
import re
import time

# === API KEY ===
#Please enter key provided over email. 
openai.api_key = ""

# === SETTINGS ===
DATA_FOLDER = os.path.expanduser("~/Desktop/ra_app/data")
CSV_PATH = os.path.expanduser("~/Desktop/ra_app/movies_random_sample.csv")
OUTPUT_TXT_PATH = os.path.expanduser("~/Desktop/ra_app/sentiment_raw_output.txt")
MODEL = "gpt-4o"
CHUNK_SIZE = 6000

# === CLEANING FUNCTION ===
def clean_text(text):
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"{[^}]+}", "", text)
    text = re.sub(r"\d+\n", "", text)
    text = re.sub(r"\d{2}:\d{2}:\d{2},\d{3} --> .*", "", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()

# === PROMPT TEMPLATE (Verbatim) ===
PROMPT_TEMPLATE = """
You are a social scientist and analyst who is tasked with analysing cultural and political themes in the subtitles and descriptions of Bollywood (i.e. Indian film industry) films since 2010.

Your task is to –
1. Detect whether each of the following themes appears –
   1. Hindu-Muslim Relations
   2. Gender Relations
   3. Nationalism
   4. LGBTQIA+ Themes

2. For each theme that is present, assess it on the following axes –
   1. Exclusionary v/s Inclusionary
   2. Negative v/s Positive
   3. Conservative v/s Progressive

3. For each axis, assign a score on a continuous scale from **-1 to +1**:
   - A score of **-1** represents the most Exclusionary, Negative, or Conservative representation possible.
   - A score of **+1** represents the most Inclusionary, Positive, or Progressive representation possible.
   - A score of **0** indicates a neutral or balanced portrayal.

Thematic definitions of the themes are as follows –

1. Hindu-Muslim Relations
In India, Hindus form about 79% of the population whereas Muslims form about 14%. Hindu-Muslim relations are characterised by periods of both, amity, cooperation and syncretism and also with strife and violence.

2. Gender Relations
UNICEF defines gender relations as follows: A specific sub-set of social relations uniting men and women as social groups in a particular community. Gender relations intersect with all other influences on social relations – age, ethnicity, race, religion – to determine the position and identity of people in a social group. Since gender relations are a social construct, they can be changed.

3. Nationalism
Oxford Reference defines nationalism as follows: A political ideology and associated movement intended to realize or further the aims of a nation, most notably for independent self-government in a defined territory. In a broader sense, nationalism also refers to sentiments of attachment to or solidarity with a national identity or purpose.

4. LGBTQIA+ Themes
Cambridge Dictionary defines LQBTBQIA+ themes as follows: abbreviation for lesbian, gay, bisexual, transgender, queer (or questioning), intersex, and asexual (or ally): relating to or characteristic of people whose sexual orientation is not heterosexual (= sexually or romantically attracted to women if you are a man, and men if you are a woman) or whose gender identity is not cisgender (= having a gender that matches the physical body you were born with).

Definitions of the axes are as follows:

1. Exclusionary v/s Inclusionary
Cambridge Dictionary defines exclusionary to be: limited to only one group or particular groups of people, in a way that is unfair; resulting in a person or thing not being included in something.  
Cambridge Dictionary defines inclusion to be: the act of including someone or something as part of a group, list, etc., or a person or thing that is included; the idea that everyone should be able to use the same facilities, take part in the same activities, and enjoy the same experiences, including people who have a disability or other disadvantage.

2. Negative v/s Positive
Cambridge Dictionary defines negative to be: not expecting good things, or likely to consider only the bad side of a situation; bad or harmful.  
Cambridge Dictionary defines positive to be: full of hope and confidence, or giving cause for hope and confidence.

3. Conservative v/s Progressive
Cambridge Dictionary defines conservative to be: not usually liking or trusting change, especially sudden change.  
Cambridge Dictionary defines progressive to be: Progressive ideas or systems are new and modern, encouraging change in society or in the way that things are done.

---

Please analyze the following **Movie Description and Subtitle Transcript** carefully and respond using this structure:

For each of the 4 themes, do the following:
- Presence: Present / Not Present / Ambiguous
- If Present, assign a score between -1 and +1 on each of the following axes:
  - Exclusionary–Inclusionary
  - Negative–Positive
  - Conservative–Progressive

---

Movie Plot Description:
{description}

---

Subtitle Transcript:
{subtitles}
"""

# === OPENAI CALLS ===
def call_openai(prompt):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content.strip()

def analyze_with_fallback(description, subtitles):
    try:
        return call_openai(PROMPT_TEMPLATE.format(description=description, subtitles=subtitles))
    except Exception:
        print("⚠️ Full input failed. Retrying with chunked input...")
        return analyze_in_chunks(description, subtitles)

def analyze_in_chunks(description, subtitles):
    chunks = [subtitles[i:i+CHUNK_SIZE] for i in range(0, len(subtitles), CHUNK_SIZE)]
    combined = ""
    for idx, chunk in enumerate(chunks):
        print(f"  → Analyzing chunk {idx+1}/{len(chunks)}...")
        try:
            resp = call_openai(PROMPT_TEMPLATE.format(description=description, subtitles=chunk))
            combined += f"\n\n[Chunk {idx+1}]\n{resp}"
        except Exception as e:
            print(f"  ❌ Failed on chunk {idx+1}: {e}")
            continue
    return combined.strip()

# === MAIN LOOP ===
df = pd.read_csv(CSV_PATH)
output_lines = []

for _, row in df.iterrows():
    imdb_id = row["imdb_id"]
    title = row["original_title"]
    desc_path = os.path.join(DATA_FOLDER, "description", f"{imdb_id}.txt")
    sub_path = os.path.join(DATA_FOLDER, "subtitles", f"{imdb_id}.srt")

    if not (os.path.exists(desc_path) and os.path.exists(sub_path)):
        print(f"⚠️ Missing files for {imdb_id} – {title}. Skipping.")
        continue

    with open(desc_path, "r", encoding="utf-8") as f:
        description = clean_text(f.read())
    with open(sub_path, "r", encoding="utf-8", errors="ignore") as f:
        subtitles = clean_text(f.read())

    if len(subtitles) < 100:
        print(f"⚠️ Subtitles too short for {imdb_id} – {title}. Skipping.")
        continue

    print(f"\n🎬 Analyzing: {title} ({imdb_id})")
    try:
        response = analyze_with_fallback(description, subtitles)
    except Exception as e:
        print(f"❌ Error for {imdb_id} – {title}: {e}")
        continue

    full_entry = f"=== {title} ({imdb_id}) ===\n{response}\n{'='*80}\n"
    print(full_entry)
    output_lines.append(full_entry)

    time.sleep(1.5)

# === SAVE TO .TXT ===
with open(OUTPUT_TXT_PATH, "w", encoding="utf-8") as f:
    f.writelines(output_lines)

print(f"\n💾 All results saved to {OUTPUT_TXT_PATH}")



🎬 Analyzing: Mere Pyare Prime Minister (tt8207768)
⚠️ Full input failed. Retrying with chunked input...
  → Analyzing chunk 1/1...
  ❌ Failed on chunk 1: Error code: 401 - {'error': {'message': "You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
=== Mere Pyare Prime Minister (tt8207768) ===




KeyboardInterrupt: 