In [47]:
import numpy as np
import pandas as pd

In [48]:
import pandas as pd


# Read the raw movie-plots CSV; row 0 of the file supplies the column names.
df = pd.read_csv(
    'wiki_movie_plots_deduped.csv',
    header=0,
)
# Independent deep copy of the loaded frame; it is the cell's displayed output.
data = df.copy(deep=True)
data

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


Load and Inspect Dataset

In [49]:
# Read the Wikipedia Movie Plots CSV and show a quick preview plus the row count.
df = pd.read_csv("wiki_movie_plots_deduped.csv")
print(df.head(5))
print(f"Total movies: {df.shape[0]}")

# Narrow to the key columns and discard any row missing one of them,
# then expose plots and titles as plain Python lists.
movies = df.loc[:, ['Title', 'Plot', 'Genre', 'Release Year']].dropna(how='any')
plots = movies['Plot'].to_list()
titles = movies['Title'].to_list()

   Release Year                             Title Origin/Ethnicity  \
0          1901            Kansas Saloon Smashers         American   
1          1901     Love by the Light of the Moon         American   
2          1901           The Martyred Presidents         American   
3          1901  Terrible Teddy, the Grizzly King         American   
4          1902            Jack and the Beanstalk         American   

                             Director Cast    Genre  \
0                             Unknown  NaN  unknown   
1                             Unknown  NaN  unknown   
2                             Unknown  NaN  unknown   
3                             Unknown  NaN  unknown   
4  George S. Fleming, Edwin S. Porter  NaN  unknown   

                                           Wiki Page  \
0  https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...   
1  https://en.wikipedia.org/wiki/Love_by_the_Ligh...   
2  https://en.wikipedia.org/wiki/The_Martyred_Pre...   
3  https://en.wikipedia.

Exploratory Data Analysis (EDA)

**Basic Stats:** Checks data size, missing values, and summary statistics.

**Plot Length Distribution:** A histogram shows how long the plots are, which is important for RAG context limits.

**Genre Distribution:** A bar plot reveals the dominant genres, which is useful for query relevance.

**Release Year Trends:** A histogram tracks the distribution of movies over time, giving context for temporal queries.

**Word Frequency:** Identifies common terms (e.g., "love," "war"), which helps sparse retrieval (BM25).

**Correlation:** A scatter plot explores whether older or newer movies have longer plots, giving insights for preprocessing.

In [50]:
# Data cleaning and preprocessing
def preprocess_data(df):
    """Return a cleaned copy of the movie frame with derived columns added.

    The input frame is never modified; all work happens on a copy, so the
    function can be re-run on the same raw frame with the same result.

    Steps:
      * 'Release Year' is coerced to numeric (unparseable values become NaN).
      * Rows missing 'Release Year' or 'Plot' are dropped.
      * 'Release Year' is cast to int and 'Decade' is derived from it
        (e.g. 1995 -> 1990).
      * Missing 'Genre' becomes 'unknown'; only the text before the first
        ', ' separator is kept.
      * Missing 'Cast' becomes the placeholder '[null]'.
      * 'Plot_Length' holds the character length of each plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'Release Year', 'Plot', 'Genre'
        and 'Cast'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the original index labels of the surviving rows are kept.
    """
    # Private copy: assigning columns below must not leak into the caller's frame.
    cleaned = df.copy()
    cleaned['Release Year'] = pd.to_numeric(cleaned['Release Year'], errors='coerce')

    # Explicit .copy() after the row filter so the following column assignments
    # act on an independent frame (avoids pandas' SettingWithCopyWarning).
    cleaned = cleaned.dropna(subset=['Release Year', 'Plot']).copy()

    cleaned['Release Year'] = cleaned['Release Year'].astype(int)
    cleaned['Decade'] = (cleaned['Release Year'] // 10) * 10
    # Keep only the first genre of a ', '-separated list.
    cleaned['Genre'] = (
        cleaned['Genre']
        .fillna('unknown')
        .apply(lambda genre: genre.split(', ')[0])
    )
    cleaned['Cast'] = cleaned['Cast'].fillna('[null]')
    cleaned['Plot_Length'] = cleaned['Plot'].apply(len)

    return cleaned

def analyze_dataset_stats(df):
    """Print a plain-text overview of the movie frame to stdout.

    Reports the row count, the min/max of 'Release Year', the number of
    distinct values in 'Title', 'Genre' and 'Origin/Ethnicity', and the five
    most frequent origins and genres. Returns None.
    """
    print("Dataset Statistics:")

    movie_count = df.shape[0]
    print(f"Total number of movies: {movie_count:,}")

    release_years = df['Release Year']
    print(f"Date range: {release_years.min()} to {release_years.max()}")

    unique_titles = df['Title'].nunique()
    print(f"Number of unique titles: {unique_titles:,}")

    genres = df['Genre']
    print(f"Number of unique genres: {genres.nunique()}")

    origins = df['Origin/Ethnicity']
    print(f"Number of unique origins: {origins.nunique()}")

    # Frequency tables, most common first, limited to five rows each.
    print("\nTop 5 origins:")
    print(origins.value_counts().head(5))

    print("\nTop 5 genres:")
    print(genres.value_counts().head(5))

In [51]:
# Rebind `df` to the frame returned by preprocess_data (which carries the derived
# 'Decade' and 'Plot_Length' columns), then print its summary statistics.
# NOTE(review): `df` is overwritten here, so the raw frame is no longer reachable
# under this name once the cell has run.
df = preprocess_data(df)
analyze_dataset_stats(df)

Dataset Statistics:
Total number of movies: 34,886
Date range: 1901 to 2017
Number of unique titles: 32,432
Number of unique genres: 1350
Number of unique origins: 24

Top 5 origins:
Origin/Ethnicity
American     17377
British       3670
Bollywood     2931
Tamil         2599
Telugu        1311
Name: count, dtype: int64

Top 5 genres:
Genre
drama      6641
unknown    6083
comedy     5022
action     1546
horror     1276
Name: count, dtype: int64


In [52]:
# Cardinality of the columns listed below, one line per column.
print("\nUnique values in key columns:")
for col in ('Genre', 'Origin/Ethnicity', 'Decade'):
    print(f"{col}: {df[col].nunique()}")


Unique values in key columns:
Genre: 1350
Origin/Ethnicity: 24
Decade: 12


In [53]:
def create_context(df, max_chars=5000000, samples_per_decade=100, random_state=42):
    """Build one text blob describing a per-decade sample of movies.

    For every distinct value of 'Decade' (in order of first appearance) up to
    ``samples_per_decade`` rows are drawn at random, and each sampled row is
    rendered as a "Title / Year / Decade / Genre / Origin / Director / Plot"
    entry. Entries are appended until the next one would push the total past
    ``max_chars``; building stops at that entry. The final size is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Title', 'Release Year', 'Decade', 'Genre',
        'Origin/Ethnicity', 'Director' and 'Plot'.
    max_chars : int, default 5000000
        Upper bound on the length of the returned string.
    samples_per_decade : int, default 100
        Maximum number of rows sampled from each decade.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the selection is reproducible.

    Returns
    -------
    str
        The concatenated entries; an empty string when ``df`` has no rows.
    """
    # Sample movies from different decades for better representation.
    # The decade mask is computed once per decade and reused for the size cap.
    decade_samples = []
    for decade in df['Decade'].unique():
        decade_rows = df[df['Decade'] == decade]
        decade_samples.append(
            decade_rows.sample(
                n=min(samples_per_decade, len(decade_rows)),
                random_state=random_state,
            )
        )

    entries = []
    total_chars = 0

    # pd.concat raises ValueError on an empty list, so skip it when the frame
    # has no decades at all (e.g. an empty input frame).
    if decade_samples:
        sampled_df = pd.concat(decade_samples)

        for _, row in sampled_df.iterrows():
            entry = (f"Title: {row['Title']}\n"
                     f"Year: {row['Release Year']}\n"
                     f"Decade: {row['Decade']}\n"
                     f"Genre: {row['Genre']}\n"
                     f"Origin: {row['Origin/Ethnicity']}\n"
                     f"Director: {row['Director']}\n"
                     f"Plot: {row['Plot']}\n\n")

            # Stop at the first entry that would exceed the size budget.
            if total_chars + len(entry) > max_chars:
                break
            entries.append(entry)
            total_chars += len(entry)

    # Join once at the end instead of growing a string inside the loop.
    context = "".join(entries)

    print(f"Context size: {len(context):,} characters")
    return context