In [1]:
!pip list

Package                   Version
------------------------- ------------
accelerate                1.10.1
aiohappyeyeballs          2.6.1
aiohttp                   3.13.0
aiosignal                 1.4.0
annotated-types           0.7.0
anyio                     4.11.0
argon2-cffi               25.1.0
argon2-cffi-bindings      25.1.0
arrow                     1.4.0
asttokens                 3.0.0
async-lru                 2.0.5
attrs                     25.4.0
babel                     2.17.0
beautifulsoup4            4.14.2
bitsandbytes              0.48.1
bleach                    6.2.0
certifi                   2025.10.5
cffi                      2.0.0
charset-normalizer        3.4.4
comm                      0.2.3
cut-cross-entropy         25.1.1
datasets                  4.2.0
debugpy                   1.8.17
decorator                 5.2.1
defusedxml                0.7.1
diffusers                 0.35.2
dill                      0.4.0
docstring_parser          0.17.0
executing     

In [2]:
import sys
print(sys.executable)


/home/mrnzero321/qwenenv/bin/python3


In [1]:
import pandas as pd
import gzip
import shutil
import requests
import json
import random
from pathlib import Path
from typing import List, Dict, Tuple
from collections import defaultdict
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [2]:
class IMDbDataProcessor:
    """Download and process IMDb datasets"""
    
    # Dictionary containing all the download URLs for IMDb datasets
    # Each key is a friendly name, each value is the actual download URL
    URLS = {
        'title_basics': 'https://datasets.imdbws.com/title.basics.tsv.gz',  # Basic movie info (title, year, type)
        'title_principals': 'https://datasets.imdbws.com/title.principals.tsv.gz',  # Cast and crew members
        'name_basics': 'https://datasets.imdbws.com/name.basics.tsv.gz',  # People's names and info
        'title_crew': 'https://datasets.imdbws.com/title.crew.tsv.gz',  # Directors and writers list (NEW!)
        'title_ratings': 'https://datasets.imdbws.com/title.ratings.tsv.gz'  # Movie ratings
    }
    
    def __init__(self, data_dir='imdb_data'):
        """
        Initialize the processor
        data_dir: folder name where we'll store downloaded files
        """
        # Convert the folder name string to a Path object for easier manipulation
        self.data_dir = Path(data_dir)
        # Create the directory if it doesn't exist yet (exist_ok=True means no error if it exists)
        self.data_dir.mkdir(exist_ok=True)
        
    def download_datasets(self):
        """Download all required IMDb datasets from the internet"""
        # Log that we're starting the download process
        logger.info("Starting dataset downloads...")
        
        # Loop through each dataset name and its URL
        for name, url in self.URLS.items():
            # Create the full path where this file will be saved
            output_path = self.data_dir / f"{name}.tsv.gz"
            # Check if the file already exists on disk
            if output_path.exists():
                # If it exists, skip downloading and tell the user
                logger.info(f"{name} already exists, skipping...")
                continue
                
            # Log that we're downloading this specific dataset
            logger.info(f"Downloading {name}...")
            # Make an HTTP GET request to download the file (stream=True means download in chunks)
            response = requests.get(url, stream=True)
            # Open the output file in write-binary mode
            with open(output_path, 'wb') as f:
                # Copy the downloaded data to the file in chunks
                shutil.copyfileobj(response.raw, f)
            # Log successful download with a checkmark
            logger.info(f"✓ {name} downloaded")
    
    def load_and_filter_data(self):
        """Load and filter IMDb data for movies only (not TV shows, etc.)"""
        # Log that we're starting to load the datasets
        logger.info("Loading datasets...")
        
        # Load title basics (contains movie metadata like title, year, type)
        logger.info("Processing title.basics...")
        # Read the compressed TSV file into a pandas DataFrame
        titles = pd.read_csv(
            self.data_dir / 'title_basics.tsv.gz',  # File path
            sep='\t',  # Tab-separated values
            na_values='\\N',  # IMDb uses '\N' to represent missing data
            low_memory=False  # Don't try to optimize memory (more reliable)
        )
        
        # Filter: only movies, have year, minimum info
        # Create a copy of rows that meet all our criteria
        movies = titles[
            (titles['titleType'] == 'movie') &  # Only movies (not TV shows, episodes, etc.)
            (titles['startYear'].notna()) &  # Must have a year
            (titles['startYear'] != '\\N')  # Year is not the missing data marker
        ].copy()  # Create a copy to avoid warnings about modifying views
        
        # Convert the startYear column from text to numbers (some might fail, set to NaN)
        movies['startYear'] = pd.to_numeric(movies['startYear'], errors='coerce')
        # Keep only movies between 1920 and 2024 (reasonable range)
        movies = movies[movies['startYear'].between(1920, 2024)]
        
        # Log how many movies we found (with comma thousands separator)
        logger.info(f"Found {len(movies):,} movies")
        
        # Load name basics (contains information about actors, directors, writers)
        logger.info("Processing name.basics...")
        # Read the names dataset into a DataFrame
        names = pd.read_csv(
            self.data_dir / 'name_basics.tsv.gz',  # File path
            sep='\t',  # Tab-separated
            na_values='\\N',  # Missing data marker
            low_memory=False  # Don't optimize memory
        )
        
        # Load principals (links people to movies with their roles)
        logger.info("Processing title.principals...")
        # Read the principals dataset
        principals = pd.read_csv(
            self.data_dir / 'title_principals.tsv.gz',  # File path
            sep='\t',  # Tab-separated
            na_values='\\N',  # Missing data marker
            low_memory=False  # Don't optimize memory
        )
        
        # Filter: actors, actresses, directors, writers, producers only
        # Create a copy with only the crew/cast roles we care about
        crew_cast = principals[principals['category'].isin([
            'actor',  # Male actors
            'actress',  # Female actors (IMDb separates them)
            'director',  # Directors
            'writer',  # Writers
            'producer'  # Producers
        ])].copy()  # Create a copy
        
        # Log how many crew/cast entries we found
        logger.info(f"Found {len(crew_cast):,} crew/cast entries")
        
        # Load crew data (separate file with directors & writers organized by movie)
        logger.info("Processing title.crew...")
        # Read the crew dataset
        crew = pd.read_csv(
            self.data_dir / 'title_crew.tsv.gz',  # File path
            sep='\t',  # Tab-separated
            na_values='\\N',  # Missing data marker
            low_memory=False  # Don't optimize memory
        )
        
        # Log how many crew records we found
        logger.info(f"Found {len(crew):,} crew records")
        
        # Return all four DataFrames for further processing
        return movies, names, crew_cast, crew
    
    def create_knowledge_base(self, movies, names, crew_cast, crew):
        """Create structured knowledge base for query generation by merging all data"""
        # Log that we're building the knowledge base
        logger.info("Building knowledge base...")
        
        # Merge cast/crew with movie info
        # This combines the crew_cast data with movie details (title, year, genres)
        crew_movies = crew_cast.merge(
            movies[['tconst', 'primaryTitle', 'startYear', 'genres']],  # Columns we need from movies
            on='tconst',  # Join on movie ID
            how='inner'  # Only keep rows that exist in both DataFrames
        )
        
        # Merge with person names
        # Add the actual names of people to our dataset
        crew_movies = crew_movies.merge(
            names[['nconst', 'primaryName']],  # Person ID and name
            on='nconst',  # Join on person ID
            how='inner'  # Only keep rows that exist in both
        )
        
        # Clean data - remove rows where name is missing
        crew_movies = crew_movies[crew_movies['primaryName'].notna()]
        # Remove rows where movie title is missing
        crew_movies = crew_movies[crew_movies['primaryTitle'].notna()]
        
        # Focus on people with reasonable filmography (3-500 credits)
        # Count how many times each person appears in each role
        person_counts = crew_movies.groupby(['primaryName', 'category']).size()
        # Keep only people with 3-500 credits (removes obscure and fake entries)
        valid_people = person_counts[(person_counts >= 3) & (person_counts <= 500)].index
        # Filter our dataset to only include these valid people
        crew_movies = crew_movies[crew_movies.set_index(['primaryName', 'category']).index.isin(valid_people)]
        
        # Log statistics about our knowledge base
        logger.info(f"Knowledge base: {len(crew_movies):,} records")
        # Count and log number of actors/actresses
        logger.info(f"  Actors/Actresses: {len(crew_movies[crew_movies['category'].isin(['actor', 'actress'])]):,}")
        # Count and log number of directors
        logger.info(f"  Directors: {len(crew_movies[crew_movies['category'] == 'director']):,}")
        # Count and log number of writers
        logger.info(f"  Writers: {len(crew_movies[crew_movies['category'] == 'writer']):,}")
        # Count and log number of producers
        logger.info(f"  Producers: {len(crew_movies[crew_movies['category'] == 'producer']):,}")
        
        # Return the merged and cleaned knowledge base
        return crew_movies


In [3]:
class QAGenerator:
    """Generate training Q&A pairs (with crew support) for fine-tuning LLMs"""
    
    def __init__(self, knowledge_base: pd.DataFrame):
        """
        Initialize the QA generator
        knowledge_base: the processed movie/crew dataset
        """
        # Store the knowledge base as an instance variable
        self.kb = knowledge_base
        # Initialize an empty list to store generated question-answer pairs
        self.qa_pairs = []
        
    def generate_temporal_queries(self, n=200):
        """
        Single-hop: Person filmography in specific year
        Example: "How many movies did Tom Hanks act in 1994?"
        """
        # Log what we're doing
        logger.info("Generating temporal queries...")
        
        # Mix of actors and directors for variety
        # Get unique actor names from the knowledge base
        actors = self.kb[self.kb['category'].isin(['actor', 'actress'])]['primaryName'].unique()
        # Get unique director names
        directors = self.kb[self.kb['category'] == 'director']['primaryName'].unique()
        
        # Combine them: 70% actors, 30% directors
        people = list(actors[:int(n*0.7)]) + list(directors[:int(n*0.3)])
        # Randomize the order so we don't always get the same people
        random.shuffle(people)
        
        # Loop through the selected people (up to n of them)
        for person in people[:n]:
            # Determine their role (actor/director/etc)
            # Get all rows for this person
            person_data = self.kb[self.kb['primaryName'] == person]
            # If no data found, skip to next person
            if len(person_data) == 0:
                continue
            
            # Get their primary role (first row's category)
            role = person_data['category'].iloc[0]
            # Pick a random year from their filmography
            year = random.choice(person_data['startYear'].dropna().unique())
            # Get all movies they did in that specific year
            movies_that_year = person_data[person_data['startYear'] == year]
            
            # If they didn't do any movies that year, skip
            if len(movies_that_year) == 0:
                continue
            
            # Get the list of movie titles from that year
            movie_list = movies_that_year['primaryTitle'].tolist()
            
            # Generate question and answer based on their role
            if role in ['actor', 'actress']:
                # Question for actors
                question = f"How many movies did {person} act in {int(year)}?"
                # Start of answer
                answer = f"{person} appeared in {len(movie_list)} movie(s) in {int(year)}:\n"
            elif role == 'director':
                # Question for directors
                question = f"How many movies did {person} direct in {int(year)}?"
                answer = f"{person} directed {len(movie_list)} movie(s) in {int(year)}:\n"
            elif role == 'writer':
                # Question for writers
                question = f"How many movies did {person} write in {int(year)}?"
                answer = f"{person} wrote {len(movie_list)} movie(s) in {int(year)}:\n"
            else:  # producer or other
                # Question for producers
                question = f"How many movies did {person} produce in {int(year)}?"
                answer = f"{person} produced {len(movie_list)} movie(s) in {int(year)}:\n"
            
            # Add numbered list of movies (up to 10)
            answer += "\n".join([f"{i+1}. {title}" for i, title in enumerate(movie_list[:10])])
            
            # If there are more than 10 movies, indicate that
            if len(movie_list) > 10:
                answer += f"\n... and {len(movie_list) - 10} more"
            
            # Add this Q&A pair to our collection in ChatML format
            self.qa_pairs.append({
                'messages': [
                    {'role': 'user', 'content': question},
                    {'role': 'assistant', 'content': answer}
                ],
                'metadata': {
                    'type': 'temporal_single',
                    'complexity': 'single-hop'
                }
            })
            
            # If we've generated enough, stop
            if len(self.qa_pairs) >= n:
                break
    
    def generate_range_queries(self, n=200):
        """
        Two-hop: Movies in year range
        Example: "Which films did Steven Spielberg direct between 1990-1995?"
        """
        # Log what we're generating
        logger.info("Generating range queries...")
        
        # Get all unique people from knowledge base
        people = self.kb['primaryName'].unique()
        # Shuffle to randomize
        random.shuffle(people)
        
        # Try more people than needed to ensure we get enough valid queries
        for person in people[:n*2]:  # Try more to get enough valid ones
            # Get all data for this person
            person_data = self.kb[self.kb['primaryName'] == person]
            # Get sorted list of years they worked
            years = sorted(person_data['startYear'].dropna().unique())
            
            # Skip if they don't have at least 3 years of work
            if len(years) < 3:
                continue
            
            # Get their primary role
            role = person_data['category'].iloc[0]
            # Pick a random starting point in their career
            start_idx = random.randint(0, len(years) - 3)
            # Pick a random range size (3-5 years)
            range_size = random.randint(3, min(5, len(years) - start_idx))
            
            # Convert the start and end years to integers
            start_year = int(years[start_idx])
            end_year = int(years[start_idx + range_size - 1])
            
            # Get all movies in this year range
            movies_in_range = person_data[
                person_data['startYear'].between(start_year, end_year)
            ]
            
            # Get the list of movie titles
            movie_list = movies_in_range['primaryTitle'].tolist()
            
            # Choose appropriate verb based on role
            if role in ['actor', 'actress']:
                verb = "appear in"
            elif role == 'director':
                verb = "direct"
            elif role == 'writer':
                verb = "write"
            else:  # producer
                verb = "produce"
            
            # Create the question
            question = f"Which films did {person} {verb} between {start_year}-{end_year}?"
            # Start the answer with count
            answer = f"{person} worked on {len(movie_list)} film(s) between {start_year}-{end_year}:\n"
            # Add numbered list with titles and years (up to 10)
            answer += "\n".join([f"{i+1}. {title} ({int(year)})" 
                                for i, (title, year) in enumerate(
                                    zip(movies_in_range['primaryTitle'][:10],
                                        movies_in_range['startYear'][:10]))])
            
            # If more than 10, indicate that
            if len(movie_list) > 10:
                answer += f"\n... and {len(movie_list) - 10} more"
            
            # Add this Q&A pair to collection in ChatML format
            self.qa_pairs.append({
                'messages': [
                    {'role': 'user', 'content': question},
                    {'role': 'assistant', 'content': answer}
                ],
                'metadata': {
                    'type': 'temporal_range',
                    'complexity': 'two-hop'
                }
            })
            
            # If we've generated enough, stop
            if len(self.qa_pairs) >= n:
                break
    
    def generate_actor_director_queries(self, n=150):
        """
        Three-hop: Actor working with specific director
        Example: "Which movies did Tom Hanks act in that were directed by Steven Spielberg?"
        """
        # Log what we're generating
        logger.info("Generating actor-director collaboration queries...")
        
        # Counter for successfully generated queries
        generated = 0
        # Counter for total attempts
        attempts = 0
        # Maximum attempts before giving up
        max_attempts = n * 20
        
        # Keep trying until we have enough queries or hit max attempts
        while generated < n and attempts < max_attempts:
            # Increment attempt counter
            attempts += 1
            
            # Pick a random movie from the knowledge base
            movie = self.kb.sample(1).iloc[0]
            # Get its unique ID
            movie_id = movie['tconst']
            
            # Get all people who worked on this movie
            movie_crew = self.kb[self.kb['tconst'] == movie_id]
            
            # Get all actors who worked on this movie
            actors = movie_crew[movie_crew['category'].isin(['actor', 'actress'])]['primaryName'].unique()
            # Get all directors who worked on this movie
            directors = movie_crew[movie_crew['category'] == 'director']['primaryName'].unique()
            
            # Skip if no actors or directors found
            if len(actors) == 0 or len(directors) == 0:
                continue
            
            # Pick a random actor from this movie
            actor = random.choice(actors)
            # Pick a random director from this movie
            director = random.choice(directors)
            
            # Find all movies where this actor worked (as actor)
            actor_movies = set(self.kb[
                (self.kb['primaryName'] == actor) & 
                (self.kb['category'].isin(['actor', 'actress']))
            ]['tconst'])
            
            # Find all movies this director directed
            director_movies = set(self.kb[
                (self.kb['primaryName'] == director) & 
                (self.kb['category'] == 'director')
            ]['tconst'])
            
            # Find intersection: movies where actor worked AND director directed
            collabs = actor_movies & director_movies
            
            # Skip if no collaborations found
            if len(collabs) == 0:
                continue
            
            # Get movie details for all collaborations
            # Group by movie ID and take first row (removes duplicates)
            collab_movies = self.kb[self.kb['tconst'].isin(collabs)].groupby('tconst').first()
            
            # Create the question
            question = f"Which movies did {actor} act in that were directed by {director}?"
            # Start the answer with count
            answer = f"{actor} acted in {len(collabs)} movie(s) directed by {director}:\n"
            # Add numbered list of movies (up to 10)
            answer += "\n".join([f"{i+1}. {row['primaryTitle']} ({int(row['startYear'])})" 
                                for i, (_, row) in enumerate(collab_movies.head(10).iterrows())])
            
            # If more than 10 collaborations, indicate that
            if len(collabs) > 10:
                answer += f"\n... and {len(collabs) - 10} more"
            
            # Add this Q&A pair to collection in ChatML format
            self.qa_pairs.append({
                'messages': [
                    {'role': 'user', 'content': question},
                    {'role': 'assistant', 'content': answer}
                ],
                'metadata': {
                    'type': 'actor_director',
                    'complexity': 'three-hop'
                }
            })
            
            # Increment successful generation counter
            generated += 1
    
    def generate_costar_queries(self, n=200):
        """
        Three-hop: Find movies with specific actor pairs
        Example: "Which movies did Leonardo DiCaprio and Kate Winslet act together in?"
        """
        # Log what we're generating
        logger.info("Generating co-star queries...")
        
        # Group by movie: for each movie, get list of all actors
        movie_actors = self.kb[self.kb['category'].isin(['actor', 'actress'])].groupby('tconst')['primaryName'].apply(list)
        # Get basic movie info (grouped by movie ID, take first row)
        movie_info = self.kb.groupby('tconst').first()
        
        # Counter for successfully generated queries
        generated = 0
        # Counter for total attempts
        attempts = 0
        # Maximum attempts before giving up
        max_attempts = n * 10
        
        # Keep trying until we have enough queries or hit max attempts
        while generated < n and attempts < max_attempts:
            # Increment attempt counter
            attempts += 1
            
            # Pick a random movie that has at least 2 actors
            movie_id = random.choice(movie_actors[movie_actors.apply(len) >= 2].index)
            # Get the list of actors in this movie
            actors = movie_actors[movie_id]
            
            # Skip if less than 2 actors (shouldn't happen but safety check)
            if len(actors) < 2:
                continue
            
            # Pick 2 random actors from this movie
            actor1, actor2 = random.sample(actors, 2)
            
            # Find all movies where actor1 acted
            actor1_movies = set(self.kb[
                (self.kb['primaryName'] == actor1) & 
                (self.kb['category'].isin(['actor', 'actress']))
            ]['tconst'])
            
            # Find all movies where actor2 acted
            actor2_movies = set(self.kb[
                (self.kb['primaryName'] == actor2) & 
                (self.kb['category'].isin(['actor', 'actress']))
            ]['tconst'])
            
            # Find intersection: movies where both actors appeared
            common_movies = actor1_movies & actor2_movies
            
            # Skip if they never worked together
            if len(common_movies) == 0:
                continue
            
            # Get details of their collaborations
            collabs = self.kb[self.kb['tconst'].isin(common_movies)].groupby('tconst').first()
            
            # Create the question
            question = f"Which movies did {actor1} and {actor2} act together in?"
            # Start the answer with count
            answer = f"{actor1} and {actor2} appeared together in {len(common_movies)} movie(s):\n"
            # Add numbered list of movies (up to 10)
            answer += "\n".join([f"{i+1}. {row['primaryTitle']} ({int(row['startYear'])})" 
                                for i, (_, row) in enumerate(collabs.head(10).iterrows())])
            
            # If more than 10 collaborations, indicate that
            if len(common_movies) > 10:
                answer += f"\n... and {len(common_movies) - 10} more"
            
            # Add this Q&A pair to collection in ChatML format
            self.qa_pairs.append({
                'messages': [
                    {'role': 'user', 'content': question},
                    {'role': 'assistant', 'content': answer}
                ],
                'metadata': {
                    'type': 'costar',
                    'complexity': 'three-hop'
                }
            })
            
            # Increment successful generation counter
            generated += 1
    
    def generate_director_comparison_queries(self, n=100):
        """
        Aggregation: Compare directors' output in a specific decade
        Example: "Who directed more movies in the 1990s: Spielberg or Scorsese?"
        """
        # Log what we're generating
        logger.info("Generating director comparison queries...")
        
        # Get all unique directors from knowledge base
        directors = self.kb[self.kb['category'] == 'director']['primaryName'].unique()
        # Shuffle to randomize
        random.shuffle(directors)
        
        # Counter for successfully generated queries
        generated = 0
        
        # Loop through directors in pairs (i and i+1)
        for i in range(0, len(directors)-1, 2):
            # If we have enough queries, stop
            if generated >= n:
                break
                
            # Get the two directors to compare
            director1 = directors[i]
            director2 = directors[i+1]
            
            # Get all movies directed by director1
            dir1_data = self.kb[
                (self.kb['primaryName'] == director1) & 
                (self.kb['category'] == 'director')
            ]
            # Get all movies directed by director2
            dir2_data = self.kb[
                (self.kb['primaryName'] == director2) & 
                (self.kb['category'] == 'director')
            ]
            
            # Skip if either director has less than 3 movies (not enough data)
            if len(dir1_data) < 3 or len(dir2_data) < 3:
                continue
            
            # Pick a random decade to compare
            decade = random.choice([1980, 1990, 2000, 2010])
            
            # Get director1's movies in this decade (decade to decade+9 = 10 years)
            dir1_decade = dir1_data[dir1_data['startYear'].between(decade, decade+9)]
            # Get director2's movies in this decade
            dir2_decade = dir2_data[dir2_data['startYear'].between(decade, decade+9)]
            
            # Skip if neither director worked in this decade
            if len(dir1_decade) == 0 and len(dir2_decade) == 0:
                continue
            
            # Count movies for each director
            count1 = len(dir1_decade)
            count2 = len(dir2_decade)
            
            # Create the question
            question = f"Who directed more movies in the {decade}s: {director1} or {director2}?"
            
            # Create the answer based on who directed more
            if count1 > count2:
                # Director1 directed more
                answer = f"{director1} directed more movies in the {decade}s with {count1} films compared to {director2}'s {count2} films."
            elif count2 > count1:
                # Director2 directed more
                answer = f"{director2} directed more movies in the {decade}s with {count2} films compared to {director1}'s {count1} films."
            else:
                # They directed the same amount
                answer = f"Both {director1} and {director2} directed the same number of films in the {decade}s ({count1} each)."
            
            # Add this Q&A pair to collection in ChatML format
            self.qa_pairs.append({
                'messages': [
                    {'role': 'user', 'content': question},
                    {'role': 'assistant', 'content': answer}
                ],
                'metadata': {
                    'type': 'director_comparison',
                    'complexity': 'aggregation'
                }
            })
            
            # Increment successful generation counter
            generated += 1
    
    def generate_genre_queries(self, n=150):
        """
        Aggregation: Genre-specific queries
        Example: "How many Action movies has Tom Cruise appeared in?"
        """
        # Log what we're generating
        logger.info("Generating genre queries...")
        
        # Get all unique people from knowledge base
        people = self.kb['primaryName'].unique()
        # Shuffle to randomize
        random.shuffle(people)
        
        # List of genres to query about
        genres = ['Action', 'Drama', 'Comedy', 'Thriller', 'Horror', 'Sci-Fi']
        
        # Loop through people (up to n)
        for person in people[:n]:
            # Get all data for this person
            person_data = self.kb[self.kb['primaryName'] == person]
            # Get their primary role
            role = person_data['category'].iloc[0]
            
            # Pick a random genre from our list
            genre = random.choice(genres)
            
            # Count movies in that genre (genres column contains comma-separated genres)
            genre_movies = person_data[
                person_data['genres'].str.contains(genre, na=False)
            ]
            
            # Skip if they have no movies in this genre
            if len(genre_movies) == 0:
                continue
            
            # Create question and answer based on role
            if role in ['actor', 'actress']:
                question = f"How many {genre} movies has {person} appeared in?"
                answer = f"{person} has appeared in {len(genre_movies)} {genre} movie(s)."
            elif role == 'director':
                question = f"How many {genre} movies has {person} directed?"
                answer = f"{person} has directed {len(genre_movies)} {genre} movie(s)."
            else:
                # Skip other roles (writers, producers)
                continue
            
            # Add this Q&A pair to collection in ChatML format
            self.qa_pairs.append({
                'messages': [
                    {'role': 'user', 'content': question},
                    {'role': 'assistant', 'content': answer}
                ],
                'metadata': {
                    'type': 'genre_count',
                    'complexity': 'aggregation'
                }
            })
            
            # If we've generated enough, stop
            if len(self.qa_pairs) >= n:
                break
    
    def generate_all(self, total_samples=2000):
        """
        Generate dataset with custom distribution:
        30% single-hop, 40% two-hop, 20% three-hop, 10% aggregation
        """
        # Single-hop: temporal queries
        self.generate_temporal_queries(n=int(total_samples * 0.30))
        
        # Two-hop: range queries
        self.generate_range_queries(n=int(total_samples * 0.40))
        
        # Three-hop: actor-director & co-star combined
        # Split 20%: 10% actor-director, 10% co-star
        self.generate_actor_director_queries(n=int(total_samples * 0.10))
        self.generate_costar_queries(n=int(total_samples * 0.10))
        
        # Aggregation/statistical: director comparison & genre
        # Split 10%: 5% director comparison, 5% genre
        self.generate_director_comparison_queries(n=int(total_samples * 0.05))
        self.generate_genre_queries(n=int(total_samples * 0.05))
        
        logger.info(f"Generated {len(self.qa_pairs)} total Q&A pairs")
        return self

    def split_and_save(self, output_dir='data'):
        """
        Split dataset into train (80%), validation (10%), test (10%)
        and save to separate JSONL files in ChatML format
        """
        # Log that we're starting the split and save process
        logger.info(f"Splitting {len(self.qa_pairs)} Q&A pairs into train/val/test...")
        
        # Shuffle the data to randomize
        random.shuffle(self.qa_pairs)
        
        # Calculate split indices
        total = len(self.qa_pairs)
        train_end = int(total * 0.80)
        val_end = int(total * 0.90)
        
        # Split the data
        train_data = self.qa_pairs[:train_end]
        val_data = self.qa_pairs[train_end:val_end]
        test_data = self.qa_pairs[val_end:]
        
        # Save each split to a separate file
        splits = {
            'train.jsonl': train_data,
            'validation.jsonl': val_data,
            'test.jsonl': test_data
        }
        
        for filename, data in splits.items():
            filepath = f"{output_dir}/{filename}"
            logger.info(f"Saving {len(data)} examples to {filepath}...")
            
            with open(filepath, 'w', encoding='utf-8') as f:
                for qa in data:
                    # Write only the messages (ChatML format), not metadata
                    f.write(json.dumps({'messages': qa['messages']}, ensure_ascii=False) + '\n')
            
            logger.info(f"✓ Saved {len(data)} examples to {filepath}")
        
        # Print summary
        print("\n" + "="*50)
        print("Dataset Split Summary")
        print("="*50)
        print(f"Train:      {len(train_data)} examples ({len(train_data)/total*100:.1f}%)")
        print(f"Validation: {len(val_data)} examples ({len(val_data)/total*100:.1f}%)")
        print(f"Test:       {len(test_data)} examples ({len(test_data)/total*100:.1f}%)")
        print(f"Total:      {total} examples")
        print("="*50 + "\n")
    
    def get_statistics(self):
        """
        Generate and display statistics about the generated dataset
        Shows distribution of query types and complexities
        """
        # Extract metadata for analysis
        metadata_list = [qa['metadata'] for qa in self.qa_pairs]
        df = pd.DataFrame(metadata_list)
        
        # Print header for statistics
        print("\n" + "="*50)
        print("Dataset Statistics")
        print("="*50)
        
        # Print total number of Q&A pairs
        print(f"\nTotal Q&A pairs: {len(self.qa_pairs)}")
        
        # Print distribution by query type
        print("\nDistribution by Query Type:")
        # Count how many of each type we have
        type_counts = df['type'].value_counts()
        # Loop through each type and its count
        for query_type, count in type_counts.items():
            # Calculate percentage of total
            percentage = (count / len(df)) * 100
            # Print type, count, and percentage
            print(f"  {query_type}: {count} ({percentage:.1f}%)")
        
        # Print distribution by complexity level
        print("\nDistribution by Complexity:")
        # Count how many of each complexity level we have
        complexity_counts = df['complexity'].value_counts()
        # Loop through each complexity and its count
        for complexity, count in complexity_counts.items():
            # Calculate percentage of total
            percentage = (count / len(df)) * 100
            # Print complexity, count, and percentage
            print(f"  {complexity}: {count} ({percentage:.1f}%)")
        
        # Print closing line
        print("="*50 + "\n")
    
    def sample_examples(self, n=5):
        """
        Display random sample of generated Q&A pairs for quality checking
        n: number of examples to display
        """
        # Print header
        print("\n" + "="*50)
        print(f"Sample Q&A Pairs (showing {n} random examples)")
        print("="*50 + "\n")
        
        # Get n random samples from our Q&A pairs
        samples = random.sample(self.qa_pairs, min(n, len(self.qa_pairs)))
        
        # Loop through each sample with index
        for i, qa in enumerate(samples, 1):
            # Print example number and type/complexity
            print(f"Example {i} [{qa['metadata']['type']} - {qa['metadata']['complexity']}]")
            # Print separator line
            print("-" * 50)
            # Print the question with Q: prefix
            print(f"Q: {qa['messages'][0]['content']}")
            # Print the answer with A: prefix
            print(f"A: {qa['messages'][1]['content']}")
            # Print blank line between examples
            print()


In [4]:
# Updated usage code
# Step 0: Create processor
processor = IMDbDataProcessor()
# Step 1: Download datasets
processor.download_datasets()
# Step 2: Load and filter data
movies, names, crew_cast, crew = processor.load_and_filter_data()
# Step 3: Create knowledge base
knowledge_base = processor.create_knowledge_base(movies, names, crew_cast, crew)
# Step 4: Initialize QAGenerator
generator = QAGenerator(knowledge_base)
# Step 5: Generate Q&A pairs
generator.generate_all(total_samples=2000)
# Step 6: Check statistics
generator.get_statistics()
# Step 7: Sample some examples
generator.sample_examples(n=5)
# Step 8: Split and save to train/validation/test files
generator.split_and_save(output_dir='.')

INFO:__main__:Starting dataset downloads...
INFO:__main__:title_basics already exists, skipping...
INFO:__main__:title_principals already exists, skipping...
INFO:__main__:name_basics already exists, skipping...
INFO:__main__:title_crew already exists, skipping...
INFO:__main__:title_ratings already exists, skipping...
INFO:__main__:Loading datasets...
INFO:__main__:Processing title.basics...
INFO:__main__:Found 590,284 movies
INFO:__main__:Processing name.basics...
INFO:__main__:Processing title.principals...
INFO:__main__:Found 66,450,257 crew/cast entries
INFO:__main__:Processing title.crew...
INFO:__main__:Found 11,987,707 crew records
INFO:__main__:Building knowledge base...
INFO:__main__:Knowledge base: 3,786,277 records
INFO:__main__:  Actors/Actresses: 2,599,190
INFO:__main__:  Directors: 394,638
INFO:__main__:  Writers: 449,289
INFO:__main__:  Producers: 343,160
INFO:__main__:Generating temporal queries...
INFO:__main__:Generating range queries...
INFO:__main__:Generating acto


Dataset Statistics

Total Q&A pairs: 1301

Distribution by Query Type:
  temporal_single: 600 (46.1%)
  temporal_range: 200 (15.4%)
  actor_director: 200 (15.4%)
  costar: 200 (15.4%)
  director_comparison: 100 (7.7%)
  genre_count: 1 (0.1%)

Distribution by Complexity:
  single-hop: 600 (46.1%)
  three-hop: 400 (30.7%)
  two-hop: 200 (15.4%)
  aggregation: 101 (7.8%)


Sample Q&A Pairs (showing 5 random examples)

Example 1 [actor_director - three-hop]
--------------------------------------------------
Q: Which movies did Rooholah Mofidi act in that were directed by Armaees Aghamaliyan?
A: Rooholah Mofidi acted in 1 movie(s) directed by Armaees Aghamaliyan:
1. The Humans (1964)

Example 2 [temporal_single - single-hop]
--------------------------------------------------
Q: How many movies did George Edwardes-Hall write in 1920?
A: George Edwardes-Hall wrote 4 movie(s) in 1920:
1. Foolish Monte Carlo
2. Desire
3. Desire
4. The Temptress

Example 3 [temporal_single - single-hop]
-------

In [6]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
udev             15G     0   15G   0% /dev
tmpfs           3.0G  556K  3.0G   1% /run
/dev/sda1       148G   18G  124G  13% /
tmpfs            15G     0   15G   0% /dev/shm
tmpfs           5.0M     0  5.0M   0% /run/lock
/dev/sda15      124M   12M  113M  10% /boot/efi
tmpfs           3.0G     0  3.0G   0% /run/user/1000
