# In [None]:
import csv
import gzip
import shutil
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests

# Local folder that holds the extracted TSV files; created on first run.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Short dataset name -> download URL of the gzipped TSV.
DATASETS = dict(
    name_basics="https://datasets.imdbws.com/name.basics.tsv.gz",
    title_basics="https://datasets.imdbws.com/title.basics.tsv.gz",
    title_ratings="https://datasets.imdbws.com/title.ratings.tsv.gz",
    title_crew="https://datasets.imdbws.com/title.crew.tsv.gz",
    title_akas="https://datasets.imdbws.com/title.akas.tsv.gz",
)

def download_dataset(name, url):
    """Download one gzipped TSV and extract it into DATA_DIR.

    If ``DATA_DIR/<name>.tsv`` already exists the download is skipped.
    Otherwise the archive is streamed to ``<name>.tsv.gz``, decompressed
    into a temporary ``<name>.tsv.part`` file, and only renamed to
    ``<name>.tsv`` once extraction has finished, so an interrupted run
    never leaves a truncated file that a later run would treat as complete.

    Args:
        name: Short dataset name used to build the local file names.
        url: URL of the gzipped TSV to download.

    Returns:
        Path to the extracted ``.tsv`` file.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        OSError: If the archive cannot be written or decompressed.
    """
    gz_path = DATA_DIR / f"{name}.tsv.gz"
    tsv_path = DATA_DIR / f"{name}.tsv"
    partial_path = DATA_DIR / f"{name}.tsv.part"

    if tsv_path.exists():
        print(f"{name} already exists")
        return tsv_path

    print(f"Downloading {name}...")
    try:
        # (connect, read) timeouts keep a stalled server from hanging the run;
        # the context manager releases the connection even on failure.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Fail here instead of saving an HTML error page as the archive.
            response.raise_for_status()
            with open(gz_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        print(f"Extracting {name}...")
        with gzip.open(gz_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        # Publish the TSV only after a complete extraction.
        partial_path.replace(tsv_path)
    finally:
        # Remove the archive and any leftover partial output, on success or failure.
        for leftover in (gz_path, partial_path):
            if leftover.exists():
                leftover.unlink()

    return tsv_path

# Download datasets
print("Downloading IMDB datasets...")
# download_dataset returns the extracted TSV path, so keep it rather than
# rebuilding the same file name by hand when loading.
dataset_paths = {name: download_dataset(name, url) for name, url in DATASETS.items()}


def _read_imdb_tsv(path):
    """Read one tab-separated dataset file into a DataFrame.

    Args:
        path: Path of the extracted ``.tsv`` file.

    Returns:
        DataFrame in which only the ``\\N`` marker is parsed as missing.
    """
    return pd.read_csv(
        path,
        sep='\t',
        na_values='\\N',
        # Only '\N' should count as missing: pandas' default NA strings would
        # also turn real values such as the region code "NA" or a title
        # "None" into NaN.
        keep_default_na=False,
        # The files are plain tab-separated text, so a double quote inside a
        # title is data. With default quoting, pandas treats it as the start
        # of a quoted field and can merge several columns or rows.
        quoting=csv.QUOTE_NONE,
    )


# Load datasets
print("\nLoading datasets...")
df_names = _read_imdb_tsv(dataset_paths['name_basics'])
df_titles = _read_imdb_tsv(dataset_paths['title_basics'])
df_ratings = _read_imdb_tsv(dataset_paths['title_ratings'])
df_crew = _read_imdb_tsv(dataset_paths['title_crew'])
df_akas = _read_imdb_tsv(dataset_paths['title_akas'])

# Question 1: Total people
# The answer is simply the number of rows in the names table.
total_people = df_names.shape[0]
print(f"\nQuestion 1: {total_people:,} people")

# Question 2: Earliest birth year
# Coerce birthYear to numbers (unparseable values become NaN) and keep the
# result as a column so later questions can reuse it.
birth_years = pd.to_numeric(df_names['birthYear'], errors='coerce')
df_names['birthYear_num'] = birth_years
earliest_year = int(birth_years.min())
print(f"Question 2: {earliest_year}")

# Question 3: Years ago
# Distance between the current calendar year and the earliest birth year.
current_year = datetime.now().year
years_ago = current_year - earliest_year
print(f"Question 3: {years_ago} years ago")

# Question 4: Is it correct?
print(f"Question 4: Likely incorrect - a birth year of {earliest_year} is unrealistic")

# Question 5: Most recent birth
recent_year = int(birth_years.max())
print(f"Question 5: {recent_year}")

# Question 6: Missing birth dates
# Share of rows whose raw birthYear is missing, as a percentage.
missing_mask = df_names['birthYear'].isna()
missing = missing_mask.sum()
percentage = (missing / total_people) * 100
print(f"Question 6: {percentage:.2f}%")

# Question 7: Longest short
# Numeric copies of the year and runtime columns; unparseable values become NaN.
df_titles['startYear_num'] = pd.to_numeric(df_titles['startYear'], errors='coerce')
df_titles['runtime_num'] = pd.to_numeric(df_titles['runtimeMinutes'], errors='coerce')

# Shorts that started after 1900 and have a known runtime.
short_mask = (
    (df_titles['titleType'] == 'short')
    & (df_titles['startYear_num'] > 1900)
    & df_titles['runtime_num'].notna()
)
shorts = df_titles[short_mask]
longest_short = int(shorts['runtime_num'].max())
print(f"Question 7: {longest_short} minutes")

# Question 8: Shortest movie
# Movies that started after 1900 with a strictly positive runtime.
movie_mask = (
    (df_titles['titleType'] == 'movie')
    & (df_titles['startYear_num'] > 1900)
    & (df_titles['runtime_num'] > 0)
)
movies = df_titles[movie_mask]
shortest_movie = int(movies['runtime_num'].min())
print(f"Question 8: {shortest_movie} minute(s)")

# Question 9: All genres
# Each non-missing genres cell is a comma-separated list; flatten every cell
# into one de-duplicated, alphabetically sorted list.
all_genres = sorted({
    genre
    for genres in df_titles['genres'].dropna()
    for genre in genres.split(',')
})
print(f"\nQuestion 9: {len(all_genres)} genres")
for genre in all_genres:
    print(f"  - {genre}")

# Question 10: Best comedy
# Movies whose genres string mentions "Comedy" (missing genres never match).
comedy_movie_mask = (
    (df_titles['titleType'] == 'movie')
    & df_titles['genres'].str.contains('Comedy', na=False)
)
comedies = df_titles[comedy_movie_mask]

# Keep only comedies that have a ratings row, then rank by rating with the
# vote count as tie-breaker (both descending); the top row is the answer.
comedies_rated = pd.merge(comedies, df_ratings, on='tconst')
comedies_rated = comedies_rated.sort_values(
    by=['averageRating', 'numVotes'],
    ascending=False,
)
best_comedy = comedies_rated.iloc[0]
print(f"\nQuestion 10: {best_comedy['primaryTitle']}")
print(f"  Rating: {best_comedy['averageRating']}/10")
print(f"  Votes: {best_comedy['numVotes']:,}")

# Question 11: Director
best_id = best_comedy['tconst']
crew_rows = df_crew[df_crew['tconst'] == best_id]
# A title may have no crew row at all; treat that the same as a missing
# directors field instead of raising IndexError on .iloc[0].
crew_info = crew_rows.iloc[0] if not crew_rows.empty else None
director_ids = crew_info['directors'] if crew_info is not None else None
if pd.notna(director_ids):
    # directors is a comma-separated list of ids; report the first one.
    director_id = director_ids.split(',')[0]
    director_rows = df_names[df_names['nconst'] == director_id]
    if director_rows.empty:
        # The id is not in the names table: keep a row-like placeholder so
        # later code that reads director['primaryName'] still works.
        director = pd.Series({
            'nconst': director_id,
            'primaryName': f"Unknown ({director_id})",
        })
    else:
        director = director_rows.iloc[0]
    print(f"\nQuestion 11: {director['primaryName']}")
else:
    print("\nQuestion 11: Not available")

# Question 12: Alternate titles
# All akas rows that belong to the best comedy.
alts = df_akas[df_akas['titleId'] == best_id]
print(f"\nQuestion 12: {len(alts)} alternate titles")

# Show at most the first five as "title (region)".
preview = alts.head(5)
for title, region in zip(preview['title'], preview['region']):
    print(f"  - {title} ({region})")

# Export answers
# Markdown report built from the values computed in the question sections above.
answers = f"""# IMDB Data Analysis Answers

Group Members: [Add your names]
Date: {datetime.now().strftime('%Y-%m-%d')}

## Question 1: Total people in dataset
{total_people:,} people

## Question 2: Earliest birth year
{earliest_year}

## Question 3: Years ago
{years_ago} years ago

## Question 4: Is this correct?
This date is likely incorrect. A birth year of {earliest_year} would be unrealistic 
for any person in the dataset. This is most likely a data entry error.

## Question 5: Most recent birth year
{recent_year}

## Question 6: Percentage without birth date
{percentage:.2f}%

## Question 7: Longest short after 1900
{longest_short} minutes

## Question 8: Shortest movie after 1900
{shortest_movie} minute(s)

## Question 9: All genres
{len(all_genres)} genres: {', '.join(all_genres)}

## Question 10: Highest rated comedy
{best_comedy['primaryTitle']} ({best_comedy['averageRating']}/10, {best_comedy['numVotes']:,} votes)

## Question 11: Director
{director['primaryName'] if pd.notna(director_ids) else 'Not available'}

## Question 12: Alternate titles
{len(alts)} alternate titles found
"""

# Titles and names can contain non-ASCII characters. Without an explicit
# encoding, open() uses the platform's locale encoding, which can raise
# UnicodeEncodeError or garble the text on systems that are not UTF-8.
with open('ANSWERS.md', 'w', encoding='utf-8') as f:
    f.write(answers)

print("\nAnswers saved to ANSWERS.md")