In [None]:
import csv
import gzip
import shutil
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests

# Local folder that holds the extracted IMDb TSV files (created on first run).
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Dataset key -> download URL. Each key maps onto the remote file name by
# swapping '_' for '.', e.g. 'title_basics' -> 'title.basics.tsv.gz'.
_IMDB_BASE_URL = 'https://datasets.imdbws.com'
DATASETS = {
    key: f"{_IMDB_BASE_URL}/{key.replace('_', '.')}.tsv.gz"
    for key in (
        'name_basics',
        'title_basics',
        'title_ratings',
        'title_crew',
        'title_akas',
    )
}

def download_dataset(name, url, data_dir=None, timeout=60):
    """Download a gzipped TSV dataset and extract it, unless already present.

    Parameters
    ----------
    name : str
        Dataset key; the extracted file is stored as ``<name>.tsv``.
    url : str
        Location of the ``.tsv.gz`` archive.
    data_dir : path-like, optional
        Directory for the extracted file. Defaults to the notebook-level
        ``DATA_DIR``.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the notebook forever.

    Returns
    -------
    pathlib.Path
        Path to the extracted ``.tsv`` file.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    target_dir = DATA_DIR if data_dir is None else Path(data_dir)
    gz_path = target_dir / f"{name}.tsv.gz"
    tsv_path = target_dir / f"{name}.tsv"
    # Extraction goes to a scratch file first so that an interrupted run never
    # leaves a truncated <name>.tsv that the exists() check below would accept.
    partial_path = target_dir / f"{name}.tsv.part"

    if tsv_path.exists():
        return tsv_path

    target_dir.mkdir(parents=True, exist_ok=True)
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail early on 4xx/5xx instead of saving an error page as "gzip".
            response.raise_for_status()
            with open(gz_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)

        with gzip.open(gz_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        # Publish the finished file only after decompression fully succeeded.
        partial_path.replace(tsv_path)
    finally:
        # Remove the archive and any leftover scratch file, on success or failure.
        for leftover in (gz_path, partial_path):
            if leftover.exists():
                leftover.unlink()

    return tsv_path

# Download all datasets (files already extracted on disk are skipped)
for name, url in DATASETS.items():
    download_dataset(name, url)

# Parser options shared by every IMDb dump:
# - quoting=csv.QUOTE_NONE: the dumps are plain tab-separated text without a
#   quoting convention, so a literal '"' inside a title or name must not open a
#   quoted field (pandas' default would absorb the following tabs/newlines and
#   merge fields or rows, producing mixed-type columns).
# - keep_default_na=False: only '\N' marks a missing value; otherwise real
#   values such as the region code 'NA' or a title 'None' are read as NaN.
IMDB_READ_OPTIONS = {
    'sep': '\t',
    'na_values': '\\N',
    'keep_default_na': False,
    'quoting': csv.QUOTE_NONE,
}

# Load datasets with memory optimization
df_names = pd.read_csv(DATA_DIR / 'name_basics.tsv', **IMDB_READ_OPTIONS)

title_cols = ['tconst', 'titleType', 'primaryTitle', 'startYear', 'runtimeMinutes', 'genres']
df_titles = pd.read_csv(DATA_DIR / 'title_basics.tsv', usecols=title_cols, **IMDB_READ_OPTIONS)
df_titles['titleType'] = df_titles['titleType'].astype('category')
# Both columns contain NaN, so they stay floating point; downcast='float'
# actually shrinks them (float32 is exact for these integer magnitudes),
# whereas downcast='integer' is a no-op when NaN is present.
df_titles['startYear'] = pd.to_numeric(df_titles['startYear'], errors='coerce', downcast='float')
df_titles['runtimeMinutes'] = pd.to_numeric(df_titles['runtimeMinutes'], errors='coerce', downcast='float')

df_ratings = pd.read_csv(DATA_DIR / 'title_ratings.tsv', **IMDB_READ_OPTIONS)
df_crew = pd.read_csv(DATA_DIR / 'title_crew.tsv', **IMDB_READ_OPTIONS)

# title_akas is the largest file: parse it in chunks, keeping only three columns.
akas_cols = ['titleId', 'title', 'region']
chunks = list(
    pd.read_csv(
        DATA_DIR / 'title_akas.tsv',
        usecols=akas_cols,
        chunksize=5_000_000,
        **IMDB_READ_OPTIONS,
    )
)
df_akas = pd.concat(chunks, ignore_index=True)

# 2: Total number of people in the dataset (one row per person)
total_people = df_names.shape[0]
print(f"Question 2: {total_people:,} people")

# 3: Earliest birth year
# Coerce to numeric so any non-numeric entry becomes NaN and is ignored by min/max.
birth_years = pd.to_numeric(df_names['birthYear'], errors='coerce')
df_names['birthYear_num'] = birth_years
earliest_year = int(birth_years.min())
print(f"Question 3: {earliest_year}")

# 4: Years ago, measured from the current calendar year
current_year = datetime.now().year
years_ago = current_year - earliest_year
print(f"Question 4: {years_ago} years ago")

# 5: Check if birth year is realistic
# The conclusion below is stated unconditionally; it is not derived from the value.
print(f"Question 5: Likely incorrect - a birth year of {earliest_year} is unrealistic")

# 6: Most recent birth year
# NOTE(review): from here on the printed labels ("Question 7", "Question 8", ...)
# are one ahead of these section numbers and of the ANSWERS.md headings --
# confirm which numbering matches the assignment.
recent_year = int(birth_years.max())
print(f"Question 7: {recent_year}")

# 7: Percentage of people without a birth date
missing = df_names['birthYear'].isna().sum()
missing_share = missing / total_people
percentage = missing_share * 100
print(f"Question 8: {percentage:.2f}%")

# Mask reused by both questions: titles that started strictly after 1900.
started_after_1900 = df_titles['startYear'] > 1900

# 8: Longest short after 1900 (rows without a runtime are excluded)
is_short = df_titles['titleType'] == 'short'
has_runtime = df_titles['runtimeMinutes'].notna()
shorts = df_titles[is_short & started_after_1900 & has_runtime]
longest_short = int(shorts['runtimeMinutes'].max())
print(f"Question 9: {longest_short} minutes")

# 9: Shortest movie after 1900 (a strictly positive runtime is required,
# which also drops rows with a missing runtime)
is_movie = df_titles['titleType'] == 'movie'
positive_runtime = df_titles['runtimeMinutes'] > 0
movies = df_titles[is_movie & started_after_1900 & positive_runtime]
shortest_movie = int(movies['runtimeMinutes'].min())
print(f"Question 10: {shortest_movie} minute(s)")

# 10: List of all genres
# Each non-missing 'genres' value is a comma-separated list; collect the
# distinct labels across all titles and sort them alphabetically.
all_genres = sorted({
    label
    for genre_list in df_titles['genres'].dropna()
    for label in genre_list.split(',')
})
print(f"Question 11: {len(all_genres)} genres")
for label in all_genres:
    print(f"  - {label}")

# 11: Highest rated comedy movie
# A title counts as a comedy when 'Comedy' appears anywhere in its genre list;
# regex=False because this is a literal substring match.
is_comedy_movie = (
    (df_titles['titleType'] == 'movie')
    & df_titles['genres'].str.contains('Comedy', na=False, regex=False)
)
comedies = df_titles[is_comedy_movie]
# Inner merge: movies without a ratings row are dropped.
comedies_rated = comedies.merge(df_ratings, on='tconst')
# Highest rating first; ties are broken by the larger number of votes.
comedies_rated = comedies_rated.sort_values(['averageRating', 'numVotes'], ascending=[False, False])
best_comedy = comedies_rated.iloc[0]
print(f"Question 12: {best_comedy['primaryTitle']} ({best_comedy['averageRating']}/10)")

# 12: Director of highest rated comedy
best_id = best_comedy['tconst']
crew_rows = df_crew[df_crew['tconst'] == best_id]
# Guard against a title with no crew row instead of failing on iloc[0].
crew_info = crew_rows.iloc[0] if not crew_rows.empty else None
director_ids = crew_info['directors'] if crew_info is not None else None
if pd.notna(director_ids):
    # 'directors' is a comma-separated list of nconst ids; report the first one.
    director_id = director_ids.split(',')[0]
    director = df_names[df_names['nconst'] == director_id].iloc[0]
    print(f"Question 13: {director['primaryName']}")
else:
    # Same fallback wording as the ANSWERS.md report.
    print("Question 13: Not available")

# 13: Alternate titles for highest rated comedy
alts = df_akas[df_akas['titleId'] == best_id]
print(f"Question 14: {len(alts)} alternate titles")
# Preview at most five entries; a missing region is shown as 'nan'.
for alt in alts.head(5).itertuples(index=False):
    print(f"  - {alt.title} ({alt.region})")


  df_titles = pd.read_csv(DATA_DIR / 'title_basics.tsv', sep='\t', usecols=title_cols, na_values='\\N')


Question 2: 14,946,473 people
Question 3: 4
Question 4: 2021 years ago
Question 5: Likely incorrect - a birth year of 4 is unrealistic
Question 7: 2025
Question 8: 95.58%
Question 9: 1311 minutes
Question 10: 1 minute(s)
Question 11: 28 genres
  - Action
  - Adult
  - Adventure
  - Animation
  - Biography
  - Comedy
  - Crime
  - Documentary
  - Drama
  - Family
  - Fantasy
  - Film-Noir
  - Game-Show
  - History
  - Horror
  - Music
  - Musical
  - Mystery
  - News
  - Reality-TV
  - Romance
  - Sci-Fi
  - Short
  - Sport
  - Talk-Show
  - Thriller
  - War
  - Western
Question 12: O La La (10.0/10)
Question 13: Sripad Pai
Question 14: 2 alternate titles
  - O La La (nan)
  - O La La (IN)


In [None]:
# Separate answers file: render every computed result into a Markdown report.
# Relies on the variables produced by the analysis cell above.
# NOTE(review): headings here are numbered 2-13 consecutively, while the
# analysis cell prints labels up to "Question 14" -- confirm which numbering
# matches the assignment.
answers = f"""# IMDB Data Analysis Answers

Group Members: [Add your names]
Date: {datetime.now().strftime('%Y-%m-%d')}

## Question 2: Total people in dataset
{total_people:,} people

## Question 3: Earliest birth year
{earliest_year}

## Question 4: Years ago
{years_ago} years ago

## Question 5: Check if birth year is realistic
This birth year is likely incorrect. A birth year of {earliest_year} is unrealistic.

## Question 6: Most recent birth year
{recent_year}

## Question 7: Percentage without birth date
{percentage:.2f}%

## Question 8: Longest short after 1900
{longest_short} minutes

## Question 9: Shortest movie after 1900
{shortest_movie} minute(s)

## Question 10: All genres
{len(all_genres)} genres: {', '.join(all_genres)}

## Question 11: Highest rated comedy movie
{best_comedy['primaryTitle']} ({best_comedy['averageRating']}/10)

## Question 12: Director
{director['primaryName'] if pd.notna(director_ids) else 'Not available'}

## Question 13: Alternate titles
{len(alts)} alternate titles found
"""

# Explicit UTF-8: titles and names may contain non-ASCII characters, and the
# platform default encoding (e.g. cp1252 on Windows) may be unable to encode them.
with open('ANSWERS.md', 'w', encoding='utf-8') as f:
    f.write(answers)

print("Answers saved to ANSWERS.md")


Answers saved to ANSWERS.md
