In [None]:
import pandas as pd
import requests
import os
from PIL import Image
import io
from tqdm import tqdm
import logging
from dotenv import load_dotenv


# Load environment variables via python-dotenv before reading the key.
load_dotenv()
# An unset or empty variable falls back to the placeholder string.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY") or "YOUR_TMDB_API_KEY"

# INFO-level logging with "<timestamp> - <level> - <message>" lines.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Local poster directory and the URL prefix posters are fetched from.
IMAGE_DIR = "images/"
TMDB_BASE_URL = "https://image.tmdb.org/t/p/w500"
os.makedirs(IMAGE_DIR, exist_ok=True)

# Read the movie table that drives the download loop.
logger.info("Loading dataset...")
df = pd.read_json("cleaned_movies.json")
logger.info(f"Loaded {len(df)} movies")


def is_valid_image(file_path):
    """Report whether the file at ``file_path`` passes Pillow's ``verify()``.

    Any exception raised while opening or verifying the file is logged as a
    warning and turned into a ``False`` result; nothing is re-raised.

    Args:
        file_path: Path of the image file to check.

    Returns:
        True when the file opens and verifies without error, otherwise False.
    """
    try:
        with Image.open(file_path) as image:
            image.verify()
    except Exception as e:
        logger.warning(f"Invalid image {file_path}: {e}")
        return False
    else:
        return True


def _remove_quietly(path):
    """Delete ``path`` if it exists, ignoring filesystem errors."""
    try:
        if os.path.exists(path):
            os.remove(path)
    except OSError:
        pass


def download_poster(row, max_retries=3, backoff_seconds=1.0):
    """Download the poster for one movie record into ``IMAGE_DIR``.

    Args:
        row: Record supporting ``row["poster_path"]`` and ``row["id"]``
            (for example a DataFrame row passed by ``apply(axis=1)``).
        max_retries: Maximum number of HTTP attempts for the poster.
        backoff_seconds: Base delay before a retry; the delay doubles after
            each failed attempt. Use 0 to retry immediately.

    Returns:
        True if a valid poster image is on disk when the call finishes
        (already present or freshly downloaded), otherwise False.
        Network and filesystem errors are logged, never raised.
    """
    import time  # local import so this block does not depend on other cells

    poster_path = row["poster_path"]
    movie_id = row["id"]

    # NaN, None, other non-strings, "" and slash-only values all mean "no poster".
    if not isinstance(poster_path, str) or not poster_path.lstrip("/"):
        logger.warning(f"Missing poster_path for movie ID {movie_id}")
        return False

    filename = poster_path.lstrip("/")
    img_url = f"{TMDB_BASE_URL}/{filename}"
    img_path = os.path.join(IMAGE_DIR, filename)
    # Download to a sibling temp file so the final path never holds a partial image.
    tmp_path = img_path + ".part"

    # Skip the network entirely when a usable copy is already on disk.
    if os.path.exists(img_path) and is_valid_image(img_path):
        return True

    for attempt in range(max_retries):
        try:
            # The TMDB image host serves posters without authentication, so no
            # API key is attached to the request.
            response = requests.get(img_url, timeout=10)
            status = response.status_code
            if status == 200:
                with open(tmp_path, "wb") as f:
                    f.write(response.content)
                if is_valid_image(tmp_path):
                    os.replace(tmp_path, img_path)
                    return True
                _remove_quietly(tmp_path)
                logger.warning(f"Corrupt image for movie ID {movie_id}: {img_url}")
                return False

            logger.warning(f"Failed to download {img_url}: Status {status}")
            # Client errors such as 404 will not change on retry; only
            # 408 (timeout) and 429 (rate limit) are worth another attempt.
            if 400 <= status < 500 and status not in (408, 429):
                return False
        except Exception as e:
            # Best-effort: one failing poster must not abort the whole run.
            logger.error(f"Error downloading {img_url}: {e}")
            _remove_quietly(tmp_path)

        if attempt < max_retries - 1:
            logger.info(f"Retrying ({attempt+2}/{max_retries}) for {img_url}")
            if backoff_seconds > 0:
                time.sleep(backoff_seconds * (2 ** attempt))

    return False


# Try to fetch a poster for every movie; each row records success as a boolean.
logger.info("Downloading posters...")
tqdm.pandas()
df["image_downloaded"] = df.progress_apply(download_poster, axis=1)

# Keep only movies whose poster is available, then drop the helper column.
original_len = len(df)
downloaded_mask = df["image_downloaded"]
df_cleaned = df.loc[downloaded_mask].drop(columns=["image_downloaded"])
logger.info(f"Filtered dataset: {len(df_cleaned)} movies (removed {original_len - len(df_cleaned)})")
if not len(df_cleaned):
    logger.error("No valid images downloaded. Check TMDB API key or poster_path format.")
    raise ValueError("No valid images available")

# Restrict the existing train/val/test splits to the surviving movie ids.
logger.info("Updating train/val/test splits...")
train_df = pd.read_json("train_fixed.json")
val_df = pd.read_json("val_fixed.json")
test_df = pd.read_json("test_fixed.json")

kept_ids = df_cleaned["id"]
train_df_cleaned = train_df.loc[train_df["id"].isin(kept_ids)]
val_df_cleaned = val_df.loc[val_df["id"].isin(kept_ids)]
test_df_cleaned = test_df.loc[test_df["id"].isin(kept_ids)]

# Write each filtered frame next to its source under an "_updated" name.
for frame, out_name in (
    (df_cleaned, "cleaned_movies_updated.json"),
    (train_df_cleaned, "train_fixed_updated.json"),
    (val_df_cleaned, "val_fixed_updated.json"),
    (test_df_cleaned, "test_fixed_updated.json"),
):
    frame.to_json(out_name, orient="records", indent=2)

logger.info(f"Saved updated datasets: Train ({len(train_df_cleaned)}), Val ({len(val_df_cleaned)}), Test ({len(test_df_cleaned)})")
logger.info(f"Images saved in {IMAGE_DIR}")

2025-04-28 02:09:45,061 - INFO - Loading dataset...
2025-04-28 02:09:46,369 - INFO - Loaded 84094 movies
2025-04-28 02:09:46,370 - INFO - Downloading posters...
 38%|███▊      | 31899/84094 [1:38:29<2:03:56,  7.02it/s]2025-04-28 03:48:25,882 - ERROR - Error downloading https://image.tmdb.org/t/p/w500/v373WxuWPaUlincYeu6AIzcZUV8.jpg: HTTPSConnectionPool(host='image.tmdb.org', port=443): Read timed out. (read timeout=10)
2025-04-28 03:48:25,887 - INFO - Retrying (2/3) for https://image.tmdb.org/t/p/w500/v373WxuWPaUlincYeu6AIzcZUV8.jpg
100%|██████████| 84094/84094 [4:15:19<00:00,  5.49it/s]
2025-04-28 06:25:05,536 - INFO - Filtered dataset: 79583 movies (removed 4511)
2025-04-28 06:25:05,539 - INFO - Updating train/val/test splits...
2025-04-28 06:25:08,885 - INFO - Saved updated datasets: Train (63639), Val (7981), Test (7963)
2025-04-28 06:25:08,887 - INFO - Images saved in images/


In [None]:

import os
import pandas as pd
from pathlib import Path

# Inspect the notebook's working directory rather than a machine-specific
# absolute path, so the cell runs unchanged on any checkout of the project.
project_dir = os.getcwd()
# Sorted for a stable listing; directories ending in ".json" are skipped.
json_files = sorted(
    f for f in os.listdir(project_dir)
    if f.endswith(".json") and os.path.isfile(os.path.join(project_dir, f))
)

print("JSON Files Found:")
for file in json_files:
    file_path = os.path.join(project_dir, file)
    size_mb = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
    try:
        # Use a dedicated name so the `df` loaded by the download cell is not
        # silently replaced by whichever JSON file happens to be read last.
        json_df = pd.read_json(file_path)
        row_count = len(json_df)
        columns = list(json_df.columns)
        print(f"{file}: {row_count} rows, {size_mb:.2f} MB, Columns: {columns}")
    except Exception as e:
        # A file that cannot be parsed is reported, not fatal.
        print(f"{file}: Error reading ({e}), {size_mb:.2f} MB")

JSON Files Found:
cleaned_movies.json: 84094 rows, 108.00 MB, Columns: ['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'origin_country', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'num_genres']
cleaned_movies_updated.json: 79583 rows, 127.87 MB, Columns: ['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'origin_country', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'num_genres']
preprocessed_movies.json: 84094 rows, 121.79 MB, Columns: ['adult', 'backdrop_path', 'belong

In [None]:
import os


# Show where the kernel is running.
current_dir = os.getcwd()
print(f"Current Directory: {current_dir}\n")

# Collect the regular files (not sub-directories) directly inside that folder.
print("Files in Current Folder:")
files = []
for entry in os.listdir(current_dir):
    if os.path.isfile(os.path.join(current_dir, entry)):
        files.append(entry)

# Report each file with its size in megabytes, or say the folder is empty.
if files:
    for file in files:
        file_path = os.path.join(current_dir, file)
        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"{file}: {size_mb:.2f} MB")
else:
    print("No files found.")
        

Current Directory: c:\Users\ranje\Jupyter Notebooks\Sem 2\BoAI\Project 1\New start

Files in Current Folder:
best_movie_genre_model.pth: 5.90 MB
best_text_model.pth: 6.04 MB
best_thresholds.npy: 0.00 MB
class_weights.npy: 0.00 MB
cleaned_movies_updated.json: 127.87 MB
cm_text_action.png: 0.02 MB
confusion_matrix_action.png: 0.07 MB
f1_scores_by_genre.png: 0.21 MB
loss_curves.png: 0.16 MB
loss_plot.png: 0.02 MB
movie_genre_model.pth: 5.08 MB
Poster_download.ipynb: 0.00 MB
Project.ipynb: 0.13 MB
test_fixed_updated.json: 16.18 MB
test_labels.npy: 0.61 MB
test_padded.npy: 3.21 MB
test_targets.npy: 1.22 MB
train_fixed_updated.json: 129.65 MB
train_labels.npy: 5.36 MB
train_padded.npy: 28.23 MB
train_targets.npy: 9.75 MB
val_fixed.json: 12.65 MB
val_fixed_updated.json: 16.30 MB
val_labels.npy: 0.61 MB
val_padded.npy: 3.21 MB
val_targets.npy: 1.22 MB
val_text_preds.npy: 0.61 MB
val_text_true.npy: 0.61 MB
