# In [None]:
# Disney Character Popularity Analysis Notebook
# Requirements (pip):
# pip install pandas numpy scikit-learn matplotlib seaborn kaggle joblib

import os
import zipfile
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import warnings
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

# Local directory that holds the dataset CSVs; created if it does not exist yet.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# ---------- (OPTIONAL) Download from Kaggle ----------
# If you want the notebook to download the dataset via Kaggle API:
# 1) Install kaggle (pip install kaggle)
# 2) Create ~/.kaggle/kaggle.json with your API token
# 3) Uncomment the following lines to download
#
# import subprocess
# datasets = [
#     "thedevastator/disney-character-success-a-comprehensive-analysi",
#     "suvroo/disney-movies-dataset"
# ]
# for d in datasets:
#     subprocess.run(["kaggle", "datasets", "download", "-d", d, "-p", str(DATA_DIR)], check=True)
#     # unzip
#     zips = list(DATA_DIR.glob("*.zip"))
#     for z in zips:
#         with zipfile.ZipFile(z, "r") as zf:
#             zf.extractall(DATA_DIR)
#         z.unlink()

# NOTE: If you already have the CSVs, put them in ./data/ and skip the Kaggle download above.

# ---------- 1) Load CSVs ----------
# Default filenames are those commonly contained in the Character Success dataset.
# If your filenames differ, adjust the paths below (or rely on the glob search).

chars_path = DATA_DIR / "disney-characters.csv"            # from thedevastator dataset
movies_gross_path = DATA_DIR / "disney_movies_total_gross.csv"
movies_all_path = DATA_DIR / "disney_movies_total_gross.csv"  # fallback to same if only one exists

# Try a few likely filenames (some Kaggle dumps use slightly different names).
# Path.glob yields entries in arbitrary, filesystem-dependent order, so each
# match list is sorted to make the chosen file reproducible between runs.
candidates = (
    sorted(DATA_DIR.glob("**/*characters*.csv"))
    + sorted(DATA_DIR.glob("**/*character*.csv"))
)
if candidates:
    chars_path = candidates[0]

# Never let the movie search pick the file already chosen as the character file
# (e.g. a name such as "disney_movies_characters.csv" matches both patterns).
movies_candidates = [
    p
    for p in sorted(DATA_DIR.glob("**/*gross*.csv")) + sorted(DATA_DIR.glob("**/*movies*.csv"))
    if p != chars_path
]
if movies_candidates:
    movies_gross_path = movies_candidates[0]

print("Using character file:", chars_path)
print("Using movie gross file:", movies_gross_path)

# Fail early with an actionable message instead of a bare read_csv error.
for label, path in (("character", chars_path), ("movie gross", movies_gross_path)):
    if not path.is_file():
        raise FileNotFoundError(
            f"Could not find the {label} CSV at '{path}'. "
            f"Place the dataset CSVs in '{DATA_DIR}/' or run the Kaggle download step."
        )

df_chars = pd.read_csv(chars_path)
df_movies = pd.read_csv(movies_gross_path)

print("chars shape:", df_chars.shape)
print("movies shape:", df_movies.shape)

# ---------- Quick look ----------
display(df_chars.head())
display(df_movies.head())

# ---------- 2) Merge & initial cleaning ----------
# We want: a per-movie row containing movie metadata + main_character + villain + songs + gross
# Identify the relevant columns present in the files and standardize them.

# Inspect columns
print("chars columns:", df_chars.columns.tolist())
print("movies columns:", df_movies.columns.tolist())


def _standardize_title_column(frame, label, aliases=("movie_title", "movie")):
    """Return *frame* with its movie-title column named 'title'.

    An existing 'title' column is left untouched (renaming an alias on top of
    it would create two columns with the same name). Otherwise the first alias
    found in *frame* is renamed to 'title'.

    Raises:
        KeyError: if neither 'title' nor any alias is present; *label* names
            the offending frame in the message.
    """
    if "title" in frame.columns:
        return frame
    for alias in aliases:
        if alias in frame.columns:
            return frame.rename(columns={alias: "title"})
    raise KeyError(
        f"No title column found in the {label} data; expected one of "
        f"{('title',) + tuple(aliases)}, got {frame.columns.tolist()}"
    )


# Common column names: 'movie_title', 'title', 'release_date', 'total_gross', 'inflation_adjusted_gross'
# Both frames get the same treatment so that either spelling works in either file.
df_chars = _standardize_title_column(df_chars, "characters")
df_movies = _standardize_title_column(df_movies, "movies")

# Merge on title (use left join of characters into movies)
# But first, ensure title string normalization
def norm_title(s):
    """Return a join key for a movie title: trimmed and lower-cased.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged; every other value is converted with ``str`` first.
    """
    return s if pd.isna(s) else str(s).strip().lower()

# Build the join key. norm_title is applied to the raw values (no astype(str)
# first) so that a missing title stays missing instead of becoming the
# literal string "nan", which would make all untitled rows match each other.
df_chars['title_norm'] = df_chars['title'].apply(norm_title)
df_movies['title_norm'] = df_movies['title'].apply(norm_title)

# pandas joins NaN keys to NaN keys, so character rows without a title are
# removed from the right-hand side; untitled movies simply stay unmatched.
chars_with_title = df_chars[df_chars['title_norm'].notna()]

# There may be duplicates if dataset has one row per character — reduce to main character rows
# The chars file often has 'main_character' column; if not, we try to infer by role.
if 'main_character' in df_chars.columns:
    df_main = chars_with_title.copy()
else:
    # fallback: keep the first listed character row per title, in file order.
    # drop_duplicates keeps that row intact, whereas groupby().first() would
    # stitch together the first non-null value of each column from different rows.
    df_main = (
        chars_with_title
        .drop_duplicates(subset='title_norm', keep='first')
        .reset_index(drop=True)
    )

# Merge: movie metadata + main-character-level info.
# Columns present in both inputs (e.g. 'title') come out suffixed _movie / _char.
df = pd.merge(df_movies, df_main, on='title_norm', how='left', suffixes=('_movie','_char'))
print("Merged DF shape:", df.shape)
display(df.head())

# ---------- 3) Create target: 'popular' ----------
# We'll make a binary target: popular vs not popular based on inflation_adjusted_gross.
# If inflation_adjusted_gross not present, use total_gross.

# Preferred columns first, then any column whose name contains 'gross'.
gross_col = next(
    (c for c in ('inflation_adjusted_gross', 'adjusted_gross', 'total_gross') if c in df.columns),
    None,
)
if gross_col is None:
    gross_col = next((c for c in df.columns if 'gross' in str(c).lower()), None)
if gross_col is None:
    # Without a revenue column there is nothing to derive the label from.
    raise KeyError(f"No gross-revenue column found in merged data: {df.columns.tolist()}")

print("Chosen gross column:", gross_col)

# Revenue may be stored as currency text such as "$184,925,485"; strip the
# symbols first, otherwise to_numeric would coerce every value to NaN.
if not pd.api.types.is_numeric_dtype(df[gross_col]):
    df[gross_col] = df[gross_col].astype(str).str.replace(r'[$,\s]', '', regex=True)
df[gross_col] = pd.to_numeric(df[gross_col], errors='coerce')

# A movie with unknown revenue has no label; keeping it would silently mark
# it "not popular" (NaN >= threshold is False), so such rows are removed.
n_unlabelled = int(df[gross_col].isna().sum())
if n_unlabelled:
    print(f"Dropping {n_unlabelled} rows with missing or unparseable '{gross_col}'")
    df = df.dropna(subset=[gross_col]).copy()

# Define popularity threshold: top 33% as 'popular'
popularity_threshold = df[gross_col].quantile(0.67)
df['popular'] = (df[gross_col] >= popularity_threshold).astype(int)
print(df['popular'].value_counts())

# ---------- 4) Feature engineering ----------
# Basic, interpretable features from the columns commonly present.
# - release_year, release_month
# - has_villain (boolean if villain name present)
# - songs_count (count of songs listed)
# - title_length (number of words in title)
# - main_character_name_length
# - main_character_freq (how many movies include same main character in dataset) — a proxy for franchise strength

def safe_len_str(s):
    """Count the whitespace-separated words in *s*; missing values count as 0."""
    return 0 if pd.isna(s) else len(str(s).split())

def _first_present(frame, candidates):
    """Return the first name in *candidates* that is a column of *frame*, or None."""
    return next((col for col in candidates if col in frame.columns), None)


# release date -> year/month
# A column that exists in both merged inputs carries a _movie/_char suffix,
# so the suffixed spellings are checked as well as the plain one.
date_col = _first_present(df, ['release_date', 'release_date_movie', 'release_date_char'])
if date_col is not None:
    # try to parse; unparseable dates become NaT and yield NaN year/month
    df['release_date_parsed'] = pd.to_datetime(df[date_col], errors='coerce')
    df['release_year'] = df['release_date_parsed'].dt.year
    df['release_month'] = df['release_date_parsed'].dt.month
elif 'year' in df.columns:
    df['release_year'] = pd.to_numeric(df['year'], errors='coerce')
    df['release_month'] = np.nan
else:
    df['release_year'] = np.nan
    df['release_month'] = np.nan

# has_villain: True when a non-blank villain name is present.
# 'villian' is accepted too, since some exports misspell the column.
villain_col = _first_present(df, ['villain', 'villian'])
if villain_col is not None:
    df['has_villain'] = df[villain_col].notna() & (df[villain_col].astype(str).str.strip() != "")
else:
    df['has_villain'] = False

# songs_count: number of non-empty ';'-separated entries.
songs_col = _first_present(df, ['songs', 'song'])
if songs_col is not None:
    df['songs_count'] = df[songs_col].fillna("").apply(
        lambda s: sum(1 for part in str(s).split(';') if part.strip())
    )
else:
    df['songs_count'] = 0

# title_len_words: prefer the movie-side title, fall back to the other title
# columns. After the merge 'title' only exists when a single input had it, so
# it cannot be indexed unconditionally.
title_cols = [c for c in ('title_movie', 'title', 'title_char') if c in df.columns]
if title_cols:
    title_values = df[title_cols[0]]
    for extra_col in title_cols[1:]:
        title_values = title_values.fillna(df[extra_col])
    # safe_len_str maps missing titles to 0 words (no "nan" pseudo-title).
    df['title_len_words'] = title_values.apply(safe_len_str)
else:
    df['title_len_words'] = 0

# main character name length
name_col = _first_present(df, ['main_character', 'character', 'hero'])
if name_col:
    df['main_char_name_len'] = df[name_col].fillna("").astype(str).apply(len)
    # frequency of main_character in dataset (rows without a name get 0)
    df['main_char_freq'] = df[name_col].map(df[name_col].value_counts()).fillna(0)
else:
    df['main_char_name_len'] = 0
    df['main_char_freq'] = 0

# Genre / mpaa rating from movies df if present
genre_col = None
possible_genres = [c for c in df.columns if 'genre' in c.lower()]
if possible_genres:
    genre_col = possible_genres[0]
    # some rows have multiple genres -> keep first genre
    df['genre_primary'] = df[genre_col].fillna("").astype(str).apply(lambda s: s.split(',')[0].strip() if s else 'unknown')
else:
    df['genre_primary'] = 'unknown'

# Keep a working features DataFrame
features = [
    'release_year', 'release_month', 'has_villain', 'songs_count',
    'title_len_words', 'main_char_name_len', 'main_char_freq', 'genre_primary'
]
df_model = df[features + ['popular', gross_col]].copy()
print("Prepared model df shape:", df_model.shape)
display(df_model.head())

# ---------- 5) Split: time-based train / val / test ----------
# We'll train on movies released <= 2010, validate on 2011-2013, test on >=2014.
# This simulates predicting newer movies from historical patterns.


def _stratified_split(frame, target='popular', drop_cols=(), seed=42):
    """Split *frame* 70/15/15 into (train, val, test), stratified on *target*.

    *drop_cols* are removed from the returned frames; each returned frame
    keeps its feature columns plus the *target* column.
    """
    X = frame.drop(columns=[target, *drop_cols])
    y = frame[target]
    X_tr, X_rest, y_tr, y_rest = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=seed
    )
    X_va, X_te, y_va, y_te = train_test_split(
        X_rest, y_rest, test_size=0.5, stratify=y_rest, random_state=seed
    )
    return (
        X_tr.assign(**{target: y_tr}),
        X_va.assign(**{target: y_va}),
        X_te.assign(**{target: y_te}),
    )


# If release_year is missing for many rows, fallback to random stratified split.
use_time_split = df_model['release_year'].isna().mean() < 0.5
if use_time_split:
    train_df = df_model[df_model['release_year'] <= 2010].copy()
    val_df   = df_model[(df_model['release_year'] > 2010) & (df_model['release_year'] <= 2013)].copy()
    test_df  = df_model[df_model['release_year'] >= 2014].copy()
    # If any set is too small (small dataset) fallback to stratified split.
    # The test set is checked too, so it cannot end up empty.
    if len(train_df) < 10 or len(val_df) < 5 or len(test_df) < 5:
        print("Time-split produced too-small sets, falling back to stratified split.")
        use_time_split = False
    else:
        # Rows without a release year match none of the three year filters.
        n_undated = int(df_model['release_year'].isna().sum())
        if n_undated:
            print(f"Note: {n_undated} rows without release_year are excluded from the time-based split.")

if not use_time_split:
    train_df, val_df, test_df = _stratified_split(df_model, drop_cols=[gross_col])

print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))
print("Train popular distribution:\n", train_df['popular'].value_counts(normalize=True))

# ---------- 6) Preprocessing pipelines ----------
# Column groups routed to each sub-pipeline.
cat_features = ['genre_primary', 'has_villain']  # has_villain is one-hot encoded like a category
num_features = [
    'release_year',
    'release_month',
    'songs_count',
    'title_len_words',
    'main_char_name_len',
    'main_char_freq',
]

# Numeric columns: median-impute, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the token 'missing', then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Any column not listed in either group is dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ],
    remainder='drop',
)

# ---------- 7) Model pipeline ----------
# Random forest with a fixed seed, using all available cores.
clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Preprocessing and classifier chained into one estimator.
pipe = Pipeline([('preproc', preprocessor), ('clf', clf)])

# Targets and feature matrices; the label and the revenue column it was
# derived from are removed from the features when present.
y_train = train_df['popular']
y_val = val_df['popular']
X_train = train_df.drop(columns=['popular', gross_col], errors='ignore')
X_val = val_df.drop(columns=['popular', gross_col], errors='ignore')

# Fit baseline model
pipe.fit(X_train, y_train)

# ---------- 8) Validation evaluation ----------
y_val_pred = pipe.predict(X_val)
print("Validation accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation F1 (macro):", f1_score(y_val, y_val_pred, average='macro'))
print("\nClassification report:\n", classification_report(y_val, y_val_pred))

# Confusion matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the
# validation labels and predictions, so it always matches the two tick labels.
class_names = ['not_popular', 'popular']
cm = confusion_matrix(y_val, y_val_pred, labels=[0, 1])
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Validation Confusion Matrix")
plt.show()

# Feature importances (map back to feature names)
# Need to extract feature names from the fitted preprocessor.
fitted_preproc = pipe.named_steps['preproc']
ohe = fitted_preproc.named_transformers_['cat'].named_steps['onehot']
ohe_features = ohe.get_feature_names_out(cat_features)

# With strategy='median', SimpleImputer discards columns that were entirely
# missing at fit time (their statistic is NaN) unless keep_empty_features is
# set. Those columns never reach the classifier, so they must be left out of
# the name list or it will not line up with feature_importances_.
num_imputer = fitted_preproc.named_transformers_['num'].named_steps['imputer']
if getattr(num_imputer, 'keep_empty_features', False):
    num_features_out = list(num_features)
else:
    num_features_out = [
        name for name, stat in zip(num_features, num_imputer.statistics_) if not np.isnan(stat)
    ]

all_features = list(num_features_out) + list(ohe_features)
importances = pipe.named_steps['clf'].feature_importances_
if len(all_features) != len(importances):
    raise ValueError(
        f"Feature name count ({len(all_features)}) does not match "
        f"importance count ({len(importances)})"
    )
feat_imp = pd.Series(importances, index=all_features).sort_values(ascending=False)
print("Top features by importance:\n", feat_imp.head(15))

# ---------- 9) Quick EDA plots (popularity vs features) ----------
# Both plots use the training and validation rows stacked together.
eda_df = pd.concat([train_df, val_df])

# Distribution of song counts per popularity class.
plt.figure(figsize=(8, 4))
sns.boxplot(data=eda_df, x='popular', y='songs_count')
plt.title("Songs count vs Popularity")
plt.show()

# Number of movies per primary genre, split by popularity class.
plt.figure(figsize=(8, 4))
sns.countplot(data=eda_df, x='genre_primary', hue='popular')
plt.title("Genre vs Popularity (train+val)")
plt.xticks(rotation=45)
plt.show()

# ---------- 10) Save the pipeline ----------
# Persist the fitted preprocessing + classifier pipeline to the working directory.
joblib.dump(value=pipe, filename="disney_popularity_pipeline.joblib")
print("Saved pipeline to disney_popularity_pipeline.joblib")

# You're now ready to run final test evaluation on test_df if you wish. The notebook stops at validation per request.
