# 🎬 Movie Analysis – TMDb Movies

## 📄 Issue Info / Briefing:
A film studio wants to know the factors that influence a film's financial success. They want to optimise their production strategy based on genre, budget, cast popularity, and other metrics that can impact a film's profitability.


## 🧠 1. Business Understanding
**Objective:** Determine the film production strategy based on financial success analysis, including the influence of key cast and crew.

**Key Questions:**
- Does budget affect a film's success?
- What genres are the most profitable?
- What is the relationship between ratings and profit?
- Do popular actors or directors impact profitability?

---

## Import Libraries

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Download datasets from Kaggle

In [None]:
def download_dataset(kaggle_json_path, download_path="../data/raw"):
    """Download and unzip the TMDb movie metadata dataset from Kaggle.

    Points ``KAGGLE_CONFIG_DIR`` at the directory that contains
    ``kaggle_json_path``, authenticates a ``KaggleApi`` client and downloads
    the ``tmdb/tmdb-movie-metadata`` dataset into ``download_path``.

    Args:
        kaggle_json_path: Path to the ``kaggle.json`` credentials file.
        download_path: Directory the dataset is downloaded and unzipped into.
    """
    # dirname() of a bare filename such as "kaggle.json" is "", which would
    # leave the config directory empty; resolve to an absolute path first.
    config_dir = os.path.dirname(os.path.abspath(kaggle_json_path))
    os.environ['KAGGLE_CONFIG_DIR'] = config_dir
    # NOTE(review): KaggleApi is imported at module load; confirm the installed
    # kaggle version reads KAGGLE_CONFIG_DIR at authenticate() time.
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files("tmdb/tmdb-movie-metadata", path=download_path, unzip=True)
    print("✅ Dataset downloaded and extracted to", download_path)

In [None]:
kaggle_json_path = "../kaggle.json"  # Adjust path if needed
raw_data_dir = "../data/raw"
# Check for the CSV files the notebook actually reads rather than just the
# directory: an empty or partially extracted folder must still trigger a download.
required_raw_files = ("tmdb_5000_movies.csv", "tmdb_5000_credits.csv")
if not all(os.path.exists(os.path.join(raw_data_dir, name)) for name in required_raw_files):
    download_dataset(kaggle_json_path, download_path=raw_data_dir)

## 2. Data Acquisition

In [None]:
# Load the movie metadata and the credits table, in that order.
raw_csv_paths = ('../data/raw/tmdb_5000_movies.csv', '../data/raw/tmdb_5000_credits.csv')
movies_df, credits_df = map(pd.read_csv, raw_csv_paths)

## 3. Data Assessing / Understanding

In [None]:
# Show the first rows of the columns used throughout the analysis.
print("Initial Data Overview:")
print(movies_df.head()[['budget', 'revenue', 'genres', 'vote_average', 'release_date']])
print(credits_df.head()[['movie_id', 'title', 'cast', 'crew']])

## 4. Data Wrangling

### 4.1 Merge movies_df and credits_df

In [None]:
# Join on the movie identifier rather than the title: titles are not unique,
# so a title join pairs every film with the credits of every same-titled film
# and duplicates rows. The credits' own title column is dropped so the merged
# frame keeps a single 'title' column, as before.
movies_df = movies_df.merge(
    credits_df.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
    how='inner',
)

### 4.2 Filter valid data and calculate profit

In [None]:
# Keep only rows where both budget and revenue are positive.
has_financials = (movies_df['budget'] > 0) & (movies_df['revenue'] > 0)
# .copy() detaches the filtered rows from the original frame so the column
# assignments that follow do not raise pandas' SettingWithCopyWarning.
movies_df = movies_df[has_financials].copy()
movies_df['profit'] = movies_df['revenue'] - movies_df['budget']

### 4.3 Convert the date and take the year of release

In [None]:
# Parse release dates; values that cannot be parsed become NaT instead of raising.
release_dates = pd.to_datetime(movies_df['release_date'], errors='coerce')
movies_df['release_date'] = release_dates
movies_df['release_year'] = release_dates.dt.year

### 4.4 Extraction of main genres

In [None]:
def extract_main_genre(genre_str):
    """Return the name of the first genre in a serialized genre list.

    Args:
        genre_str: JSON text encoding a list of objects that carry a
            ``name`` key. Single-quoted (Python-repr style) text is accepted
            as a fallback.

    Returns:
        The ``name`` of the first list entry, or ``None`` when the input is
        not a string, cannot be parsed, is empty, or has no usable first entry.
    """
    if not isinstance(genre_str, str):
        return None

    genres = None
    # Parse the text as-is first: blindly swapping ' for " corrupts valid JSON
    # whose values contain apostrophes. The swap is kept only as a fallback
    # for single-quoted input.
    for candidate in (genre_str, genre_str.replace("'", '"')):
        try:
            genres = json.loads(candidate)
            break
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

    if not isinstance(genres, list) or not genres:
        return None
    first = genres[0]
    return first.get('name') if isinstance(first, dict) else None

# Derive each film's main genre from its serialized genre list.
movies_df['main_genre'] = movies_df['genres'].map(extract_main_genre)

### 4.5 Extract the director and lead actor from the merged credits columns (`crew`, `cast`)

In [None]:
def extract_director(crew_str):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        crew_str: JSON text encoding a list of objects that carry ``job``
            and ``name`` keys. Single-quoted (Python-repr style) text is
            accepted as a fallback.

    Returns:
        The director's ``name``, or ``None`` when the input is not a string,
        cannot be parsed, or contains no entry with job 'Director'.
    """
    if not isinstance(crew_str, str):
        return None

    crew = None
    # Parse the text as-is first: blindly swapping ' for " corrupts valid JSON
    # whose values contain apostrophes (e.g. names like O'Brien). The swap is
    # kept only as a fallback for single-quoted input.
    for candidate in (crew_str, crew_str.replace("'", '"')):
        try:
            crew = json.loads(candidate)
            break
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

    if not isinstance(crew, list):
        return None
    for person in crew:
        # Skip malformed entries instead of abandoning the whole search.
        if isinstance(person, dict) and person.get('job') == 'Director':
            return person.get('name')
    return None

# Derive each film's director from its serialized crew list.
movies_df['director'] = movies_df['crew'].map(extract_director)

In [None]:
def extract_main_actor(cast_str):
    """Return the name of the first cast member in a serialized cast list.

    Args:
        cast_str: JSON text encoding a list of objects that carry a ``name``
            key. Single-quoted (Python-repr style) text is accepted as a
            fallback.

    Returns:
        The ``name`` of the first list entry, or ``None`` when the input is
        not a string, cannot be parsed, is empty, or has no usable first entry.
    """
    if not isinstance(cast_str, str):
        return None

    cast = None
    # Parse the text as-is first: blindly swapping ' for " corrupts valid JSON
    # whose values contain apostrophes, which would discard the whole cast
    # list. The swap is kept only as a fallback for single-quoted input.
    for candidate in (cast_str, cast_str.replace("'", '"')):
        try:
            cast = json.loads(candidate)
            break
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

    if not isinstance(cast, list) or not cast:
        return None
    lead = cast[0]
    return lead.get('name') if isinstance(lead, dict) else None

# Derive each film's lead actor from its serialized cast list.
movies_df['main_actor'] = movies_df['cast'].map(extract_main_actor)

### 4.6 Drop missing values

In [None]:
# Drop rows lacking any derived field. .copy() makes the result an independent
# frame so later column assignments (director_score, actor_score) do not raise
# pandas' SettingWithCopyWarning.
movies_df = movies_df.dropna(
    subset=['release_year', 'main_genre', 'director', 'main_actor']
).copy()

## 5. Exploratory Data Analysis (EDA)

In [None]:
# Create the output folder for figures; exist_ok avoids the check-then-create
# race and matches how the processed-data folder is created later on.
os.makedirs("../visualizations", exist_ok=True)

In [None]:
# Profit per genre: mean profit of each main genre, most profitable first
genre_profit = (
    movies_df.groupby('main_genre')['profit']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.barplot(data=genre_profit, x='main_genre', y='profit')
plt.xticks(rotation=45)
plt.title('Average Profit by Main Genre')
plt.tight_layout()
plt.savefig('../visualizations/eda_genre_profit.png')
plt.show()

In [None]:
# Budget vs Revenue, one point per film, coloured by main genre
budget_fig = plt.figure(figsize=(10, 6))
budget_ax = sns.scatterplot(
    data=movies_df,
    x='budget',
    y='revenue',
    hue='main_genre',
    alpha=0.6,
)
budget_ax.set_title('Budget vs Revenue')
budget_fig.tight_layout()
budget_fig.savefig('../visualizations/eda_budget_revenue.png')
plt.show()

In [None]:
# Ten directors with the highest summed profit across their films
top_directors = movies_df.groupby('director')['profit'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(10,6))
top_directors.plot(kind='bar', color='skyblue')
plt.title('Top 10 Directors by Total Profit')
plt.ylabel('Total Profit')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../visualizations/eda_top_directors.png')
# Display the figure explicitly, consistent with the other EDA cells.
plt.show()

In [None]:
# Score every director by the mean profit of their films and attach it per row.
# NOTE: the mean is taken over all rows, so the score of a film already
# includes that film's own profit.
mean_profit_by_director = movies_df.groupby('director')['profit'].mean()
director_profit_score = mean_profit_by_director.to_dict()
movies_df['director_score'] = movies_df['director'].map(director_profit_score)

In [None]:
# Ten lead actors with the highest summed profit across their films
top_actors = movies_df.groupby('main_actor')['profit'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(10,6))
top_actors.plot(kind='bar', color='salmon')
plt.title('Top 10 Actors by Total Profit')
plt.ylabel('Total Profit')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../visualizations/eda_top_actors.png')
# Display the figure explicitly, consistent with the other EDA cells.
plt.show()

In [None]:
# Score every lead actor by the mean profit of their films and attach it per row.
# NOTE: the mean is taken over all rows, so the score of a film already
# includes that film's own profit.
mean_profit_by_actor = movies_df.groupby('main_actor')['profit'].mean()
actor_profit_score = mean_profit_by_actor.to_dict()
movies_df['actor_score'] = movies_df['main_actor'].map(actor_profit_score)

## 6. Modeling: Predict revenue from budget, vote_average, vote_count, runtime, director_score, actor_score

In [None]:
features = ['budget', 'vote_average', 'vote_count', 'runtime', 'director_score', 'actor_score']
numeric_features = ['budget', 'vote_average', 'vote_count', 'runtime']

# Rows need every raw input plus the columns the score features are built from.
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
data_model = movies_df.dropna(
    subset=numeric_features + ['revenue', 'profit', 'director', 'main_actor']
).copy()

# Split data by row position first, so the score features can be fitted on
# the training rows only.
train_pos, test_pos = train_test_split(
    np.arange(len(data_model)), test_size=0.2, random_state=42
)
train_rows = data_model.iloc[train_pos]

# director_score / actor_score are mean profits, and profit is derived from
# revenue (the target). Averaging over all rows would leak test-set revenue
# into the features, so the means come from the training rows alone. People
# who never appear in the training rows get the overall training mean.
fallback_score = train_rows['profit'].mean()
director_means = train_rows.groupby('director')['profit'].mean()
actor_means = train_rows.groupby('main_actor')['profit'].mean()
data_model['director_score'] = data_model['director'].map(director_means).fillna(fallback_score)
data_model['actor_score'] = data_model['main_actor'].map(actor_means).fillna(fallback_score)

X = data_model[features]
y = data_model['revenue']
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Train model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Evaluate model on the held-out rows
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"R²   : {r2:.4f}")
print(f"MSE  : {mse:,.2f}")
print(f"RMSE : {rmse:,.2f}")
print(f"MAE  : {mae:,.2f}")

# Save modeling results (predictions for every row, train and test alike)
data_model['predicted_revenue'] = lr_model.predict(X)

## 7. Clustering

In [None]:
cluster_data = data_model[['vote_average', 'profit', 'budget']]
# KMeans uses Euclidean distance. profit and budget are currency amounts while
# vote_average is a rating, so without standardization the money columns would
# dominate the distances and the rating would barely affect the clusters.
scaled_cluster_data = StandardScaler().fit_transform(cluster_data)
# n_init is pinned because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
data_model['cluster_label'] = kmeans.fit_predict(scaled_cluster_data)

In [None]:
# Write the cleaned data and the modeling output as CSV files and as a
# two-sheet Excel workbook.
output_path = "../data/processed"
os.makedirs(output_path, exist_ok=True)

modeling_csv = f"{output_path}/movie_modeling_with_credit.csv"
cleaned_csv = f"{output_path}/cleaned_movie_data.csv"
dashboard_xlsx = f"{output_path}/movie_dashboard_data.xlsx"

data_model.to_csv(modeling_csv, index=False)
movies_df.to_csv(cleaned_csv, index=False)

with pd.ExcelWriter(dashboard_xlsx) as writer:
    for sheet_name, frame in (('CleanedData', movies_df), ('ModelingOutput', data_model)):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print("✅ All files saved successfully.")