# 🎬 Movie Analysis – TMDb Movies

## 📄 Issue Info / Briefing:
A movie studio wants to know what makes a movie financially successful. They want to optimize their production strategy based on genre, budget, and other metrics that can affect a film's profitability.


## 🧠 1. Business Understanding
**Objective:** Determine a movie production strategy based on an analysis of what drives financial success.

**Key Questions:**
- Does budget affect the success of a movie?
- What genres are most profitable?
- How do movie ratings relate to profitability?

---

## Import Libraries

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Download datasets from Kaggle

In [None]:
def download_dataset(kaggle_json_path, download_path="../data/raw"):
    """Download and unzip the TMDb movie metadata dataset from Kaggle.

    Args:
        kaggle_json_path: Path to the ``kaggle.json`` credentials file. The
            directory containing it is exported as ``KAGGLE_CONFIG_DIR``.
        download_path: Directory the dataset is downloaded to and unzipped in.
    """
    # Resolve to an absolute path before taking the directory: for a bare
    # filename such as "kaggle.json", os.path.dirname() returns "" and
    # KAGGLE_CONFIG_DIR would be set to an empty string instead of the folder
    # that actually holds the credentials file.
    config_dir = os.path.dirname(os.path.abspath(kaggle_json_path))
    os.environ['KAGGLE_CONFIG_DIR'] = config_dir

    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files("tmdb/tmdb-movie-metadata", path=download_path, unzip=True)
    print("✅ Dataset downloaded and extracted to", download_path)

In [None]:
kaggle_json_path = "../kaggle.json"  # Adjust path if needed
raw_data_dir = "../data/raw"
required_raw_files = ("tmdb_5000_movies.csv", "tmdb_5000_credits.csv")

# Look for the extracted CSVs themselves rather than just the folder: an empty
# or partially extracted ../data/raw would otherwise skip the download and the
# later pd.read_csv calls would fail.
already_extracted = all(
    os.path.isfile(os.path.join(raw_data_dir, filename))
    for filename in required_raw_files
)
if not already_extracted:
    download_dataset(kaggle_json_path, download_path=raw_data_dir)

## 2. Data Acquisition

In [None]:
# Load the two raw TMDb tables (movie metadata first, then credits).
movies_df, credits_df = (
    pd.read_csv(csv_file)
    for csv_file in (
        '../data/raw/tmdb_5000_movies.csv',
        '../data/raw/tmdb_5000_credits.csv',
    )
)

## 3. Data Assessment / Understanding

In [None]:
# Preview the first rows of the columns the analysis relies on.
overview_columns = ['budget', 'revenue', 'genres', 'vote_average', 'release_date']
overview = movies_df.loc[:, overview_columns].head()

print("Initial Data Overview:")
print(overview)

## 4. Data Wrangling

In [None]:
# Keep only rows with a strictly positive budget and revenue (zeros presumably
# mark missing financials in this dataset — confirm). The explicit .copy()
# makes data_movie an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning on a slice of movies_df.
has_financials = (movies_df['budget'] > 0) & (movies_df['revenue'] > 0)
data_movie = movies_df[has_financials].copy()

# errors='coerce' turns unparseable dates into NaT, which yields a NaN year.
data_movie['release_date'] = pd.to_datetime(data_movie['release_date'], errors='coerce')
data_movie['release_year'] = data_movie['release_date'].dt.year
data_movie['profit'] = data_movie['revenue'] - data_movie['budget']

def extract_main_genre(genre_str):
    """Return the name of the first genre in a serialized genre list.

    Args:
        genre_str: String holding a JSON array of objects with a ``"name"``
            key, e.g. ``'[{"id": 28, "name": "Action"}]'``. Text that uses
            single quotes in place of double quotes is accepted as a fallback.

    Returns:
        The ``"name"`` value of the first entry, or ``None`` when the list is
        empty or the value cannot be interpreted (non-string input, malformed
        text, or an unexpected structure).
    """
    try:
        try:
            # Parse the text as-is first: blindly swapping quotes would break
            # valid JSON that contains an apostrophe inside a value.
            genres = json.loads(genre_str)
        except ValueError:
            # Fallback for single-quoted, Python-repr style input.
            genres = json.loads(genre_str.replace("'", '"'))
        if genres:
            return genres[0]['name']
    except (AttributeError, TypeError, ValueError, KeyError, IndexError):
        # Unparseable or unexpectedly shaped value: treat as "no genre".
        # Only data-related errors are swallowed; anything else propagates.
        pass
    return None

# Derive each movie's main genre from its serialized genre list.
main_genres = data_movie['genres'].apply(extract_main_genre)
data_movie['main_genre'] = main_genres

# Keep only the rows that have both a release year and a main genre.
required_columns = ['release_year', 'main_genre']
movies_df = data_movie.dropna(subset=required_columns)

## 5. Exploratory Data Analysis (EDA)

In [None]:
# Create the figure output folder. exist_ok=True replaces the
# check-then-create pattern (no race between the check and the mkdir) and
# matches how the processed-data folder is created later in this notebook.
os.makedirs("../visualizations", exist_ok=True)

# Mean profit per main genre, highest first, as a two-column frame for seaborn.
genre_profit = (
    movies_df.groupby('main_genre')['profit']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.barplot(data=genre_profit, x='main_genre', y='profit')
plt.xticks(rotation=45)
plt.title('Average Profit by Main Genre')
plt.tight_layout()
plt.savefig('../visualizations/eda_genre_profit.png')
plt.show()


In [None]:
# Budget vs. revenue scatter, one color per main genre.
fig = plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    x='budget',
    y='revenue',
    hue='main_genre',
    alpha=0.6,
    data=movies_df,
)
ax.set_title('Budget vs Revenue')
fig.tight_layout()
fig.savefig('../visualizations/eda_budget_revenue.png')
plt.show()

## 6. Modeling: Predict revenue from budget, vote_average, vote_count, runtime

In [None]:
features = ['budget', 'vote_average', 'vote_count', 'runtime']

# Drop rows with a missing predictor. The explicit .copy() gives data_model
# its own data, so adding the prediction column at the end of this cell does
# not raise SettingWithCopyWarning on a frame derived from movies_df.
data_model = movies_df.dropna(subset=features).copy()
X = data_model[features]
y = data_model['revenue']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))

# Save modeling results. Predictions cover every row, i.e. both the training
# and the held-out rows, so this column is not an out-of-sample estimate.
data_model['predicted_revenue'] = lr_model.predict(X)

## 7. Clustering

In [None]:
cluster_features = ['vote_average', 'profit', 'budget']
cluster_data = data_model[cluster_features]

# K-means groups rows by Euclidean distance, so standardize the features
# first. On raw values the dollar-valued profit and budget columns would dwarf
# vote_average (presumably a 0-10 rating — confirm) and it would have
# practically no influence on the clusters.
scaled_cluster_data = StandardScaler().fit_transform(cluster_data)

# n_init is pinned explicitly because its default differs between
# scikit-learn releases; this keeps the result reproducible across versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# assign() returns a new frame, so the label column is never written onto a
# frame that pandas may still treat as a slice of another one.
data_model = data_model.assign(cluster_label=kmeans.fit_predict(scaled_cluster_data))

In [None]:
# Write the cleaned data and the modeling output as CSV files and as one
# two-sheet Excel workbook for the dashboard.
output_path = "../data/processed"
os.makedirs(output_path, exist_ok=True)

csv_exports = (
    (data_model, "movie_modeling_output.csv"),
    (movies_df, "cleaned_movie_data.csv"),
)
for frame, csv_name in csv_exports:
    frame.to_csv(f"{output_path}/{csv_name}", index=False)

excel_sheets = (
    ('CleanedData', movies_df),
    ('ModelingOutput', data_model),
)
with pd.ExcelWriter(f"{output_path}/movie_dashboard_data.xlsx") as writer:
    for sheet, frame in excel_sheets:
        frame.to_excel(writer, sheet_name=sheet, index=False)

print("✅ All files saved successfully.")
