# 🎬 Comprehensive Movie Dataset EDA & Clustering

In [None]:
import pandas as pd

# Read the movie table from a CSV file (path relative to the working
# directory) and preview its first rows.
DATA_PATH = "movie_cluster_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()


## 1. 📊 Single-Variable Analysis

In [None]:
import plotly.express as px

# Numeric columns inspected one at a time in this cell.
numeric_cols = ("popularity", "runtime", "vote_average")

# One 20-bin histogram per numeric column.
for name in numeric_cols:
    px.histogram(df, x=name, nbins=20, title=f"Distribution of {name.title()}").show()

# One boxplot per numeric column, grouped by cluster, with every point drawn.
# NOTE(review): this needs a "cluster" column in df at this point —
# presumably it is already present in the CSV; confirm.
for name in numeric_cols:
    px.box(df, x="cluster", y=name, points="all", title=f"{name.title()} by Cluster").show()

# Mean of every "is_*" column, sorted ascending and shown as a bar chart.
# NOTE(review): presumably these are 0/1 genre flags, which would make the
# mean the share of rows carrying the flag — confirm.
genre_cols = [name for name in df.columns if name.startswith("is_")]
genre_mean = df[genre_cols].mean().sort_values()
fig = px.bar(
    x=genre_mean.index,
    y=genre_mean.values,
    title="Overall Genre Distribution",
)
fig.update_layout(xaxis_title="Genre", yaxis_title="Proportion")
fig.show()


## 2. 🔁 Bivariate Analysis

In [None]:
import numpy as np
import plotly.figure_factory as ff

# Numeric columns compared pairwise in this cell.
pair_cols = ["popularity", "runtime", "vote_average"]

# Pairwise scatter matrix of the numeric columns, colored by cluster.
fig = px.scatter_matrix(
    df,
    dimensions=pair_cols,
    color="cluster",
    title="Scatter Matrix of Key Features",
)
fig.show()

# Annotated heatmap of the correlation matrix of the same columns;
# cell values are rounded to two decimals.
corr = df[pair_cols].corr()
heatmap_kwargs = dict(
    z=np.round(corr.to_numpy(), 2),
    x=corr.columns.tolist(),
    y=corr.index.tolist(),
    colorscale="Viridis",
    showscale=True,
)
fig = ff.create_annotated_heatmap(**heatmap_kwargs)
fig.update_layout(title="Correlation Heatmap")
fig.show()

# Popularity against vote average, colored by cluster, with the movie
# title available on hover.
fig = px.scatter(
    df,
    x="popularity",
    y="vote_average",
    color="cluster",
    hover_data=["title"],
    title="Popularity vs. Vote Average by Cluster",
)
fig.show()


## 3. 🌐 Embedding and Clustering

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

import inspect

# Feature selection and scaling
# Model inputs: the three numeric columns plus every "is_*" column,
# standardized with StandardScaler before projection.
features = ["popularity", "runtime", "vote_average"] + [col for col in df.columns if col.startswith("is_")]
X = df[features]
# NOTE(review): PCA and t-SNE reject NaN input; presumably these columns have
# no missing values in the CSV — confirm.
X_scaled = StandardScaler().fit_transform(X)

# PCA
# Two-component projection, stored as pca_x / pca_y on df.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_scaled)
df["pca_x"], df["pca_y"] = pca_result[:, 0], pca_result[:, 1]

# t-SNE
# scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and 1.7 removed
# `n_iter`, so passing `n_iter` raises TypeError on current releases. Pick
# whichever keyword the installed version accepts so the cell runs on both.
tsne_iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"
tsne = TSNE(
    n_components=2,
    perplexity=30,
    learning_rate="auto",
    init="pca",
    random_state=42,
    **{tsne_iter_kwarg: 500},
)
tsne_result = tsne.fit_transform(X_scaled)
df["tsne_x"], df["tsne_y"] = tsne_result[:, 0], tsne_result[:, 1]

# Clustering
# KMeans is fitted on the 2-D t-SNE coordinates (not on X_scaled) and its
# labels overwrite/create df["cluster"].
# n_init is pinned because its default changed from 10 to "auto" in
# scikit-learn 1.4; leaving it implicit changes results between versions and
# emits a FutureWarning on 1.2/1.3.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
df["cluster"] = kmeans.fit_predict(df[["tsne_x", "tsne_y"]])


In [None]:
# The cluster labels produced by KMeans are integers, which Plotly Express
# maps to a continuous color scale with a colorbar. Plot from a copy whose
# labels are strings so each cluster gets its own discrete color and legend
# entry; `df` itself is left unchanged.
plot_df = df.assign(cluster=df["cluster"].astype(str))
# Keep the legend in sorted label order instead of order of first appearance.
cluster_order = {"cluster": [str(label) for label in sorted(df["cluster"].unique())]}

# Visualize PCA
fig = px.scatter(
    plot_df,
    x="pca_x",
    y="pca_y",
    color="cluster",
    category_orders=cluster_order,
    hover_data=["title"],
    title="PCA Projection",
)
fig.show()

# Visualize t-SNE
fig = px.scatter(
    plot_df,
    x="tsne_x",
    y="tsne_y",
    color="cluster",
    category_orders=cluster_order,
    hover_data=["title"],
    title="t-SNE Projection",
)
fig.show()


## 4. 🧠 Cluster Interpretation

In [None]:
# Print a short text profile of every cluster: size, mean rating, runtime
# and popularity, the three "is_*" columns with the highest mean, and the
# first three titles.
# The "is_*" column list does not change inside the loop, so build it once.
genre_flags = [name for name in df.columns if name.startswith("is_")]

for cluster_id in sorted(df["cluster"].unique()):
    members = df.loc[df["cluster"] == cluster_id]

    avg_rating = members["vote_average"].mean()
    avg_runtime = members["runtime"].mean()
    avg_popularity = members["popularity"].mean()
    # Per-column means within the cluster, highest first.
    genre_share = members[genre_flags].mean().sort_values(ascending=False)
    sample_titles = members["title"].head(3).to_string(index=False)

    print(f"\n=== Cluster {cluster_id} ===")
    print(f"Size: {len(members)}")
    print(f"Avg Rating: {avg_rating:.2f}")
    print(f"Avg Runtime: {avg_runtime:.1f} mins")
    print(f"Avg Popularity: {avg_popularity:.1f}")
    print("Top Genres:")
    print(genre_share.head(3))
    print("Sample Titles:")
    print(sample_titles)


## 5. 🎞️ MP4 Animation Export (Planned)

In [None]:
# Placeholder cell: no animation is produced here, only this notice is printed.
planned_notice = "A short and long MP4 animation will be generated to visualize cluster emergence over time. Coming soon!"
print(planned_notice)
