# TMDB Movie Dataset Analysis

This notebook contains the analysis of the TMDB 10,000 Movies dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

# Load the raw TMDB export and clean it in a single chain:
#   * release_date -> datetime; unparseable values become NaT (errors='coerce')
#   * overview     -> missing text replaced by an empty string
#   * fully duplicated rows removed (the surviving rows keep their original
#     index labels; the index is not reset)
df = (
    pd.read_csv("../data/TMDB_10000_Movies_Dataset.csv")
    .assign(
        release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'),
        overview=lambda frame: frame['overview'].fillna(""),
    )
    .drop_duplicates()
)

# Cluster movies on their popularity / rating signals.
# Rows with a missing value in any of the three columns are excluded from
# the feature matrix, so `features` may have fewer rows than `df`.
features = df[['popularity', 'vote_average', 'vote_count']].dropna()

# Standardize each column (StandardScaler) so that no single feature
# dominates the distance computation purely because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_features)

# Attach the labels by index rather than by position: assigning the bare
# array would raise a length-mismatch ValueError whenever dropna() removed
# rows. Rows that were excluded from `features` get NaN as their cluster.
df['cluster'] = pd.Series(cluster_labels, index=features.index)

# Linear regression: predict popularity from vote_average and vote_count.
# Drop incomplete rows across predictors AND target together; filtering only
# the predictors would let a NaN popularity through and make fit() fail.
regression_rows = df[['vote_average', 'vote_count', 'popularity']].dropna()
X = regression_rows[['vote_average', 'vote_count']]
y = regression_rows['popularity']

# 80/20 train/test split with a fixed seed so the reported MSE is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))

# Project the standardized features onto their first two principal
# components so the clusters can be drawn in 2-D.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

# `pca_result` has one row per row of `features`, which may be shorter than
# `df` because of the earlier dropna(). Align on `features.index` instead of
# assigning positionally (which would raise a length-mismatch ValueError);
# rows that were excluded from `features` get NaN coordinates.
df['pca1'] = pd.Series(pca_result[:, 0], index=features.index)
df['pca2'] = pd.Series(pca_result[:, 1], index=features.index)

# Scatter plot of the movies in PCA space, colored by KMeans cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='pca1',
    y='pca2',
    hue='cluster',
    palette='Set2',
    ax=ax,
)
ax.set_title("KMeans Clustering on TMDB Movies (PCA Reduced)")
plt.show()