In [None]:
!pip install pandas scikit-learn seaborn matplotlib


In [None]:
# Colab-only helper module; this import fails outside Google Colab.
from google.colab import files

# Ask the user to upload the dataset into the runtime. The load cell reads
# 'netflix_titles.csv' from the working directory, so that is the file to
# upload here.
# NOTE(review): presumably returns a mapping of uploaded file names to their
# contents — confirm against the Colab docs. `uploaded` is not referenced by
# any later cell shown in this notebook.
uploaded = files.upload()


In [None]:
import pandas as pd

# Load the dataset from the current working directory (relative path) into
# the DataFrame that every later cell works on.
df = pd.read_csv('netflix_titles.csv')

# Preview the first rows as a quick check that the file parsed as expected.
df.head()


In [None]:

# Prepare the movie-level frame used for clustering.
#
# The frame is built as a single chain on a fresh copy instead of mutating a
# slice with `inplace=True` / column assignment, which can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
columns_needed = ['type', 'title', 'listed_in', 'duration', 'rating']

movies = (
    df.loc[:, columns_needed]
    .dropna()                                      # drop rows missing any needed column
    .loc[lambda frame: frame['type'] == 'Movie']   # keep only rows typed 'Movie'
    .copy()
)

df = movies.assign(
    # Keep the first run of digits in `duration` as a float
    # (values are assumed to look like "90 min" — TODO confirm).
    # The regex is a raw string so `\d` is not an invalid string escape.
    # `astype(str)` makes the cell safe to re-run: on a second run the column
    # is already float (e.g. 90.0) and still parses back to the same number.
    duration=(
        movies['duration']
        .astype(str)
        .str.extract(r'(\d+)', expand=False)
        .astype(float)
    ),
    # The first comma-separated entry of `listed_in` is treated as the main genre.
    main_genre=movies['listed_in'].str.split(',').str[0].str.strip(),
)

df.head()


In [None]:
# Build the clustering feature matrix: one indicator column per main genre,
# followed by the numeric duration column. Rating is not part of the matrix.
genre_dummies = pd.get_dummies(df['main_genre'])

features = pd.concat(
    objs=[genre_dummies, df.loc[:, ['duration']]],
    axis='columns',
)

# NOTE(review): duration stays on its raw scale next to 0/1 genre indicators,
# so it will likely dominate distance-based clustering — confirm this is intended.
features.head()


In [None]:
from sklearn.cluster import KMeans

# Partition the movies into 5 clusters; the fixed seed keeps centroid
# initialisation repeatable between runs.
# NOTE(review): `n_init` is left at the library default, which differs between
# scikit-learn releases — pin it if labels must match across versions.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(features)

# `features` was built row-for-row from `df`, so the labels line up with its rows.
# `assign` returns a new frame instead of writing a column into a frame that
# was derived from a filtered slice, which can raise SettingWithCopyWarning.
df = df.assign(Cluster=kmeans.labels_)

df[['title', 'main_genre', 'duration', 'Cluster']].head()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of movie duration against main genre, coloured by cluster label.
fig, ax = plt.subplots(figsize=(12, 6))

# Marker appearance is kept separate from the data mapping for readability.
scatter_style = dict(palette='Set2', s=100, alpha=0.7)
sns.scatterplot(
    data=df,
    x='duration',
    y='main_genre',
    hue='Cluster',
    ax=ax,
    **scatter_style,
)

ax.set_title("Clustering of Netflix Movies by Genre and Duration", fontsize=14)
ax.set_xlabel("Duration (minutes)")
ax.set_ylabel("Main Genre")
ax.legend(title='Cluster')
ax.grid(True)
plt.show()


In [None]:
# Elbow method: fit K-Means for k = 1..10 and record each model's inertia
# (WCSS, the within-cluster sum of squares). The k at which the curve bends
# (the "elbow") is taken as the number of clusters to use.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

k_values = range(1, 11)

# Each candidate model lives only inside the comprehension, so the fitted
# 5-cluster `kmeans` from the clustering cell is not overwritten by the
# k=10 model, and no loop index leaks into the notebook namespace.
wcss = [
    KMeans(n_clusters=k, random_state=42).fit(features).inertia_
    for k in k_values
]

# Plot WCSS against k.
plt.figure(figsize=(8, 4))
plt.plot(k_values, wcss, marker='o', color='blue')
plt.title('Elbow Method - Optimal Number of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Within Cluster Sum of Squares)')
plt.grid(True)
plt.show()


In [None]:
# Per-cluster profile: number of movies, mean duration, and the most frequent
# main genre. Use it to characterise each cluster — for example, whether a
# cluster is mostly comedies under 90 minutes (actual labels depend on the fit).
def most_common_genre(genres):
    """Return the value that occurs most often in `genres`."""
    return genres.value_counts().index[0]


cluster_summary = df.groupby('Cluster').agg(**{
    'Movie Count': ('title', 'count'),
    'Avg Duration (min)': ('duration', 'mean'),
    'Most Common Genre': ('main_genre', most_common_genre),
})

cluster_summary


In [None]:
# First ten titles assigned to cluster 2, with their genre and duration.
# NOTE(review): the author describes this cluster as short Action films —
# verify against `cluster_summary`, since label numbers depend on the fit.
in_cluster_2 = df['Cluster'] == 2
df.loc[in_cluster_2, ['title', 'main_genre', 'duration']].head(10)
