<a href="https://colab.research.google.com/github/Tanya-Verma/Apline_dashboard/blob/main/K_means_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### K-means Clustering  #####

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Step 1: Build a small synthetic stand-in for the Netflix data.
# (The real 'netflix_titles.csv' is on Kaggle: https://www.kaggle.com/datasets/shivamb/netflix-shows)
shows = [
    'Stranger Things',
    'Squid Game',
    'The Crown',
    'Bridgerton',
    'Tiger King',
    'The Witcher',
    'Ozark',
    'Narcos',
    'Dark',
    'Money Heist',
]

# Random integer ratings in 1..5, one row per user and one column per show.
# The global seed is fixed so every run produces the same matrix.
np.random.seed(42)
user_ratings = np.random.randint(low=1, high=6, size=(5, len(shows)))
ratings_df = pd.DataFrame(
    data=user_ratings,
    index=[f'User{n}' for n in range(1, 6)],
    columns=shows,
)
print("User Ratings Matrix:", ratings_df, "", sep="\n")

# Step 2: Standardize each show's column (zero mean, unit variance) so that
# no single show dominates the distance computation in K-Means.
scaler = StandardScaler().fit(ratings_df)
ratings_scaled = scaler.transform(ratings_df)

# Step 3: Fit K-Means with K=2 (e.g. 'thriller fans' vs 'drama lovers') and
# read back the cluster label assigned to each user row.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(ratings_scaled)
clusters = kmeans.labels_
user_to_cluster = {user: label for user, label in zip(ratings_df.index, clusters)}
print("User Clusters:", user_to_cluster, end="\n\n")

# Step 4: Recommendation function (for a user, find cluster mates & top avg-rated shows)
def recommend_for_user(user_name, ratings_df, clusters, kmeans, top_n=3):
    """Print and return the shows rated highest by a user's cluster mates.

    Args:
        user_name: Row label of the target user in ``ratings_df``.
        ratings_df: DataFrame of ratings, one row per user, one column per show.
        clusters: Cluster labels aligned positionally with the rows of
            ``ratings_df``.
        kmeans: Accepted for call compatibility; not used by this function.
        top_n: Number of shows to return.

    Returns:
        A Series holding the ``top_n`` highest mean ratings among the other
        users in the same cluster (indexed by show), or the string
        ``"No similar users found!"`` when the user is alone in the cluster.
    """
    target_cluster = clusters[ratings_df.index.get_loc(user_name)]

    # Collect every other user that landed in the same cluster.
    peers = []
    for candidate, label in zip(ratings_df.index, clusters):
        if label != target_cluster or candidate == user_name:
            continue
        peers.append(candidate)

    if not peers:
        return "No similar users found!"

    # Rank shows by the peers' mean rating, best first, and keep the top few.
    mean_by_show = ratings_df.loc[peers].mean()
    ranked = mean_by_show.sort_values(ascending=False)
    best = ranked.head(top_n)

    print(f"Recommendations for {user_name} (Cluster {target_cluster}):")
    for title, avg in best.items():
        print(f"- {title} (avg rating: {avg:.1f})")
    return best

# Demo: Recommend for User1
recommend_for_user('User1', ratings_df, clusters, kmeans)

# Visualize clusters (bonus for your Short!)
plt.figure(figsize=(10, 4))

# Left panel: User1's standardized rating for every show.
plt.subplot(1, 2, 1)
show_positions = range(ratings_scaled.shape[1])
plt.scatter(show_positions, ratings_scaled[0], label='User1', s=100)
# Name the shows on the x-axis instead of showing bare column positions.
plt.xticks(show_positions, ratings_df.columns, rotation=45, ha='right')
plt.title('User1 Ratings (Scaled)')
plt.ylabel('Scaled Rating')
plt.legend()  # Without this call the 'User1' label is never displayed.

# Right panel: every user's standardized rating for the first show, coloured
# by cluster. Each cluster gets its own scatter call so the legend has one
# correctly coloured entry per cluster; a single multi-coloured scatter only
# yields one legend handle.
plt.subplot(1, 2, 2)
user_positions = list(range(len(ratings_scaled)))
for cluster_id, color in ((0, 'red'), (1, 'blue')):
    members = [pos for pos in user_positions if clusters[pos] == cluster_id]
    if not members:
        continue  # Skip empty clusters so the legend has no phantom entry.
    plt.scatter(
        members,
        ratings_scaled[members, 0],
        c=color,
        s=100,
        label=f'Cluster {cluster_id}',
    )
plt.xticks(user_positions, ratings_df.index)
plt.title('Users Clustered by Ratings')
plt.ylabel('Rating for Show 1')
plt.legend()
plt.tight_layout()
plt.show()
