<a href="https://colab.research.google.com/github/Thotasaiteja2004/python-for-data-science/blob/main/project%20code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Collaborative filtering demo: user-kNN, SVD, NMF
# Expects uploaded CSV at /content/Netflix Dataset.csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import NMF
from scipy.sparse.linalg import svds

# Location of the uploaded CSV inside the Colab runtime.
path = "/content/Netflix Dataset.csv"
df = pd.read_csv(path, engine="python")

# Lower-cased copies of the column names, index-aligned with df.columns,
# so that columns can be looked up case-insensitively.
lower_cols = [name.lower() for name in df.columns]
def find_col(possible):
    """Return the real column name in ``df`` for the first matching candidate.

    ``possible`` is an iterable of lower-case candidate names, tried in
    order against the module-level ``lower_cols`` list.  The original
    (case-preserved) name from ``df.columns`` is returned for the first
    candidate found; ``None`` is returned when no candidate matches.
    """
    hit = next((name for name in possible if name in lower_cols), None)
    if hit is None:
        return None
    return df.columns[lower_cols.index(hit)]

# Locate the interaction columns (case-insensitive match on common names).
user_col = find_col(['user','userid','user id','user_id'])
item_col = find_col(['item','movie','movieid','movie id','movie_id','title','movie_name'])
rating_col = find_col(['rating','ratings','score'])

if user_col is None or item_col is None or rating_col is None:
    # No interaction table: create simulated ratings using movies from the metadata file.
    print("No user-item-rating columns detected; creating simulated ratings dataset for demo.")
    movie_name_col = find_col(['title','movie_name','movie','name'])
    # Fall back to the first column when no title-like column exists.
    title_source = df.iloc[:, 0] if movie_name_col is None else df[movie_name_col]
    # Distinct titles (as strings), in sorted category order.
    movie_list = title_source.astype(str).astype('category').cat.categories.tolist()

    n_movies = len(movie_list)
    if n_movies == 0:
        raise ValueError("Cannot simulate ratings: the dataset contains no movie titles.")
    n_users = min(500, max(50, n_movies * 2))  # simulate up to 500 users
    rng = np.random.default_rng(42)

    # Each user rates between k_low and k_high - 1 distinct movies.
    # rng.integers requires low < high, so the usual lower bound of 10 is
    # capped at the catalogue size; for catalogues with >= 10 titles the
    # bounds (and therefore the random stream) are exactly as before.
    k_high = min(60, n_movies + 1)
    k_low = min(10, n_movies)

    rows = []
    for u in range(n_users):
        k = rng.integers(k_low, k_high)
        mids = rng.choice(n_movies, size=k, replace=False)
        for m in mids:
            # One draw per rating keeps the generator call order stable.
            rating = rng.integers(1, 6)  # 1..5
            rows.append({"userId": f"user_{u}", "movieId": movie_list[m], "rating": float(rating)})
    df_ratings = pd.DataFrame(rows)
else:
    # Real interaction table: normalise the column names and drop incomplete
    # rows, since a row missing its user, item or rating cannot be placed in
    # the rating matrix.
    df_ratings = (
        df[[user_col, item_col, rating_col]]
        .rename(columns={user_col: 'userId', item_col: 'movieId', rating_col: 'rating'})
        .dropna(subset=['userId', 'movieId', 'rating'])
    )

# Give every distinct user / movie a dense 0-based index (order of first
# appearance) so ratings can be addressed as matrix cells.
unique_users = df_ratings['userId'].unique()
unique_movies = df_ratings['movieId'].unique()
user_map = dict(zip(unique_users, range(len(unique_users))))
movie_map = dict(zip(unique_movies, range(len(unique_movies))))
df_ratings['uid'] = df_ratings['userId'].map(user_map)
df_ratings['mid'] = df_ratings['movieId'].map(movie_map)

# Matrix dimensions, taken from the number of distinct indices in use.
n_users = df_ratings['uid'].nunique()
n_movies = df_ratings['mid'].nunique()
print(f"Users: {n_users}, Movies: {n_movies}, Ratings: {len(df_ratings)}")

# Hold out 20% of the individual ratings for evaluation (fixed seed).
train_df, test_df = train_test_split(
    df_ratings,
    test_size=0.2,
    random_state=42,
)

# build user-item rating matrices (NaN for missing)
def build_matrix(ratings_df):
    """Return an (n_users, n_movies) float matrix of ratings, NaN where unrated.

    ``ratings_df`` must provide ``uid``, ``mid`` and ``rating`` columns.
    The matrix size comes from the module-level ``n_users`` / ``n_movies``.
    Rows are written in frame order, so a later duplicate (uid, mid) pair
    overwrites an earlier one.
    """
    mat = np.full((n_users, n_movies), np.nan)
    triples = zip(ratings_df['uid'], ratings_df['mid'], ratings_df['rating'])
    for user_idx, movie_idx, value in triples:
        mat[int(user_idx), int(movie_idx)] = value
    return mat

# Dense rating matrices for the two splits (NaN marks an unobserved cell).
R_train, R_test = map(build_matrix, (train_df, test_df))

# Mean of all observed training ratings.
global_mean = np.nanmean(R_train)
def rmse(preds, truths):
    """Root-mean-squared error over the cells where ``truths`` is not NaN.

    ``preds`` and ``truths`` are arrays of the same shape.  Returns
    ``np.nan`` when ``truths`` contains no observed (non-NaN) entry.
    """
    observed = ~np.isnan(truths)
    if not observed.any():
        return np.nan
    mse = mean_squared_error(truths[observed], preds[observed])
    return np.sqrt(mse)

# User-kNN (cosine over mean-centered vectors)
def user_knn_predict(R_train, k=20):
    """Predict every user-item rating with mean-centered user-based kNN.

    Parameters
    ----------
    R_train : 2-D array (users x items) of ratings, NaN where unobserved.
    k : number of most-similar users consulted for each target user.

    Returns
    -------
    Array of the same shape as ``R_train`` holding a prediction for every
    cell.  A prediction is the target user's mean rating plus the
    similarity-weighted average of the neighbours' mean-centered ratings
    for that item; items none of the neighbours rated get the user's mean.

    Users with no observed rating at all have no mean of their own; they
    fall back to the mean of all observed ratings (0.0 if the matrix has
    no observed rating), so the result never contains NaN.
    """
    R_train = np.asarray(R_train, dtype=float)
    observed = ~np.isnan(R_train)

    # Per-user mean computed from sums/counts so that empty rows neither
    # emit a "mean of empty slice" warning nor produce NaN.
    counts = observed.sum(axis=1)
    sums = np.where(observed, R_train, 0.0).sum(axis=1)
    n_observed = counts.sum()
    fallback_mean = sums.sum() / n_observed if n_observed else 0.0
    user_means = np.where(counts > 0, sums / np.maximum(counts, 1), fallback_mean)

    # Mean-centered ratings, zero where unobserved; rows are then scaled to
    # unit length so the dot product below is cosine similarity.
    R_centered = np.where(observed, R_train - user_means[:, None], 0.0)
    norms = np.linalg.norm(R_centered, axis=1)
    norms[norms == 0] = 1e-9  # avoid division by zero for flat/empty rows
    R_normed = R_centered / norms[:, None]
    sim = R_normed.dot(R_normed.T)
    np.fill_diagonal(sim, 0)  # a user must not act as its own neighbour

    preds = np.zeros_like(R_train)
    for u in range(R_train.shape[0]):
        topk = np.argsort(sim[u])[-k:][::-1]  # k most similar users, best first
        sims = sim[u, topk]
        # R_centered already holds (rating - neighbour mean) for observed
        # cells and 0 elsewhere, so unobserved cells add nothing to the sums.
        numer = sims @ R_centered[topk]
        denom = np.abs(sims) @ observed[topk].astype(float) + 1e-9
        # Items no neighbour rated have numer == 0, i.e. just the user mean.
        preds[u] = user_means[u] + numer / denom
    return preds

# Fit-and-score step for the user-kNN baseline.
print("Computing user-kNN predictions (k=30)...")
user_knn_preds = user_knn_predict(R_train, 30)
knn_rmse = rmse(user_knn_preds, R_test)
print("User-kNN RMSE:", knn_rmse)

# SVD (matrix factorization): impute each missing cell with the user's mean
# rating, subtract that mean, then take a truncated SVD of the centered matrix.
user_means = np.nanmean(R_train, axis=1)
# A user with no training ratings has no mean (NaN); use the global training
# mean instead so no NaN reaches svds.
user_means = np.where(np.isnan(user_means), global_mean, user_means)
filled = np.where(np.isnan(R_train), user_means[:, None], R_train)
M = filled - user_means[:, None]

# Latent dims (reduce if slow). svds needs k < min(M.shape), so the usual 20
# is capped for small matrices.
k = max(1, min(20, min(M.shape) - 1))
print("Running SVD (k={}) ...".format(k))
u, s, vt = svds(M, k=k)
# Reverse the factor order returned by svds (the original code's convention,
# presumably to get largest-first); the reconstruction does not depend on it.
u = u[:, ::-1]; s = s[::-1]; vt = vt[::-1, :]
S = np.diag(s)
# Rank-k reconstruction, with the per-user mean added back.
pred_matrix_svd = (u.dot(S)).dot(vt) + user_means[:, None]
print("SVD RMSE:", rmse(pred_matrix_svd, R_test))

# NMF (non-negative factorization as concept-decomposition proxy)
# Unobserved cells are imputed with the global training mean so the
# factorization receives a dense matrix.
nmf_fill = R_train.copy()
nmf_fill[np.isnan(nmf_fill)] = global_mean
nmf_model = NMF(
    n_components=20,
    init='nndsvda',
    random_state=42,
    max_iter=200,
)
W = nmf_model.fit_transform(nmf_fill)  # user factors
H = nmf_model.components_              # item factors
pred_matrix_nmf = W @ H
nmf_rmse = rmse(pred_matrix_nmf, R_test)
print("NMF RMSE:", nmf_rmse)

# Save predictions (optional): one .npy file per model.
for out_path, matrix in (
    ("/content/pred_user_knn.npy", user_knn_preds),
    ("/content/pred_svd.npy", pred_matrix_svd),
    ("/content/pred_nmf.npy", pred_matrix_nmf),
):
    np.save(out_path, matrix)
print("Saved prediction matrices to /content/")

No user-item-rating columns detected; creating simulated ratings dataset for demo.
Users: 500, Movies: 6884, Ratings: 16731
Computing user-kNN predictions (k=30)...
User-kNN RMSE: 1.5063342739996335
Running SVD (k=20) ...
SVD RMSE: 1.4423785105719886
NMF RMSE: 1.40512354982538
Saved prediction matrices to /content/
