<a href="https://colab.research.google.com/github/SaiShreeja07/face-pay/blob/main/movie_pbl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Full end-to-end Movie Recommendation System (Item-based CF) - FIX APPLIED

import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
import warnings
from io import StringIO
from difflib import get_close_matches
import matplotlib.pyplot as plt
import seaborn as sns

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

# ----------------------------------------------------
# 1) Locate the dataset directory
# The default is the Kaggle notebook mount point. Outside Kaggle (for example
# in Colab) the files live wherever `kagglehub.dataset_download()` put them,
# so the directory can be overridden with the MOVIE_DATA_PATH environment
# variable instead of editing this cell.
# ----------------------------------------------------
DEFAULT_KAGGLE_DATA_PATH = "/kaggle/input/movie-recommendation-system"
# `or` (rather than a .get default) also covers an empty MOVIE_DATA_PATH.
KAGGLE_DATA_PATH = os.environ.get("MOVIE_DATA_PATH") or DEFAULT_KAGGLE_DATA_PATH
movies_path = os.path.join(KAGGLE_DATA_PATH, "movies.csv")
ratings_path = os.path.join(KAGGLE_DATA_PATH, "ratings.csv")

print("--- Data Loading from Confirmed Kaggle Path ---")
print("movies:", movies_path)
print("ratings:", ratings_path)

# Fail early with an actionable message rather than deep inside pandas.
if not os.path.exists(movies_path) or not os.path.exists(ratings_path):
    raise FileNotFoundError(
        "Could not find movies.csv or ratings.csv at the expected path: "
        f"{KAGGLE_DATA_PATH}. Ensure the initial `kagglehub.dataset_download()` "
        "step was run successfully, or set the MOVIE_DATA_PATH environment "
        "variable to the directory that contains both files."
    )
# ----------------------------------------------------


# ----------------------------
# 2) Load CSVs (Using standard read for performance on clean Kaggle files)
# ----------------------------
def safe_read_csv(path):
    """Read a CSV leniently: skip malformed rows and retry with latin-1.

    Used only as a fallback when the plain ``pd.read_csv`` call fails.
    UTF-8 is tried first; latin-1 is tried only when decoding fails, so
    unrelated errors (missing file, empty file) surface unchanged.
    """
    try:
        return pd.read_csv(path, encoding='utf-8', on_bad_lines='skip')
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='latin-1', on_bad_lines='skip')


try:
    movies = pd.read_csv(movies_path)
    ratings = pd.read_csv(ratings_path)
except (UnicodeDecodeError, pd.errors.ParserError) as e:
    # Only decoding/parsing problems are recoverable by the lenient reader;
    # anything else would fail again in the fallback, so let it propagate.
    print(f"Error reading CSVs: {e}")
    movies = safe_read_csv(movies_path)
    ratings = safe_read_csv(ratings_path)
else:
    print("Loaded dataframes successfully.")


# Quick look at the first rows of each table.
print("\nMovies sample:")
print(movies.head(3))
print("\nRatings sample:")
print(ratings.head(3))

# ----------------------------
# 3) Use only 2000 ratings (subsample)
# ----------------------------
# Cap the ratings table at its first N rows (a prefix, not a random sample).
N = 2000
if len(ratings) <= N:
    print(f"\nRatings count <= {N}, using full ratings ({len(ratings)})")
else:
    ratings = ratings.head(N).reset_index(drop=True)
    print(f"\nSubsampled ratings to {N} rows.")

# ----------------------------
# 4) Normalise column types
# ----------------------------
# Identifier columns become plain ints.
for id_col in ('userId', 'movieId'):
    ratings[id_col] = ratings[id_col].astype(int)

# Unparseable ratings are coerced to NaN and the affected rows are dropped.
ratings['rating'] = pd.to_numeric(ratings['rating'], errors='coerce')
ratings = ratings.dropna(subset=['rating'])

movies['movieId'] = movies['movieId'].astype(int)
# A missing genre string is replaced by '' so later string matching is safe.
movies['genres'] = movies['genres'].fillna('')

# Observed rating bounds, reported later next to the error metrics.
MIN_RATING, MAX_RATING = ratings['rating'].min(), ratings['rating'].max()

print("\nAfter normalization:")
print("Ratings shape:", ratings.shape)
print("Movies shape:", movies.shape)
print(f"**DATASET RATING LIMITS: {MIN_RATING} to {MAX_RATING}**")

# ----------------------------
# 5) Merge and basic preprocessing
# ----------------------------
# Attach title/genres to every rating, discard ratings whose movieId has no
# title in `movies`, and keep only the columns used downstream.
data = (
    ratings.merge(movies, on='movieId', how='left')
    .dropna(subset=['title'])
    .reset_index(drop=True)
)[['userId', 'movieId', 'title', 'genres', 'rating']]

# Per-title rating count and mean, joined back onto `movies`; titles with no
# ratings in `data` get count 0 and mean 0.
movie_stats = (
    data.groupby('title')['rating']
    .agg(['count', 'mean'])
    .reset_index()
)
movies = movies.merge(movie_stats, on='title', how='left')
movies = movies.fillna({'count': 0, 'mean': 0})

# Summary of the merged table.
for summary_label, summary_value in (
    ("\nMerged data shape:", data.shape),
    ("Unique users:", data['userId'].nunique()),
    ("Unique movies:", data['movieId'].nunique()),
    ("Ratings count:", len(data)),
):
    print(summary_label, summary_value)

# ----------------------------
# --- 5.5) Visualizations ---
# ----------------------------
print("\nGenerating visualizations...")
sns.set(style="whitegrid")

# Figure 1: how many ratings fall on each rating value.
fig_ratings, ax_ratings = plt.subplots(figsize=(10, 6))
sns.countplot(x='rating', data=data, palette="viridis", ax=ax_ratings)
ax_ratings.set_title("Distribution of Movie Ratings")
ax_ratings.set_xlabel("Rating")
ax_ratings.set_ylabel("Number of Ratings")
plt.show()

# Figure 2: the 20 titles with the largest rating count.
top_movies = movies.sort_values('count', ascending=False).head(20)
fig_top, ax_top = plt.subplots(figsize=(12, 8))
sns.barplot(y='title', x='count', data=top_movies, palette="rocket", ax=ax_top)
ax_top.set_title("Top 20 Most-Rated Movies")
ax_top.set_xlabel("Number of Ratings")
ax_top.set_ylabel("Movie Title")
fig_top.tight_layout()
plt.show()


# ----------------------------
# 6) Create user-item matrix
# ----------------------------
# Rows are users, columns are titles; a cell holds the user's rating for
# that title (NaN when unrated). pivot_table averages repeated pairs.
user_item_matrix = data.pivot_table(values='rating', index='userId', columns='title')
# Cosine similarity cannot take NaN, so unrated cells become 0.
movie_ratings_filled = user_item_matrix.fillna(0)
print("\nUser-item matrix shape (users x movies):", user_item_matrix.shape)

# ----------------------------
# 7) Item-item similarity
# ----------------------------
print("\nComputing item-item (movie) cosine similarity matrix...")
# Transpose so each row is a movie's rating vector across users.
item_similarity = cosine_similarity(movie_ratings_filled.transpose())
# Label both axes with titles so look-ups can be done by name.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=movie_ratings_filled.columns,
    columns=movie_ratings_filled.columns,
)
print("Similarity matrix done. Shape:", item_similarity_df.shape)

# ----------------------------
# 8) Recommendation helpers
# ----------------------------
def get_similar_movies(movie_title, top_n=10):
    """Return up to ``top_n`` titles most similar to ``movie_title``.

    Args:
        movie_title: Title to look up in ``item_similarity_df``.
        top_n: Maximum number of neighbours to return; values <= 0 give [].

    Returns:
        A list of titles ordered by descending cosine similarity, never
        containing ``movie_title`` itself. Empty if the title is unknown.
    """
    if movie_title not in item_similarity_df.index:
        return []
    # Remove the query movie by label. Skipping "the first row after sorting"
    # is wrong when another movie ties with it at similarity 1.0, which
    # happens whenever two movies have identical rating vectors.
    sims = item_similarity_df[movie_title].drop(labels=movie_title)
    # A stable sort keeps tied titles in a deterministic (column) order.
    ranked = sims.sort_values(ascending=False, kind="mergesort")
    return ranked.head(max(top_n, 0)).index.tolist()

def predict_rating_item_based(user_id, movie_title):
    """Predict ``user_id``'s rating for ``movie_title``.

    The prediction is the similarity-weighted average of the user's ratings
    for *other* movies, using ``item_similarity_df`` as weights.

    Args:
        user_id: Row label in ``user_item_matrix``.
        movie_title: Column label in ``item_similarity_df``.

    Returns:
        The predicted rating, or ``np.nan`` when the user or movie is
        unknown, the user has no other rated movies, or none of them has a
        positive similarity to the target.
    """
    if user_id not in user_item_matrix.index or movie_title not in item_similarity_df.columns:
        return np.nan
    rated = user_item_matrix.loc[user_id].dropna()
    # Never use the target's own rating as evidence: its self-similarity is
    # 1.0, so it would leak the very value being predicted.
    rated = rated.drop(labels=movie_title, errors="ignore")
    if rated.empty:
        return np.nan
    sims = item_similarity_df.loc[movie_title, rated.index]
    denom = sims.sum()
    if not denom > 0:
        # No rated movie is similar to the target; nothing to base a guess on.
        return np.nan
    return (sims * rated).sum() / denom

def recommend_movies_for_user(user_id, top_n=10):
    """Rank the titles ``user_id`` has not rated and return the best ``top_n``.

    Every unrated title is scored with ``predict_rating_item_based``; titles
    for which no prediction is available are scored -1 so they sort last.
    Returns a list of titles, or [] when the user is not in
    ``user_item_matrix``.
    """
    if user_id not in user_item_matrix.index:
        return []

    row = user_item_matrix.loc[user_id]
    candidates = row[row.isna()].index.tolist()

    scored = []
    for candidate in candidates:
        estimate = predict_rating_item_based(user_id, candidate)
        scored.append((candidate, -1 if np.isnan(estimate) else estimate))

    # sorted() is stable, so equally scored titles keep their column order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked[:top_n]]

def get_top_rated_movies(n=5, min_ratings=10):
    """Return the ``n`` movies with the highest mean rating.

    Only rows of ``movies`` whose ``count`` is at least ``min_ratings`` are
    considered. The result is a DataFrame with columns title, mean, count.
    """
    has_enough_ratings = movies['count'] >= min_ratings
    ranked = movies.loc[has_enough_ratings].sort_values('mean', ascending=False)
    return ranked.head(n)[['title', 'mean', 'count']]

def get_top_movies_by_genre(genre, n=5, min_ratings=10):
    """Return the ``n`` best-rated movies whose genres contain ``genre``.

    Args:
        genre: Text to look for in the ``genres`` column, matched
            case-insensitively as a plain substring.
        n: Maximum number of rows to return.
        min_ratings: Minimum ``count`` a movie needs to qualify.

    Returns:
        A DataFrame with columns title, genres, mean, count, sorted by
        descending mean rating.
    """
    # regex=False: the text usually comes straight from user input, and a
    # stray '(' or '+' must not be parsed as (possibly invalid) regex syntax.
    genre_mask = movies['genres'].str.contains(genre, case=False, na=False, regex=False)
    genre_movies = movies[genre_mask]
    qualified_movies = genre_movies[genre_movies['count'] >= min_ratings]
    top_movies = qualified_movies.sort_values('mean', ascending=False).head(n)
    return top_movies[['title', 'genres', 'mean', 'count']]


# Smoke-test the helpers on the first movie column and the first user row,
# provided the matrices are not empty.
if len(movie_ratings_filled.columns) > 0:
    sample_movie = movie_ratings_filled.columns[0]
    print(f"\nExample: top-5 similar to '{sample_movie}':")
    similar_example = get_similar_movies(sample_movie, top_n=5)
    print(similar_example)

if len(user_item_matrix.index) > 0:
    sample_user = user_item_matrix.index[0]
    print(f"\nExample: top-5 recommendations for user {sample_user}:")
    recommendation_example = recommend_movies_for_user(sample_user, top_n=5)
    print(recommendation_example)

# ----------------------------
# 9) Train-test split and rating-prediction evaluation (RMSE, MAE)
# ----------------------------
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Fit the evaluation model on the training rows ONLY. Scoring the test rows
# with matrices built from all of `data` would let each prediction see the
# held-out rating it is supposed to estimate.
train_user_item = train_df.pivot_table(index='userId', columns='title', values='rating')
train_similarity = pd.DataFrame(
    cosine_similarity(train_user_item.fillna(0).T),
    index=train_user_item.columns,
    columns=train_user_item.columns,
)


def _predict_from_train(user_id, movie_title):
    """Similarity-weighted rating estimate using only training data.

    Returns np.nan for users/titles absent from the training split, or when
    none of the user's training ratings is similar to the target title.
    """
    if user_id not in train_user_item.index or movie_title not in train_similarity.columns:
        return np.nan
    rated = train_user_item.loc[user_id].dropna()
    # The same (user, title) pair can sit in both splits when a title is
    # shared by several movieIds; never use the target as its own neighbour.
    rated = rated.drop(labels=movie_title, errors="ignore")
    if rated.empty:
        return np.nan
    sims = train_similarity.loc[movie_title, rated.index]
    denom = sims.sum()
    if not denom > 0:
        return np.nan
    return (sims * rated).sum() / denom


preds = []
actuals = []
sample_predictions = []
N_SAMPLE = 5

for row in test_df.itertuples(index=False):
    pred = _predict_from_train(row.userId, row.title)
    if np.isnan(pred):
        # Cold start: nothing to compare, so the row is left out of the metrics.
        continue
    preds.append(pred)
    actuals.append(row.rating)

    # Keep the first few scored rows for a side-by-side display below.
    if len(sample_predictions) < N_SAMPLE:
        sample_predictions.append({
            'User ID': row.userId,
            'Movie Title': row.title,
            'Actual Rating (Standard)': row.rating,
            'Predicted Rating (Model)': pred
        })

rmse = None
mae = None
if not preds:
    print("\nWarning: No predictions could be made on the test set (cold-starts). RMSE/MAE cannot be computed.")
else:
    rmse = math.sqrt(mean_squared_error(actuals, preds))
    mae = mean_absolute_error(actuals, preds)

    print(f"\n{'='*70}")
    print("🎬 Rating Prediction Evaluation (Item-based Collaborative Filtering)")
    print(f"Predictions made: {len(preds)} / {len(test_df)} test rows")
    print(f"{'='*70}")

    print("## 📊 Error Metrics and Scale Context")
    print(f"* **Rating Scale Limits:** {MIN_RATING} to {MAX_RATING} (a range of {MAX_RATING - MIN_RATING} points)")
    print(f"* **RMSE** (Root Mean Squared Error): **{rmse:.4f}** (Less than 1.0 is considered good)")
    print(f"* **MAE** (Mean Absolute Error): **{mae:.4f}** (Indicates an average error of ~{mae:.2f} stars)")
    print(f"\n{'='*70}")

    print("## ✨ Sample Comparison: Actual vs. Predicted Ratings")
    print("The RMSE and MAE are calculated by comparing these values:")
    sample_df = pd.DataFrame(sample_predictions)
    # Compute the error from the unrounded prediction, then round for display.
    sample_df['Error (Abs Diff)'] = (sample_df['Actual Rating (Standard)'] - sample_df['Predicted Rating (Model)']).abs().round(4)
    sample_df['Predicted Rating (Model)'] = sample_df['Predicted Rating (Model)'].round(4)
    print(sample_df.to_string(index=False))
    print(f"\n{'='*70}")

# ----------------------------
# 10) Precision@K and Recall@K evaluation for top-K recommendations
# ----------------------------
# The two metric helpers are defined first; the evaluation loop follows them.

def precision_at_k(actual_list, predicted_list, k=5):
    """Fraction of the first ``k`` predictions that are relevant.

    Args:
        actual_list: Relevant items (duplicates are ignored).
        predicted_list: Ranked predictions, best first.
        k: Cut-off rank.

    Returns:
        hits / number of predictions considered, where at most ``k``
        predictions are considered. 0.0 when there are no predictions or
        ``k`` is not positive.
    """
    # k <= 0 would otherwise divide by zero (k == 0) or, for negative k,
    # silently slice from the end of the list.
    if not predicted_list or k <= 0:
        return 0.0
    pred_k = predicted_list[:k]
    actual_set = set(actual_list)
    return len(set(pred_k) & actual_set) / len(pred_k)

def recall_at_k(actual_list, predicted_list, k=5):
    """Share of the distinct relevant items found in the first ``k`` predictions.

    Returns 0.0 when ``actual_list`` is empty.
    """
    if not actual_list:
        return 0.0
    relevant = set(actual_list)
    hits = relevant.intersection(predicted_list[:k])
    return len(hits) / len(relevant)

relevant_threshold = 4.0
user_test_group = test_df.groupby('userId')['movieId'].apply(list).to_dict()
movieid_to_title = movies.set_index('movieId')['title'].to_dict()

# Held-out titles each test user liked (rating >= threshold). One groupby
# replaces a full-frame filter per user; users without liked titles keep [].
liked_test_rows = test_df[test_df['rating'] >= relevant_threshold]
liked_titles_by_user = liked_test_rows.groupby('userId')['title'].apply(list).to_dict()
user_test_titles = {u: liked_titles_by_user.get(u, []) for u in user_test_group}

# Rank from a model fitted on the training rows only. A recommender built on
# all of `data` treats every held-out title as "already rated" and can never
# recommend it, which pins Precision@K and Recall@K at zero.
ranking_matrix = train_df.pivot_table(index='userId', columns='title', values='rating')
ranking_similarity = pd.DataFrame(
    cosine_similarity(ranking_matrix.fillna(0).T),
    index=ranking_matrix.columns,
    columns=ranking_matrix.columns,
)


def _recommend_from_train(user_id, top_n):
    """Top ``top_n`` titles the user has not rated in the training split.

    Scores are similarity-weighted averages of the user's training ratings;
    titles with no positively similar rated movie are left out.
    """
    if user_id not in ranking_matrix.index:
        return []
    user_row = ranking_matrix.loc[user_id]
    rated = user_row.dropna()
    unseen = user_row.index[user_row.isna()]
    if rated.empty or len(unseen) == 0:
        return []
    # Rows: candidate titles; columns: titles the user rated in training.
    sims = ranking_similarity.loc[unseen, rated.index]
    weight_sum = sims.sum(axis=1)
    weighted_ratings = sims.mul(rated, axis=1).sum(axis=1)
    scorable = weight_sum > 0
    scores = weighted_ratings[scorable] / weight_sum[scorable]
    # Stable sort so ties resolve deterministically.
    return scores.sort_values(ascending=False, kind="mergesort").head(top_n).index.tolist()


Ks = [5, 10]
results = {}
for K in Ks:
    precisions = []
    recalls = []
    for user, actual in user_test_titles.items():
        if not actual:
            # No liked held-out titles: nothing to hit for this user.
            continue
        recommended = _recommend_from_train(user, K)
        if not recommended:
            continue
        precisions.append(precision_at_k(actual, recommended, k=K))
        recalls.append(recall_at_k(actual, recommended, k=K))

    users_evaluated = len(precisions)
    avg_prec = np.mean(precisions) if users_evaluated > 0 else 0.0
    avg_rec = np.mean(recalls) if users_evaluated > 0 else 0.0

    # results[K] = (mean precision, mean recall, number of users scored)
    results[K] = (avg_prec, avg_rec, users_evaluated)
    print(f"\nEvaluation @K={K}: evaluated users={users_evaluated}, Precision@{K}={avg_prec:.4f}, Recall@{K}={avg_rec:.4f}")

# -----------------------------------------------
# 11) Interactive Recommendation System
# -----------------------------------------------

def _read_min_ratings(default=5):
    """Prompt for a minimum rating count; blank input means ``default``.

    Raises ValueError when the text entered is not an integer.
    """
    raw = input("Enter minimum number of ratings (e.g., 5): ").strip()
    return int(raw) if raw else default


def _print_numbered(titles):
    """Print titles as a 1-based numbered list."""
    for rank, title in enumerate(titles, start=1):
        print(f" {rank}. {title}")


def _menu_recommend(user_id):
    """Menu option 1: personalised top-5 recommendations."""
    print(f"\nGetting top 5 recommendations for User {user_id}...")
    recs = recommend_movies_for_user(user_id, top_n=5)
    if recs:
        _print_numbered(recs)
    else:
        print("Could not generate recommendations (perhaps you have rated no movies?).")


def _menu_top_rated():
    """Menu option 2: top-5 by mean rating; a bad number falls back to 5."""
    try:
        min_reviews = _read_min_ratings()
    except ValueError:
        print("Invalid number. Using default of 5.")
        min_reviews = 5
    else:
        print(f"\nFinding top 5 movies with at least {min_reviews} reviews...")
    top_rated = get_top_rated_movies(n=5, min_ratings=min_reviews)
    # The empty check applies to the fallback path too.
    if top_rated.empty:
        print(f"No movies found with at least {min_reviews} reviews.")
    else:
        print(top_rated.to_string(index=False))


def _menu_top_by_genre():
    """Menu option 3: top-5 by mean rating within a genre."""
    genre_input = input("Enter a genre (e.g., 'Comedy', 'Sci-Fi', 'Action'): ").strip()
    if not genre_input:
        print("No genre entered.")
        return
    try:
        min_reviews = _read_min_ratings()
    except ValueError:
        print("Invalid number for minimum ratings.")
        return
    print(f"\nFinding top 5 '{genre_input}' movies with at least {min_reviews} reviews...")
    top_genre = get_top_movies_by_genre(genre_input, n=5, min_ratings=min_reviews)
    if top_genre.empty:
        print(f"No movies found for genre '{genre_input}' with at least {min_reviews} reviews.")
    else:
        print(top_genre.to_string(index=False))


def _menu_similar():
    """Menu option 4: titles similar to one the user names."""
    movie_input = input("Enter a movie title you like: ").strip()
    if movie_input not in item_similarity_df.index:
        # Unknown title: offer the closest spellings, if any.
        matches = get_close_matches(movie_input, list(item_similarity_df.index), n=3)
        print(f"Error: Movie '{movie_input}' not found.")
        if matches:
            print(f"Did you mean: {matches}?")
        return
    print(f"\nFinding movies similar to '{movie_input}'...")
    sim_movies = get_similar_movies(movie_input, top_n=5)
    if sim_movies:
        _print_numbered(sim_movies)
    else:
        print("Could not find similar movies.")


def run_interactive_session():
    """Run the console menu until the user types 'quit' at the ID prompt.

    The session has two states: choosing a user ID (must exist in
    ``user_item_matrix``) and, once chosen, a five-option menu. All typed
    input is stripped of surrounding whitespace before it is interpreted.
    Closing the input stream or pressing Ctrl-C ends the session cleanly
    instead of raising out of the loop.
    """
    print("\n" + "="*50)
    print(" === Welcome to the Movie Recommendation System! ===")
    print("="*50)

    # N is the module-level subsample size, so this note cannot drift from
    # the value actually used when the ratings were loaded.
    print(f"\n(Note: Using the first {N} ratings from the dataset)")
    print("You can now try User IDs like 1, 2, 3, 4, 5, ...")

    current_user = None

    try:
        while True:
            if current_user is None:
                user_id_input = input("\nType 'quit' to exit.\nPlease enter your User ID: ").strip()

                if user_id_input.lower() == 'quit':
                    break

                try:
                    user_id = int(user_id_input)
                except ValueError:
                    print("Invalid input. Please enter a number.")
                    continue

                if user_id in user_item_matrix.index:
                    current_user = user_id
                    print(f"\nWelcome, User {current_user}! What would you like to do?")
                else:
                    print(f"Error: User ID {user_id} not found in this (subsampled) dataset. Please try again.")
                continue

            print("\n--- Main Menu ---")
            print("1: Get My Top 5 Movie Recommendations")
            print("2: Find Top 5 Movies by Average Rating")
            print("3: Find Top 5 Movies by Genre")
            print("4: Find Movies Similar to One I Like")
            print("5: Logout (Return to User ID selection)")

            choice = input("Enter your choice (1-5): ").strip()

            if choice == '1':
                _menu_recommend(current_user)
            elif choice == '2':
                _menu_top_rated()
            elif choice == '3':
                _menu_top_by_genre()
            elif choice == '4':
                _menu_similar()
            elif choice == '5':
                print(f"Logging out User {current_user}...")
                current_user = None
            else:
                print("Invalid choice. Please enter a number from 1 to 5.")
    except (EOFError, KeyboardInterrupt):
        # input() raises these when stdin is closed or the cell is interrupted.
        print("\nInput closed; ending session.")

    print("\nThank you for using the Movie Recommendation System. Goodbye!")


# --- Run the interactive session ---
# Start the console menu once both dataset paths are set.
if not (movies_path is None or ratings_path is None):
    run_interactive_session()

--- Data Loading from Confirmed Kaggle Path ---
movies: /kaggle/input/movie-recommendation-system/movies.csv
ratings: /kaggle/input/movie-recommendation-system/ratings.csv


FileNotFoundError: Could not find movies.csv or ratings.csv at the expected path: /kaggle/input/movie-recommendation-system. Ensure the initial `kagglehub.dataset_download()` step was run successfully.