<a href="https://colab.research.google.com/github/Sriyansh-00/RECOMMENDATION-SYSTEM/blob/main/RECOMMENDATION_SYSTEM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# SETUP AND DATASET DOWNLOAD (No Kaggle needed)
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

--2025-06-19 09:56:45--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.2’


2025-06-19 09:56:46 (8.50 MB/s) - ‘ml-100k.zip.2’ saved [4924029/4924029]

Archive:  ml-100k.zip
replace ml-100k/allbut.pl? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/

In [12]:
#  DATA LOADING AND PREPROCESSING (FIXED DUPLICATES ISSUE)
import pandas as pd
import numpy as np

# Load the data from the extracted files
ratings_path = 'ml-100k/u.data'
movies_path = 'ml-100k/u.item'

# Load ratings data
ratings_cols = ['user_id', 'item_id', 'rating', 'timestamp']
ratings_df = pd.read_csv(ratings_path, sep='\t', names=ratings_cols, encoding='latin-1')

# Load movie information
movies_cols = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date',
    'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
    'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western'
]
movies_df = pd.read_csv(movies_path, sep='|', names=movies_cols, encoding='latin-1')

# Merge movie titles with ratings
ratings_df = ratings_df.merge(movies_df[['movie_id', 'movie_title']], left_on='item_id', right_on='movie_id')
ratings_df.drop(columns=['movie_id', 'timestamp'], inplace=True)

# Handle duplicates by taking the mean rating for user-movie pairs
ratings_df = ratings_df.groupby(['user_id', 'movie_title'])['rating'].mean().reset_index()

# Create user-item matrix
user_item_matrix = ratings_df.pivot(index='user_id', columns='movie_title', values='rating').fillna(0)

# Split data into train and test
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Data exploration
print("MovieLens 100K Dataset Overview:")
print(f"Total ratings: {len(ratings_df)}")
print(f"Unique users: {ratings_df['user_id'].nunique()}")
print(f"Unique movies: {ratings_df['movie_title'].nunique()}")
print("\nSample ratings:")
print(ratings_df.head())
print("\nSample movies:")
print(movies_df[['movie_id', 'movie_title']].head())

MovieLens 100K Dataset Overview:
Total ratings: 99693
Unique users: 943
Unique movies: 1664

Sample ratings:
   user_id                          movie_title  rating
0        1                101 Dalmatians (1996)     2.0
1        1                  12 Angry Men (1957)     5.0
2        1  20,000 Leagues Under the Sea (1954)     3.0
3        1         2001: A Space Odyssey (1968)     4.0
4        1                    Abyss, The (1989)     3.0

Sample movies:
   movie_id        movie_title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


In [13]:
#  USER-USER COLLABORATIVE FILTERING
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def user_user_cf(user_item_matrix, target_user, k=5):
    """User-based collaborative filtering"""
    user_similarity = cosine_similarity(user_item_matrix)
    np.fill_diagonal(user_similarity, 0)

    user_idx = user_item_matrix.index.get_loc(target_user)
    similar_users = np.argsort(user_similarity[user_idx])[-k:]

    predictions = []
    target_ratings = user_item_matrix.iloc[user_idx]
    unrated_movies = target_ratings[target_ratings == 0].index

    for movie in unrated_movies:
        movie_idx = user_item_matrix.columns.get_loc(movie)
        numerator = user_similarity[user_idx, similar_users] @ user_item_matrix.iloc[similar_users, movie_idx]
        denominator = np.sum(np.abs(user_similarity[user_idx, similar_users]))
        if denominator > 0:
            pred_rating = numerator / denominator
            predictions.append((movie, pred_rating))

    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[:10]

# Example usage
sample_user = 196
print(f"\nTop 10 Recommendations for User {sample_user}:")
for movie, rating in user_user_cf(user_item_matrix, sample_user):
    print(f"{movie}: {rating:.2f}")


Top 10 Recommendations for User 196:
Back to the Future (1985): 3.33
Fargo (1996): 3.28
Monty Python and the Holy Grail (1974): 3.15
Indiana Jones and the Last Crusade (1989): 3.12
Butch Cassidy and the Sundance Kid (1969): 2.91
Sting, The (1973): 2.91
When Harry Met Sally... (1989): 2.83
Dave (1993): 2.80
Pulp Fiction (1994): 2.78
Star Wars (1977): 2.75


In [14]:
#  ITEM-ITEM COLLABORATIVE FILTERING
def item_item_cf(user_item_matrix, target_user, k=5):
    """Item-based collaborative filtering"""
    item_similarity = cosine_similarity(user_item_matrix.T)
    np.fill_diagonal(item_similarity, 0)

    user_idx = user_item_matrix.index.get_loc(target_user)
    rated_movies = user_item_matrix.iloc[user_idx][user_item_matrix.iloc[user_idx] > 0]

    predictions = []
    for movie in user_item_matrix.columns:
        if not rated_movies.get(movie, False):
            movie_idx = user_item_matrix.columns.get_loc(movie)
            similar_movies = np.argsort(item_similarity[movie_idx])[-k:]

            similar_movies_rated = [
                (sim_movie, item_similarity[movie_idx, sim_movie], user_item_matrix.iloc[user_idx, sim_movie])
                for sim_movie in similar_movies
                if user_item_matrix.iloc[user_idx, sim_movie] > 0
            ]

            if similar_movies_rated:
                sim_scores = np.array([sim[1] for sim in similar_movies_rated])
                ratings = np.array([sim[2] for sim in similar_movies_rated])
                pred_rating = np.dot(sim_scores, ratings) / np.sum(sim_scores)
                predictions.append((movie, pred_rating))

    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[:10]

# Example usage
print(f"\nTop 10 Recommendations for User {sample_user}:")
for movie, rating in item_item_cf(user_item_matrix, sample_user):
    print(f"{movie}: {rating:.2f}")


Top 10 Recommendations for User 196:
Billy Madison (1995): 5.00
Local Hero (1983): 5.00
Man Who Would Be King, The (1975): 5.00
Young Guns II (1990): 5.00
Ace Ventura: When Nature Calls (1995): 5.00
Airheads (1994): 5.00
Another Stakeout (1993): 5.00
Benny & Joon (1993): 5.00
Beverly Hills Cop III (1994): 5.00
Boomerang (1992): 5.00


In [15]:
# MATRIX FACTORIZATION (SVD) - FIXED SYNTAX
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

def matrix_factorization(train_df, user_item_matrix, k=10):
    """Matrix factorization with SVD"""
    # Convert to numpy array and subtract user means
    R = user_item_matrix.values
    user_ratings_mean = np.mean(R, axis=1)
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)

    # Perform SVD
    U, sigma, Vt = svds(R_demeaned, k=k)
    sigma = np.diag(sigma)

    # Make predictions
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    preds_df = pd.DataFrame(all_user_predicted_ratings, columns=user_item_matrix.columns, index=user_item_matrix.index)

    return preds_df

# Train model
print("\nTraining Matrix Factorization...")
preds_df = matrix_factorization(train_df, user_item_matrix, k=10)

# Get recommendations for sample user - FIXED SYNTAX
def recommend_mf(user_id, preds_df, user_item_matrix, n=10):
    # Get and sort the user's predictions
    user_row = user_item_matrix.index.get_loc(user_id)
    sorted_user_predictions = preds_df.iloc[user_row].sort_values(ascending=False)

    # Get the user's data and merge in the movie information
    user_data = user_item_matrix.iloc[user_row]
    user_full = (user_data.to_frame()
                .reset_index()
                .rename(columns={user_data.name: 'actual'}))

    # Recommend the highest predicted rating movies not yet rated
    recommendations = (sorted_user_predictions.to_frame()
                      .reset_index()
                      .rename(columns={sorted_user_predictions.name: 'predicted'}))

    # Merge with user_full to filter out already rated movies
    recs = recommendations[~recommendations['movie_title'].isin(user_full['movie_title'])]
    recs = recs.sort_values('predicted', ascending=False).head(n)

    return recs

# Example usage
sample_user = 196
print(f"\nTop 10 MF Recommendations for User {sample_user}:")
recommendations = recommend_mf(sample_user, preds_df, user_item_matrix)
print(recommendations[['movie_title', 'predicted']])


Training Matrix Factorization...

Top 10 MF Recommendations for User 196:
Empty DataFrame
Columns: [movie_title, predicted]
Index: []
