In [None]:
import pandas as pd


# Load the ratings data
ratings = pd.read_csv('../dataset/movielens_dataset/ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
# Load the movies data
movies = pd.read_csv('../dataset/movielens_dataset/ml-100k/u.item', sep='|', encoding='latin-1', 
                     names=['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 
                            'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
                            'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                            'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

# Load the users data
users = pd.read_csv('../dataset/movielens_dataset/ml-100k/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

# Display the first few rows of each dataset
print("Ratings Data:")
print(ratings.head())

print("\nMovies Data:")
print(movies.head())

print("\nUsers Data:")
print(users.head())

Ratings Data:
   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596

Movies Data:
   movie_id              title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            imdb_url  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?G

In [1]:
# Extract relevant columns from ratings data
ratings = ratings[['user_id', 'movie_id', 'rating']]

# Display the shape of the ratings data
print("Ratings Data Shape:", ratings.shape)

NameError: name 'ratings' is not defined

In [None]:
# Check for missing values
print("Missing Values in Ratings Data:")
print(ratings.isnull().sum())

Missing Values in Ratings Data:
user_id     0
movie_id    0
rating      0
dtype: int64


In [None]:
# Filter out movies with fewer than 10 ratings
movie_counts = ratings['movie_id'].value_counts()
ratings = ratings[ratings['movie_id'].isin(movie_counts[movie_counts >= 10].index)]

# Filter out users with fewer than 10 ratings
user_counts = ratings['user_id'].value_counts()
ratings = ratings[ratings['user_id'].isin(user_counts[user_counts >= 10].index)]

# Display the shape of the cleaned ratings data
print("Cleaned Ratings Data Shape:", ratings.shape)

Cleaned Ratings Data Shape: (97953, 3)


In [None]:
from scipy.sparse import csr_matrix

# Create a user-item interaction matrix
user_item_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

# Convert to a sparse matrix for efficiency
user_item_matrix_sparse = csr_matrix(user_item_matrix.values)

# Display the shape of the interaction matrix
print("User-Item Interaction Matrix Shape:", user_item_matrix_sparse.shape)

User-Item Interaction Matrix Shape: (943, 1152)


In [None]:
# Create mappings for user and movie IDs
user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_item_matrix.index)}
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(user_item_matrix.columns)}

# Display sample mappings
print("User ID to Index Mapping (Sample):", list(user_id_to_index.items())[:5])
print("Movie ID to Index Mapping (Sample):", list(movie_id_to_index.items())[:5])

User ID to Index Mapping (Sample): [(1, 0), (2, 1), (3, 2), (4, 3), (5, 4)]
Movie ID to Index Mapping (Sample): [(1, 0), (2, 1), (3, 2), (4, 3), (5, 4)]


In [5]:
# Load the MovieLens dataset
def load_movielens_data(data_path):
    """
    Load the MovieLens dataset from the specified path.
    """
    ratings = pd.read_csv(os.path.join(data_path, 'u.data'), sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
    movies = pd.read_csv(os.path.join(data_path, 'u.item'), sep='|', encoding='latin-1', 
                         names=['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 
                                'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
                                'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                                'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])
    users = pd.read_csv(os.path.join(data_path, 'u.user'), sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
    return ratings, movies, users

# Preprocess the data
def preprocess_data(ratings):
    """
    Preprocess the ratings data to create a user-item interaction matrix.
    """
    # Create a user-item interaction matrix
    user_item_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
    user_item_matrix_sparse = csr_matrix(user_item_matrix.values)
    return user_item_matrix_sparse


In [None]:
import sys
import os

# Add the 'src' directory to the Python path
current_dir = os.getcwd()  # Use the current working directory
src_dir = os.path.abspath(os.path.join(current_dir, '..', 'src'))
sys.path.append(src_dir)

# Import the KNN classes
from recommenders.knn.item_knn import ItemKNN
from recommenders.knn.user_knn import UserKNN
# print(sys.path)  
# Initialize and fit the UserKNN mode
data_path = 'ml-100k

ratings, movies, users = load_movielens_data(data_path)
user_item_matrix_sparse = preprocess_data(ratings)

user_knn = UserKNN(k=5, similarity_metric='cosine')
user_knn.fit(user_item_matrix_sparse)

# Generate recommendations for user 1
user_recommendations = user_knn.recommend(user_id=1, top_n=5)
print("UserKNN Recommendations for User 1:", user_recommendations)

# Initialize and fit the ItemKNN model
item_knn = ItemKNN(k=5, similarity_metric='cosine')
item_knn.fit(user_item_matrix_sparse)

# Generate recommendations for user 1
item_recommendations = item_knn.recommend(user_id=1, top_n=5)
print("ItemKNN Recommendations for User 1:", item_recommendations)

# src/recommenders/similarity/compute_similarity.py

NameError: name 'user_item_matrix_sparse' is not defined

In [2]:
# Map recommended movie IDs to titles
def get_movie_titles(movie_ids, movie_id_to_index, movies):
    index_to_movie_id = {idx: movie_id for movie_id, idx in movie_id_to_index.items()}
    recommended_movie_ids = [index_to_movie_id[idx] for idx in movie_ids]
    return movies[movies['movie_id'].isin(recommended_movie_ids)]['title']

# Get movie titles for UserKNN recommendations
user_recommended_titles = get_movie_titles(user_recommendations, movie_id_to_index, movies)
print("UserKNN Recommended Movies for User 1:")
print(user_recommended_titles)

# Get movie titles for ItemKNN recommendations
item_recommended_titles = get_movie_titles(item_recommendations, movie_id_to_index, movies)
print("ItemKNN Recommended Movies for User 1:")
print(item_recommended_titles)

NameError: name 'user_recommendations' is not defined