In [3]:
import io, os, requests, zipfile
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

### Implementation classes for Dataset preparation, modelling and recommendation

In [4]:
class DatasetPreparation:
  def __init__(self, dataset_url):
    files = self.download_and_extract_zip(dataset_url)
    
    overall_stats = pd.read_csv('ml-100k/u.info', header=None)
    print("Movie lens dataset -: ",list(overall_stats[0]))

    # item id is same as movie id, item id column is renamed as movie id
    column_names1 = ['user id','movie id','rating','timestamp']
    dataset = pd.read_csv('ml-100k/u.data', sep='\t',header=None,names=column_names1)
    #print(dataset.head())

    cols = 'movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western'
    column_names2 = cols.split(' | ')

    items_dataset = pd.read_csv('ml-100k/u.item', sep='|',header=None,names=column_names2,encoding='latin-1')
    print(items_dataset.head())

    print(f"Movie id range : {min(dataset['movie id'])} to {max(dataset['movie id'])}")

    print(f"Total number of movies {items_dataset['movie id'].nunique()}")

    movie_dataset = items_dataset[['movie id','movie title']]
    #print(movie_dataset.head())

    print(f"Grouping on all execept movie id : {len(items_dataset.groupby(by=column_names2[1:]))}")

    print(f"Length of movie datset : {len(movie_dataset)}")

    merged_dataset = pd.merge(dataset, movie_dataset, how='inner', on='movie id')

    duplicates = items_dataset.groupby("movie title")["movie id"].nunique()
    movies_with_duplicates_ids = duplicates[duplicates > 1].index

    refined_dataset = merged_dataset.groupby(by=['user id','movie title'], as_index=False).agg({"rating":"mean"})
    refined_dataset.head()

    # Store the refined dataset for later use
    self.refined_dataset = refined_dataset
    refined_dataset.to_csv("refined_dataset.csv", index=False)

  # Fixed method definition - added self parameter
  def download_and_extract_zip(self, url, extract_to='.', chunk_size=8192):
      try:
          # Create the extraction directory if it doesn't exist
          os.makedirs(extract_to, exist_ok=True)

          # Download the file
          print(f"Downloading zip file from {url}...")
          response = requests.get(url, stream=True)
          response.raise_for_status()  # Raise an exception for HTTP errors

          # Download and store in memory
          zip_content = io.BytesIO()
          for chunk in response.iter_content(chunk_size=chunk_size):
              if chunk:
                  zip_content.write(chunk)

          # Reset the pointer to the beginning of the BytesIO object
          zip_content.seek(0)

          # Extract the zip file
          print(f"Extracting zip file to {extract_to}...")
          with zipfile.ZipFile(zip_content) as z:
              z.extractall(extract_to)
              extracted_files = z.namelist()

          #print(f"Successfully extracted {len(extracted_files)} files.")
          return extracted_files

      except requests.exceptions.RequestException as e:
          print(f"Error downloading the data: {e}")
          raise
      except zipfile.BadZipFile:
          print("The downloaded file is not a valid zip file.")
          raise
      except Exception as e:
          print(f"An error occurred: {e}")
          raise

In [5]:
import numpy as np

class ModelBuilder:
    def __new__(cls, prepared_data_path):
        instance = super(ModelBuilder, cls).__new__(cls)
        dataset = pd.read_csv(prepared_data_path)
        user_to_movie_df = dataset.pivot(
            index='user id', 
            columns='movie title', 
            values='rating'
        ).fillna(0)
        
        user_to_movie_sparse_df = csr_matrix(user_to_movie_df.values)
        model = NearestNeighbors(metric='cosine', algorithm='brute')
        model.fit(user_to_movie_sparse_df)
        
        return model


class MovieRecommender:
    def __init__(self, refined_dataset_path, knn_model):
        """
        Initialize the MovieRecommender with necessary data.

        Parameters:
        -----------
        refined_dataset_path : pandas.DataFrame
            Path to the dataset containing user-movie interactions with columns 'user id' and 'movie title'
        user_to_movie_df : pandas.DataFrame
            User-movie matrix where rows represent users and columns represent movies
        knn_model : sklearn.neighbors.NearestNeighbors
            Trained KNN model for finding similar users
        """
        self.refined_dataset = pd.read_csv(refined_dataset_path)
        self.user_to_movie_df = self.refined_dataset.pivot(index='user id',columns='movie title',values='rating').fillna(0)
        self.knn_model = knn_model
        self.movies_list = self.user_to_movie_df.columns

    def get_movies_seen_by_user(self, user_id):
        """
        Get list of movies seen by a specific user.

        Parameters:
        -----------
        user_id : int
            ID of the user

        Returns:
        --------
        list
            List of movie titles seen by the user
        """
        return list(self.refined_dataset[self.refined_dataset['user id'] == user_id]['movie title'])

    def get_similar_users(self, user_id, n_similar_users=5):
        """
        Find users similar to the given user based on movie ratings.

        Parameters:
        -----------
        user_id : int
            ID of the user
        n_similar_users : int, optional
            Number of similar users to find (default is 5)

        Returns:
        --------
        tuple
            (similar_users_ids, distances)
        """
        knn_input = np.asarray([self.user_to_movie_df.values[user_id-1]])

        distances, indices = self.knn_model.kneighbors(knn_input, n_neighbors=n_similar_users+1)

        print(f"Top {n_similar_users} users who are very much similar to the User-{user_id} are: ")
        print(" ")

        for i in range(1, len(distances[0])):
            print(f"{i}. User: {indices[0][i]+1}, separated by distance of {distances[0][i]}")
        print("")

        return indices.flatten()[1:] + 1, distances.flatten()[1:]

    def calculate_weighted_ratings(self, similar_user_list, distance_list):
        """
        Calculate weighted ratings based on similar users and their distances.

        Parameters:
        -----------
        similar_user_list : numpy.ndarray
            List of similar user IDs
        distance_list : numpy.ndarray
            List of distances for the similar users

        Returns:
        --------
        numpy.ndarray
            Weighted mean rating list for all movies
        """
        # Normalize the distance scores
        weightage_list = distance_list / np.sum(distance_list)

        # Ratings given to movies by similar users
        mov_rtngs_sim_users = self.user_to_movie_df.values[similar_user_list-1]

        weightage_list = weightage_list[:, np.newaxis] + np.zeros(len(self.movies_list))

        # Finding weighted rating - product of ratings given by similar users
        new_rating_matrix = weightage_list * mov_rtngs_sim_users

        # Find the sum of each movie
        mean_rating_list = new_rating_matrix.sum(axis=0)

        return mean_rating_list

    def get_filtered_movie_recommendations(self, user_id, mean_rating_list, n_movies=10):
        """
        Filter movie recommendations based on weighted ratings and user history.

        Parameters:
        -----------
        user_id : int
            ID of the user
        mean_rating_list : numpy.ndarray
            Weighted mean ratings for all movies
        n_movies : int, optional
            Number of movie recommendations to return (default is 10)

        Returns:
        --------
        list
            List of recommended movie titles
        """
        # Find the first index where 0 occurs in the mean rating list
        zero_indices = np.where(mean_rating_list == 0)[0]
        if len(zero_indices) > 0:
            first_zero_index = zero_indices[-1]
        else:
            first_zero_index = len(mean_rating_list) - 1

        # Sort the ratings
        sortd_index = np.argsort(mean_rating_list)[::-1]
        sortd_index = sortd_index[:list(sortd_index).index(first_zero_index)]

        # Limit the number of recommendations
        n = min(len(sortd_index), n_movies)

        # Get movies watched by current user
        movies_watched = self.get_movies_seen_by_user(user_id)

        # Filter movies not yet watched by the user
        filtered_movie_list = list(self.movies_list[sortd_index])
        count = 0
        final_movie_list = []

        for i in filtered_movie_list:
            if i not in movies_watched:
                count += 1
                final_movie_list.append(i)
            if count == n:
                break

        return final_movie_list

    def recommend_movies(self, user_id, n_similar_users=5, n_movies=10):
        """
        Generate movie recommendations for a user.

        Parameters:
        -----------
        user_id : int
            ID of the user
        n_similar_users : int, optional
            Number of similar users to consider (default is 5)
        n_movies : int, optional
            Number of movie recommendations to return (default is 10)

        Returns:
        --------
        list
            List of recommended movie titles
        """
        # Display movies seen by the user
        movies_seen = self.get_movies_seen_by_user(user_id)
        print("Movies seen by the User:")
        pprint(movies_seen)
        print("")

        # Find similar users
        similar_user_list, distance_list = self.get_similar_users(user_id, n_similar_users)

        # Calculate weighted ratings
        mean_rating_list = self.calculate_weighted_ratings(similar_user_list, distance_list)

        # Get filtered recommendations
        recommendations = self.get_filtered_movie_recommendations(user_id, mean_rating_list, n_movies)

        print("")
        print("Movies recommended based on similar users are:")
        print("")

        if len(recommendations) == 0:
            print("There are no movies left which are not seen by the input users and seen by similar users. "
                  "May be increasing the number of similar users who are to be considered may give a chance "
                  "of suggesting an unseen good movie.")
        else:
            print("Movies")
            #pprint(recommendations)

        return recommendations

In [6]:
# Example usage:
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'

dataset_preparation = DatasetPreparation(url)

print("Dataset prepared successfully")

refined_dataset_path = r"refined_dataset.csv"

model = ModelBuilder(refined_dataset_path)

recommender = MovieRecommender(refined_dataset_path, model)

Downloading zip file from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Extracting zip file to ....
Movie lens dataset -:  ['943 users', '1682 items', '100000 ratings']
   movie id        movie title release date  video release date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://us.imdb.com/M/title-exact?Copycat%20(1

In [7]:
recommender.recommend_movies(user_id=307, n_similar_users=10, n_movies=5)

Movies seen by the User:


NameError: name 'pprint' is not defined

In [13]:
import pickle as p
model_name = "knn_model.pkl"
p.dump(model, open(model_name, 'wb'))

mdl = p.load(open(model_name, 'rb')) 

In [14]:
recommender = MovieRecommender(refined_dataset_path, mdl)
recommender.recommend_movies(user_id=307, n_similar_users=10, n_movies=5)

Movies seen by the User:
['12 Angry Men (1957)',
 '2001: A Space Odyssey (1968)',
 'Abyss, The (1989)',
 'Alien (1979)',
 'Apollo 13 (1995)',
 'Back to the Future (1985)',
 'Barbarella (1968)',
 'Batman (1989)',
 'Beauty and the Beast (1991)',
 'Blade Runner (1982)',
 'Blues Brothers, The (1980)',
 'Boot, Das (1981)',
 'Brady Bunch Movie, The (1995)',
 'Braveheart (1995)',
 'Brazil (1985)',
 'Casablanca (1942)',
 'Close Shave, A (1995)',
 'Contact (1997)',
 'Crying Game, The (1992)',
 'Dead Poets Society (1989)',
 'Dial M for Murder (1954)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)',
 'Dragonheart (1996)',
 'E.T. the Extra-Terrestrial (1982)',
 'Empire Strikes Back, The (1980)',
 'English Patient, The (1996)',
 'Englishman Who Went Up a Hill, But Came Down a Mountain, The (1995)',
 'Escape from L.A. (1996)',
 'Fargo (1996)',
 'Fast, Cheap & Out of Control (1997)',
 'Field of Dreams (1989)',
 'Fish Called Wanda, A (1988)',
 'Four Weddings and a Funer

['Aliens (1986)',
 'Silence of the Lambs, The (1991)',
 'Willy Wonka and the Chocolate Factory (1971)',
 'Amadeus (1984)',
 'Groundhog Day (1993)']

In [1]:
from scripts.DatasetPreparation import DatasetPreparation
from scripts.ModelBuilder import ModelBuilder
from scripts.MovieRecommender import MovieRecommender

In [2]:
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'

dataset_preparation = DatasetPreparation(url)

refined_dataset_path = "refined_dataset.csv"

model = ModelBuilder(refined_dataset_path)

Downloading zip file from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Movie lens dataset -:  ['943 users', '1682 items', '100000 ratings']
Movie id range : 1 to 1682
Total number of movies 1682
Length of movie datset : 1682
Dataset stored successfully to data/
Model saved successfully to model\knn_model.pkl


In [3]:
from pprint import pprint

model_path = "knn_model.pkl"

recommender = MovieRecommender(refined_dataset_path, model_path)

recommender.recommend_movies(user_id=307, n_similar_users=10, n_movies=5)

['Aliens (1986)',
 'Silence of the Lambs, The (1991)',
 'Willy Wonka and the Chocolate Factory (1971)',
 'Amadeus (1984)',
 'Groundhog Day (1993)']