In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [4]:
# Load the ratings file: tab-separated, with column names supplied explicitly
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

In [5]:
# Display the ratings frame (the recorded output is a truncated preview)
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [6]:
# Preview the first rows of the ratings frame
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [16]:
# Load the movie metadata file: pipe-separated, with column names supplied explicitly.
# The first five columns describe the movie; the remaining ones are 0/1 genre flags.
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

In [17]:
# Keep only the movie identifiers and the genre flag columns used downstream
# (release dates, the IMDb URL and the 'unknown' flag are left out)
movie_content = items[[
    'movie id', 'movie title',
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]]

In [19]:
# Set the index to run from 1 to the number of movies (1..1682 for ml-100k).
# The upper bound is derived from the frame's length rather than hard-coded,
# so the assignment does not fail with a length mismatch on a different file.
movie_content.index = range(1, len(movie_content) + 1)

In [20]:
# Inspect the re-indexed genre flag table
movie_content

Unnamed: 0,movie id,movie title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,GoldenEye (1995),1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,4,Get Shorty (1995),1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,5,Copycat (1995),0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,1678,Mat' i syn (1997),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1679,1679,B. Monkey (1998),0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1680,1680,Sliding Doors (1998),0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1681,1681,You So Crazy (1994),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
# Collect the categories of each movie into a single list per movie id
def movie_category_list(ds):
    """Return one row per movie with the list of categories it belongs to.

    ``ds`` is a wide frame holding 'movie id' and 'movie title' plus one
    flag column per category. The result has the columns 'movie id',
    'movie title' and 'category', where 'category' is a list of the names
    of the flag columns whose value is 1 for that movie. Movies with no
    flag equal to 1 do not appear in the result.
    """
    # Wide -> long: every flag column becomes a (category, value) row
    long_form = ds.melt(
        id_vars=['movie id', 'movie title'],
        var_name='category',
        value_name='value',
    )
    # Keep only the (movie, category) pairs that are switched on
    is_member = long_form['value'] == 1
    memberships = long_form[is_member]
    # Gather the category names of each movie into a list
    per_movie = memberships.groupby(['movie id', 'movie title'])['category']
    return per_movie.apply(list).reset_index()

In [23]:
# Build the per-movie category lists from the genre flag table
movie_cate_df = movie_category_list(movie_content)

# Display the resulting frame
movie_cate_df

In [25]:
# Retrieve the top movies of each category, ranked by how many distinct users rated them
def movie_cate_counts(ds1, ds2, movie_count):
    """Return the movies rated by the most distinct users in each category.

    Parameters
    ----------
    ds1 : pandas.DataFrame
        One row per movie with the columns 'movie id', 'movie title' and
        'category', where 'category' holds a list of category names.
    ds2 : pandas.DataFrame
        Ratings with at least the columns 'user_id' and 'movie_id'. Any other
        columns (e.g. 'rating', 'unix_timestamp') are ignored.
    movie_count : int
        Maximum number of movies to keep per category.

    Returns
    -------
    list of numpy.ndarray
        One object array per category, ordered by category name. Each row is
        ``[category, movie title, count]``, where ``count`` is the number of
        distinct users with a rating for that movie; rows within an array are
        ordered by descending count.
    """
    # One row per (movie, category) pair, keyed like the ratings frame
    movie_categories = ds1.explode('category').rename(columns={'movie id': 'movie_id'})

    # Only the user/movie pairing matters for counting, so select just those
    # two columns instead of dropping a column that may not be present.
    movie_ratings = pd.merge(movie_categories, ds2[['user_id', 'movie_id']], on='movie_id')

    # nunique already collapses repeated ratings by the same user, so no
    # intermediate per-user aggregation is needed.
    movie_category_count = (
        movie_ratings
        .groupby(['category', 'movie_id', 'movie title'])['user_id']
        .nunique()
        .reset_index(name='count')
    )

    sorted_movies = movie_category_count.sort_values(
        by=['category', 'count'], ascending=[True, False]
    )
    top_movies_categorywise = (
        sorted_movies.groupby('category').head(movie_count).reset_index(drop=True)
    )

    # sort=False keeps categories in their order of appearance, i.e. the
    # ascending category order established by the sort above.
    return [
        group[['category', 'movie title', 'count']].values
        for _, group in top_movies_categorywise.groupby('category', sort=False)
    ]

## This function counts, for each category, how many distinct users rated each movie, and returns the top movies of every category as a list of arrays (one array per category, with rows of category, movie title and user count)

In [26]:
# Keep the top 5 movies per category, ranked by number of distinct rating users
recommend_movies= movie_cate_counts(movie_cate_df,ratings,5)

## These are the movies to recommend to new users in each movie category

In [28]:
# Show the per-category recommendation arrays
recommend_movies

[array([['Action', 'Star Wars (1977)', 583],
        ['Action', 'Return of the Jedi (1983)', 507],
        ['Action', 'Air Force One (1997)', 431],
        ['Action', 'Independence Day (ID4) (1996)', 429],
        ['Action', 'Raiders of the Lost Ark (1981)', 420]], dtype=object),
 array([['Adventure', 'Star Wars (1977)', 583],
        ['Adventure', 'Return of the Jedi (1983)', 507],
        ['Adventure', 'Raiders of the Lost Ark (1981)', 420],
        ['Adventure', 'Rock, The (1996)', 378],
        ['Adventure', 'Empire Strikes Back, The (1980)', 367]],
       dtype=object),
 array([['Animation', 'Toy Story (1995)', 452],
        ['Animation', 'Lion King, The (1994)', 220],
        ['Animation', 'Aladdin (1992)', 219],
        ['Animation', 'Beauty and the Beast (1991)', 202],
        ['Animation', 'Fantasia (1940)', 174]], dtype=object),
 array([["Children's", 'Toy Story (1995)', 452],
        ["Children's", 'Willy Wonka and the Chocolate Factory (1971)',
         326],
        ["Chil