In [107]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

# Performing Exploratory Data Analysis

In [108]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [109]:
ratings.shape

(105339, 4)

In [110]:
movies.shape

(10329, 3)

In [111]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [112]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [114]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


In [121]:
unique_users = ratings['userId'].nunique()
unique_movies = ratings['movieId'].nunique()
print("Number of unique users:", unique_users)
print("Number of unique movies:", unique_movies)

Number of unique users: 668
Number of unique movies: 10325


In [132]:
movie_genre_ratings = movies.merge(ratings, on='movieId')
average_genre_ratings = movie_genre_ratings.groupby('genres')['rating'].mean()
total_movies_genre = movie_genre_ratings['genres'].value_counts()
print("AVERAGE RATING BY GENRE: ")
print(average_genre_ratings)
print("\nTOTAL MOVIE BY GENRE :")
print(total_movies_genre)

AVERAGE RATING BY GENRE: 
genres
(no genres listed)                     3.071429
Action                                 2.836406
Action|Adventure                       3.739804
Action|Adventure|Animation             4.125000
Action|Adventure|Animation|Children    3.550000
                                         ...   
Sci-Fi|Thriller|IMAX                   3.500000
Thriller                               3.473430
Thriller|War                           3.500000
War                                    3.613636
Western                                3.500000
Name: rating, Length: 938, dtype: float64

TOTAL MOVIE BY GENRE :
Drama                                      7678
Comedy                                     6676
Comedy|Romance                             3733
Drama|Romance                              3407
Comedy|Drama                               3101
                                           ... 
Adventure|Drama|Romance|Sci-Fi|Thriller       1
Action|Comedy|Drama|Horror           

In [134]:
unique_genres = movies['genres'].nunique()
print("Number of unique genres considered:", unique_genres)

Number of unique genres considered: 938


# Task-1

In [6]:
import numpy as np
import pandas as pd

In [7]:
movies = pd.read_csv('movies.csv')

In [11]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [71]:
ratings = pd.read_csv('ratings.csv')

In [79]:
genre = input('Enter the genre: ')
genre_movies = movies[movies['genres'].str.contains(genre)]

threshold = int(input("Enter the minimum review threshold: "))
review_counts = ratings['movieId'].value_counts().rename('review_count')
genre_movies = genre_movies.merge(review_counts, left_on='movieId', right_index=True)
genre_movies = genre_movies[genre_movies['review_count'] >= threshold]

average_ratings = ratings.groupby('movieId')['rating'].mean().rename('average_rating')
genre_movies = genre_movies.merge(average_ratings, left_on='movieId', right_index=True)
movies_sorted = genre_movies.sort_values(by='average_rating', ascending=False)

N = int(input("Enter the number of recommendations: "))
recommended_movies = movies_sorted.head(N)

print(recommended_movies[['movieId', 'title', 'average_rating']])
pd.DataFrame(recommended_movies)

Enter the genre: Drama
Enter the minimum review threshold: 100
Enter the number of recommendations: 3
     movieId                             title  average_rating
279      318  Shawshank Redemption, The (1994)        4.454545
695      858             Godfather, The (1972)        4.392857
471      527           Schindler's List (1993)        4.296371


Unnamed: 0,movieId,title,genres,review_count,average_rating
279,318,"Shawshank Redemption, The (1994)",Crime|Drama,308,4.454545
695,858,"Godfather, The (1972)",Crime|Drama,210,4.392857
471,527,Schindler's List (1993),Drama|War,248,4.296371


# Task-2

In [106]:
import pandas as pd

movies = pd.read_csv('movies.csv')
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [87]:
movie_title = input('Enter the Movie Title: ')

N = int(input('Enter number of recommendations: '))

if movie_title not in movies['title'].values:
    print("Movie title not found in the dataset.")
else:
    selected_movie = movies[movies['title'] == movie_title].iloc[0]

    selected_movie_genres = selected_movie['genres']

    similar_movies = movies[movies['genres'].apply(lambda x: any(genre in x for genre in selected_movie_genres))]

    if len(similar_movies) > 0:
        similarity_scores = similar_movies.apply(lambda row: sum(genre in selected_movie_genres for genre in row['genres']), axis=1)
        similar_movies.loc[:, 'similarity_score'] = similarity_scores

        sorted_movies = similar_movies.sort_values(by='similarity_score', ascending=False)

        recommended_movies = sorted_movies.head(N)

        recommended_movies_df = pd.DataFrame(recommended_movies[['movieId', 'title', 'similarity_score']])
        print(recommended_movies_df)
    else:
        print("No similar movies found.")
pd.DataFrame(recommended_movies_df)

Enter the Movie Title: GoldenEye (1995)
Enter number of recommendations: 5
      movieId                                              title  \
8736    81132                                      Rubber (2010)   
6220    26701  Patlabor: The Movie (Kidô keisatsu patorebâ: T...   
8469    75408  Lupin III: Sweet Lost Night (Rupan Sansei: Swe...   
8492    76153  Lupin III: First Contact (Rupan Sansei: Faasut...   
7558    56152                                   Enchanted (2007)   

      similarity_score  
8736                57  
6220                43  
8469                42  
8492                42  
7558                42  


Unnamed: 0,movieId,title,similarity_score
8736,81132,Rubber (2010),57
6220,26701,Patlabor: The Movie (Kidô keisatsu patorebâ: T...,43
8469,75408,Lupin III: Sweet Lost Night (Rupan Sansei: Swe...,42
8492,76153,Lupin III: First Contact (Rupan Sansei: Faasut...,42
7558,56152,Enchanted (2007),42


# Task-3

In [88]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [92]:
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [98]:
user_id = int(input('UserID: '))
N = int(input('Num Recommendations: '))
K = int(input('Threshold: '))

similarity_matrix = cosine_similarity(ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0))
similar_users_indices = similarity_matrix[user_id-1].argsort()[::-1][1:k+1]

movies_rated_by_similar_users = ratings[ratings['userId'].isin(similar_users_indices + 1)]
average_ratings = movies_rated_by_similar_users.groupby('movieId')['rating'].mean().rename('average_rating')

sorted_movies = average_ratings.sort_values(ascending=False)
recommended_movies = sorted_movies.head(N)

pd.DataFrame(recommended_movies)

UserID: 69
Num Recommendations: 5
Threshold: 150


Unnamed: 0_level_0,average_rating
movieId,Unnamed: 1_level_1
7034,5.0
35347,5.0
5007,5.0
218,5.0
31785,5.0


In [105]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

def recommend_movies(genre, threshold, N):
    genre_movies = movies[movies['genres'].str.contains(genre)]

    review_counts = ratings['movieId'].value_counts().rename('review_count')
    genre_movies = genre_movies.merge(review_counts, left_on='movieId', right_index=True)
    genre_movies = genre_movies[genre_movies['review_count'] >= threshold]

    average_ratings = ratings.groupby('movieId')['rating'].mean().rename('average_rating')
    genre_movies = genre_movies.merge(average_ratings, left_on='movieId', right_index=True)
    movies_sorted = genre_movies.sort_values(by='average_rating', ascending=False)

    recommended_movies = movies_sorted.head(N)
    return recommended_movies[['movieId', 'title', 'average_rating']]

genre_widget = widgets.Text(description='Genre:')
threshold_widget = widgets.IntSlider(description='Minimum review threshold:', min=0, max=500, step=10)
N_widget = widgets.IntText(description='Number of recommendations:', min=1, max=10, step=1)

button = widgets.Button(description='Recommend Movies')
output = widgets.Output()

def on_button_clicked(b):
    with output:
        output.clear_output()
        recommended_movies = recommend_movies(genre_widget.value, threshold_widget.value, N_widget.value)
        display(recommended_movies)

button.on_click(on_button_clicked)
display(genre_widget, threshold_widget, N_widget, button, output)


Text(value='', description='Genre:')

IntSlider(value=0, description='Minimum review threshold:', max=500, step=10)

IntText(value=0, description='Number of recommendations:')

Button(description='Recommend Movies', style=ButtonStyle())

Output()