In [58]:
import pandas as pd 
from datetime import datetime 
import seaborn as sns 
import matplotlib.pyplot as plt

The next step involves opening the datasets and examining the first five rows of the dataset to gain initial understanding of its structure.

In [59]:
# Reading the movies dataset 
df_movies = pd.read_csv('movies.csv')
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [60]:
# Reading the ratings dataset 
df_ratings = pd.read_csv('ratings.csv')
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [61]:
df_merged = df_movies.merge(df_ratings, how="left", on='movieId')

In [62]:
df_merged.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


Step 3: Calculate item popularity
Calculate the popularity of each item by computing the average rating. You can group the data by "movie_id" and calculate the mean rating for each movie.

python

In [63]:
# item_popularity = df_merged.groupby('movieId')['rating'].mean().reset_index()


In [64]:
# item_popularity.head()


In [65]:
# item_popularity.tail()

Sort the items in descending order based on their average rating to find the most popular items.

In [66]:
# top_items = item_popularity.sort_values(by='rating', ascending=False)


In [67]:
# top_items.tail()

You can recommend the top-rated items to all users by selecting a fixed number of items from the top of the sorted list. For example, let's recommend the top 10 items:

In [47]:
# top_10_recommendations = top_items.head(10)


In [48]:
# top_10_recommendations

Unnamed: 0,movieId,rating
7656,88448,5.0
8107,100556,5.0
9083,143031,5.0
9094,143511,5.0
9096,143559,5.0
4251,6201,5.0
8154,102217,5.0
8148,102084,5.0
4246,6192,5.0
9122,145994,5.0


# Part Two of modeling with movie_name

In [49]:
# Part Two of modeling with movie_name

df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [50]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [51]:
# df_merged.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


# trial 3 

In [68]:
# Baseline Model: Popularity-Based
# Calculate the average rating for each movie
movie_ratings = df_merged.groupby('title')['rating'].mean().reset_index()

# Sort movies by average rating in descending order
top_popular_movies = movie_ratings.sort_values(by='rating', ascending=False)

# Recommend the top 5 popular movies
top_5_popular_movies = top_popular_movies.head(5)
print("Top 5 Popular Movies:")
print(top_5_popular_movies)


Top 5 Popular Movies:
                                    title  rating
3360            Gena the Crocodile (1969)     5.0
8968                  True Stories (1986)     5.0
1991        Cosmic Scrat-tastrophe (2015)     5.0
5278              Love and Pigeons (1985)     5.0
7046  Red Sorghum (Hong gao liang) (1987)     5.0


In [11]:
!pip install scikit-surprise




In [72]:
# Importing Surprise libraries
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy

# Create a Reader object
reader = Reader(rating_scale=(1, 5))

# Load the dataset using Surprise's Dataset class
data = Dataset.load_from_df(df_merged[['userId', 'movieId', 'rating']], reader)

# Split the dataset into a train and test set
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build a collaborative filtering model
sim_options = {
    'name': 'cosine',  # Use cosine similarity
    'user_based': True  # User-based collaborative filtering
}

model = KNNBasic(sim_options=sim_options)

# Train the model on the training set
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model (you can use RMSE or other metrics)
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse:.4f}')

# Function to get top N movie recommendations for a user
def get_top_n_recommendations(predictions, n=5):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))
    
    # Sort the recommendations for each user and get the top N
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    
    return top_n

# Replace 'user_id_to_recommend_for' with the target user's ID
user_id_to_recommend_for = 567

# Get top 5 movie recommendations for the user
top_n_recommendations = get_top_n_recommendations(predictions, n=5)
user_top_5_recommendations = top_n_recommendations.get(user_id_to_recommend_for, [])
print(f"Top 5 Movie Recommendations for User {user_id_to_recommend_for}:")
for movie_id, estimated_rating in user_top_5_recommendations:
    movie_title = df_movies[df_movies['movieId'] == movie_id]['title'].values[0]
    print(f"{movie_title} (Estimated Rating: {estimated_rating:.2f})")


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9659
RMSE: 0.9659
Top 5 Movie Recommendations for User 567:
Come and See (Idi i smotri) (1985) (Estimated Rating: 5.00)
The Lego Batman Movie (2017) (Estimated Rating: 4.67)
Man Bites Dog (C'est arrivé près de chez vous) (1992) (Estimated Rating: 4.63)
Funny Games U.S. (2007) (Estimated Rating: 4.50)
Good Will Hunting (1997) (Estimated Rating: 4.30)


In [70]:
import pandas as pd

# Check for missing values in the df_merged DataFrame
missing_values = df_merged.isna().sum()

# Display columns with missing values (if any)
columns_with_missing_values = missing_values[missing_values > 0]
if not columns_with_missing_values.empty:
    print("Columns with Missing Values:")
    print(columns_with_missing_values)
else:
    print("No Missing Values Found in the Dataset")


Columns with Missing Values:
userId       18
rating       18
timestamp    18
dtype: int64


In [15]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100854 entries, 0 to 100853
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100854 non-null  int64  
 1   title      100854 non-null  object 
 2   genres     100854 non-null  object 
 3   userId     100836 non-null  float64
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 4.6+ MB


In [71]:
df_merged.dropna(inplace=True)


In [33]:
df_merged.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


## Trial 4

Here I use Average Rating-Based Reccomender 

In [34]:
import numpy as np

In [35]:
# from sklearn.model_selection import train_test_split

# # Split the dataset into a train and test set
# train_data, test_data = train_test_split(df_merged, test_size=0.2, random_state=42)

# # Calculate the average rating for each movie using the train_data
# movie_ratings = train_data.groupby('movieId')['rating'].mean().reset_index()

# # Merge the average ratings with the test_data to get the actual ratings for the test set movies
# testset_actual_ratings = pd.merge(test_data, movie_ratings, on='movieId', how='left')

# # Calculate the RMSE for the average rating-based baseline model
# baseline_rmse = np.sqrt(np.mean((testset_actual_ratings['rating'] - testset_actual_ratings['rating_x'])**2))
# print(f'Average Rating-Based Baseline Model RMSE: {baseline_rmse:.4f}')


In [36]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split the dataset into a train and test set
train_data, test_data = train_test_split(df_merged, test_size=0.2, random_state=42)

# Calculate the average rating for each movie using the train_data
movie_ratings = train_data.groupby('movieId')['rating'].mean().reset_index()

# Merge the average ratings with the test_data to get the actual ratings for the test set movies
testset_actual_ratings = pd.merge(test_data, movie_ratings, on='movieId', how='left', suffixes=('_test', '_avg_rating'))

# Calculate the RMSE for the average rating-based baseline model
baseline_rmse = np.sqrt(np.mean((testset_actual_ratings['rating_test'] - testset_actual_ratings['rating_avg_rating'])**2))
print(f'Average Rating-Based Baseline Model RMSE: {baseline_rmse:.4f}')


Average Rating-Based Baseline Model RMSE: 0.9686


In [37]:
# Sort the movie_ratings DataFrame by average rating in descending order
top_rated_movies = movie_ratings.sort_values(by='rating', ascending=False)

# Get the top 5 movies with the highest average ratings
top_5_movies = top_rated_movies.head(5)

# Display the top 5 recommended movies
print("Top 5 Recommended Movies (Average Rating-Based):")
for index, row in top_5_movies.iterrows():
    movie_id = row['movieId']
    movie_title = df_movies[df_movies['movieId'] == movie_id]['title'].values[0]
    avg_rating = row['rating']
    print(f"{movie_title} (Average Rating: {avg_rating:.2f})")


Top 5 Recommended Movies (Average Rating-Based):
Particle Fever (2013) (Average Rating: 5.00)
Trinity and Sartana Are Coming (1972) (Average Rating: 5.00)
Winnie Pooh (1969) (Average Rating: 5.00)
Winnie the Pooh Goes Visiting (1971) (Average Rating: 5.00)
Winnie the Pooh and the Day of Concern (1972) (Average Rating: 5.00)


In [28]:
# Specify the user ID for whom you want to make recommendations
specific_user_id = 456

# Calculate the average rating for each movie using the train_data
movie_ratings = train_data.groupby('movieId')['rating'].mean().reset_index()

# Filter movies that the specific user has not rated
movies_not_rated_by_user = df_movies[~df_movies['movieId'].isin(train_data[train_data['userId'] == specific_user_id]['movieId'])]

# Merge the average ratings with the movies_not_rated_by_user DataFrame
movie_ratings = pd.merge(movies_not_rated_by_user, movie_ratings, on='movieId', how='left')

# Sort the movies by average rating in descending order
movie_ratings_sorted = movie_ratings.sort_values(by='rating', ascending=False)

# Recommend the top 5 movies to the specific user
top_5_recommendations = movie_ratings_sorted.head(5)

# Display the top 5 recommended movies
print(f"Top 5 Recommended Movies for User {specific_user_id} (Average Rating-Based):")
for index, row in top_5_recommendations.iterrows():
    movie_title = row['title']
    avg_rating = row['rating']
    print(f"{movie_title} (Average Rating: {avg_rating:.2f})")


Top 5 Recommended Movies for User 456 (Average Rating-Based):
Gena the Crocodile (1969) (Average Rating: 5.00)
Winter in Prostokvashino (1984) (Average Rating: 5.00)
Bill Hicks: Revelations (1993) (Average Rating: 5.00)
Crossing Delancey (1988) (Average Rating: 5.00)
American Friend, The (Amerikanische Freund, Der) (1977) (Average Rating: 5.00)
