Importing Libraries

In [151]:
import pandas as pd
import numpy as np

## Making of the user genre matrix 

Loading the data 

In [200]:
ratings_df = pd.read_csv('ratings.dat', sep='::', engine='python', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='ISO-8859-1')
users_df = pd.read_csv('users.dat', sep='::', engine='python', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='ISO-8859-1')
movies_df = pd.read_csv('movies.dat', sep='::', engine='python', header=None, names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1')

In [201]:
movies_df.shape

(3883, 3)

In [202]:
ratings_df


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


Making of User Gnere Matrix 

In [203]:
# Merge ratings and movies data on MovieID
merged_df = pd.merge(ratings_df, movies_df, on='MovieID')
merged_df1 = merged_df.assign(Genres=merged_df['Genres'].str.split('|')).explode('Genres').reset_index(drop=True)
ratings = merged_df1.pivot_table(values='Rating', index='UserID', columns='Genres', aggfunc='mean')
ratings.head()

Genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,4.2,4.0,4.111111,4.25,4.142857,4.0,,4.428571,4.0,,,4.285714,,3.666667,4.333333,3.666667,5.0,
2,3.5,3.736842,,,3.56,3.583333,,3.898734,3.0,4.0,3.0,,3.333333,3.708333,3.588235,3.483871,3.733333,4.333333
3,3.956522,4.0,4.0,4.0,3.766667,,,4.0,4.5,,2.666667,4.0,3.0,3.8,3.833333,3.8,4.0,4.666667
4,4.157895,3.833333,,4.0,,5.0,,4.166667,4.5,,4.333333,,,4.0,3.555556,3.5,3.333333,4.5
5,2.612903,3.0,4.0,3.833333,3.410714,3.285714,3.666667,3.096154,,4.0,2.8,3.333333,3.125,3.1,3.066667,2.846154,3.5,4.0


In [204]:
ratings.shape

(6040, 18)

In [19]:
ratings_array = ratings.to_numpy()
#Replace NaN values with 0
ratings_array[np.isnan(ratings_array)] = 0
print(ratings_array)

[[4.2        4.         4.11111111 ... 3.66666667 5.         0.        ]
 [3.5        3.73684211 0.         ... 3.48387097 3.73333333 4.33333333]
 [3.95652174 4.         4.         ... 3.8        4.         4.66666667]
 ...
 [3.         4.         3.66666667 ... 0.         4.         0.        ]
 [4.         4.1        3.61538462 ... 4.14285714 4.11111111 4.5       ]
 [2.97619048 2.81818182 3.         ... 3.92682927 3.69565217 4.        ]]


In [153]:
# Save the array to a CSV file
np.savetxt('ratings_array.csv', ratings_array, delimiter=',', fmt='%d')

In [46]:
sparsity = np.sum(np.isnan(ratings_array) | (ratings_array == 0)) / ratings_array.size
sparsity_percentage = sparsity * 100

print("Sparsity: {:.2f}%".format(sparsity_percentage))


Sparsity: 13.64%


## Making of User_similarity Matrix 

In [37]:
def pearson_similarity(user1, user2):
    # Calculate mean for each user
    mean_user1 = np.mean(user1)
    mean_user2 = np.mean(user2)

    # Subtract mean from ratings
    adjusted_user1 = user1 - mean_user1
    adjusted_user2 = user2 - mean_user2

    # Calculate Pearson correlation coefficient
    numerator = np.sum(adjusted_user1 * adjusted_user2)
    denominator = np.sqrt(np.sum(adjusted_user1**2) * np.sum(adjusted_user2**2))

    # Avoid division by zero
    if denominator == 0:
        return 0

    similarity = numerator / denominator
    return similarity

In [38]:
def calculate_user_similarity(user_genre_matrix):
    num_users = user_genre_matrix.shape[0]
    user_similarity_matrix = np.zeros((num_users, num_users))

    for i in range(num_users):
        for j in range(i + 1, num_users):
            # Calculate Pearson correlation coefficient between user i and user j
            similarity = pearson_similarity(user_genre_matrix[i, :], user_genre_matrix[j, :])

            # Store the correlation coefficient in both symmetric positions
            user_similarity_matrix[i, j] = similarity
            user_similarity_matrix[j, i] = similarity

    # Set diagonal elements to 1, as a user is perfectly correlated with themselves
    np.fill_diagonal(user_similarity_matrix, 1)

    return user_similarity_matrix

In [39]:
user_similarity_matrix = calculate_user_similarity(ratings_array)

print("User Similarity Matrix:")
print(user_similarity_matrix)

User Similarity Matrix:
[[ 1.         -0.0585156   0.48528202 ...  0.5429454   0.2157542
  -0.30879632]
 [-0.0585156   1.          0.07193115 ...  0.10728314  0.60805073
  -0.17778243]
 [ 0.48528202  0.07193115  1.         ...  0.44675739  0.41339611
  -0.36576854]
 ...
 [ 0.5429454   0.10728314  0.44675739 ...  1.          0.17972952
  -0.62211261]
 [ 0.2157542   0.60805073  0.41339611 ...  0.17972952  1.
  -0.31988178]
 [-0.30879632 -0.17778243 -0.36576854 ... -0.62211261 -0.31988178
   1.        ]]


In [154]:
# Save the array to a CSV file
np.savetxt('user_similarity_matrix.csv', user_similarity_matrix, delimiter=',', fmt='%d')

## For Predicting 

In [174]:
def get_similar_users(user_id, user_similarity_matrix, k=10):
    user_index = user_id - 1  # Assuming user IDs start from 1
    similar_users_indices = np.argsort(user_similarity_matrix[user_index, :])[::-1][1:k+1]
    
    return similar_users_indices + 1  # Adding 1 to convert back to 1-based user IDs


In [175]:
def create_user_movie_matrix(df):
    # Get unique user and movie IDs
    unique_users = df['UserID'].unique()
    unique_movies = df['MovieID'].unique()

    # Create a user-movie matrix filled with zeros
    user_movie_matrix = np.zeros((len(unique_users), len(unique_movies)))

    # Map user and movie IDs to matrix indices
    user_id_to_index = {user_id: i for i, user_id in enumerate(unique_users)}
    movie_id_to_index = {movie_id: j for j, movie_id in enumerate(unique_movies)}

    # Fill in the matrix with ratings
    for _, row in df.iterrows():
        user_index = user_id_to_index[row['UserID']]
        movie_index = movie_id_to_index[row['MovieID']]
        user_movie_matrix[user_index, movie_index] = row['Rating']

    return user_movie_matrix, user_id_to_index, movie_id_to_index

User movie matrix 

In [43]:
# Example usage
user_movie_matrix, user_id_to_index, movie_id_to_index = create_user_movie_matrix(ratings_df)

print("User-Movie Matrix:")
print(user_movie_matrix)

User-Movie Matrix:
[[5. 3. 3. ... 0. 0. 0.]
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]]


In [155]:
# Save the array to a CSV file
np.savetxt('user_movie_matrix.csv', user_movie_matrix, delimiter=',', fmt='%d')
user_movie_matrix.shape

(6040, 3706)

In [234]:
def get_movies_watched(user_id, user_movie_matrix):
    # Assuming user IDs start from 1
    user_index = user_id - 1

    # Get the non-zero indices in the row corresponding to the user
    watched_indices = np.nonzero(user_movie_matrix[user_index, :])[0]

    # Assuming the movies are represented by their indices
    return list(watched_indices + 1)  # Adding 1 to convert to 1-based movie IDs


Function to return the movies watched by rthe similar users which the given user have not watched 

In [176]:
def get_unwatched_movies(user_id, user_movie_matrix, similar_users, movies_watched):
    # Get the movies watched by the target user
    target_user_movies = set(movies_watched)

    # Get the movies watched by similar users
    similar_users_movies = set()
    for similar_user_id in similar_users:
        similar_users_movies.update(np.where(user_movie_matrix[similar_user_id - 1, :] > 0)[0] + 1)

    # Find the movies that similar users have watched but the target user has not
    unwatched_movies = similar_users_movies - target_user_movies

    return list(unwatched_movies)


In [177]:
def predict_ratings_for_unwatched_movies(user_id, unwatched_movies, user_movie_matrix, user_similarity_matrix, similar_users):
    # Get the index for the target user and similar users
    user_index = user_id - 1
    similar_users_indices = np.array(similar_users) - 1

    # Get the ratings for unwatched movies by similar users
    ratings_by_similar_users = user_movie_matrix[similar_users_indices[:, np.newaxis], np.array(unwatched_movies) - 1]
    #print("shape of ratings_by_similar_users:",ratings_by_similar_users.shape)  (10,350)

    # Transpose the ratings_by_similar_users array
    #ratings_by_similar_users = ratings_by_similar_users.T

    # Get the similarity weights for similar users
    similarity_weights = user_similarity_matrix[user_index, similar_users_indices]
    #print("similarity_weights:",similarity_weights)

    # Predict the ratings for unwatched movies using the weighted sum formula
    mean_user_rating = np.mean(user_movie_matrix[user_index, :])
    #predicted_ratings = mean_user_rating + np.sum(similarity_weights * ratings_by_similar_users, axis=0) / np.sum(np.abs(similarity_weights))
    predicted_ratings = []
    for movie_id in unwatched_movies:
        similar_user_ratings = ratings_by_similar_users[:, unwatched_movies.index(movie_id)]
        predicted_rating = mean_user_rating + np.sum(similarity_weights * similar_user_ratings) / np.sum(np.abs(similarity_weights))
        predicted_ratings.append(predicted_rating)
        
    # Normalize the predicted ratings to be within the range of 1 to 5
    min_rating, max_rating = 1, 5
    predicted_ratings = min_rating + (max_rating - min_rating) * (predicted_ratings - np.min(predicted_ratings)) / (np.max(predicted_ratings) - np.min(predicted_ratings))
    #print("shape of predicted_ratings:",predicted_ratings.shape)
    
    # Create a dictionary with movie IDs and their predicted ratings
    predictions = dict(zip(unwatched_movies, predicted_ratings))
    #print("shape of predictions:",predictions)

    return predictions


In [178]:
def get_top_10_rated_movies(movie_ratings):
    
  # Sort the dictionary by rating in descending order
  sorted_movies = sorted(movie_ratings.items(), key=lambda item: item[1], reverse=True)

  # Get the top 10 rated movies
  top_10_movies = sorted_movies[:10]

  # Extract the movie IDs from the top 10 rated movies
  top_10_movie_ids = [movie_id for movie_id, _ in top_10_movies]

  return top_10_movie_ids

In [179]:
# Create the movie_id_to_name dictionary
movie_id_to_name = dict(zip(movies_df['MovieID'], movies_df['Title']))

## Giving the movie reccomendation for a given user 

replace the user_id value with the user id you want to predict ratings for 

In [239]:

user_id = 28  # Replace with the target user ID

similar_users = get_similar_users(user_id, user_similarity_matrix, k=10)
#print("similar users :",similar_users)

movies_watched = get_movies_watched(user_id, user_movie_matrix) 
#print("movies watched:",movies_watched)

unwatched_movies = get_unwatched_movies(user_id, user_movie_matrix, similar_users, movies_watched)  # Replace with the list of unwatched movie IDs
#print("unwatched_movies:",movies_watched)

predictions = predict_ratings_for_unwatched_movies(user_id, unwatched_movies, user_movie_matrix, user_similarity_matrix, similar_users)

top_10_movie_ids = get_top_10_rated_movies(predictions)

print("Top 10 rated movies:")
for movie_id in top_10_movie_ids:
    predicted_rating = predictions.get(movie_id, "Not Predicted")
    movie_name = movie_id_to_name.get(movie_id, "Unknown Movie")

    # Print the movie name and predicted rating
    print(f"- Movie Name: {movie_name.ljust(40)} | Predicted Rating: {predicted_rating:.2f}")



Top 10 rated movies:
- Movie Name: Assassins (1995)                         | Predicted Rating: 5.00
- Movie Name: Brothers in Trouble (1995)               | Predicted Rating: 4.38
- Movie Name: Shopping (1994)                          | Predicted Rating: 4.29
- Movie Name: Juror, The (1996)                        | Predicted Rating: 4.28
- Movie Name: When Night Is Falling (1995)             | Predicted Rating: 4.28
- Movie Name: Addiction, The (1995)                    | Predicted Rating: 4.28
- Movie Name: Vampire in Brooklyn (1995)               | Predicted Rating: 4.28
- Movie Name: Captives (1994)                          | Predicted Rating: 4.08
- Movie Name: Crimson Tide (1995)                      | Predicted Rating: 3.97
- Movie Name: Johnny Mnemonic (1995)                   | Predicted Rating: 3.97


## Giving Movie reccommendation for a new user 

Taking input from user 

In [337]:
# Take user input for user ID
user_id = int(input("Enter User ID: "))

# Take the number of movies as input
num_movies = int(input("Enter the number of movies you want to rate: "))

# Create empty lists to store movie names and ratings
movie_names = []
ratings_new = []

# Use a loop to take input for each movie
for _ in range(num_movies):
    movie_name = input("Enter movie name: ")
    
    # Check if the entered movie name is in the DataFrame
    if movie_name in movies_df['Title'].values:
        rating = float(input(f"Enter rating for {movie_name}: "))
        movie_names.append(movie_name)
        ratings_new.append(rating)
    else:
        print(f"{movie_name} not found in the movie database. Please enter a valid movie name.")

# Create a DataFrame with user input
user_input_data = {'UserID': [user_id] * len(movie_names),
                   'MovieID': movies_df[movies_df['Title'].isin(movie_names)]['MovieID'].tolist(),
                   'Rating': ratings_new}

user_input_df = pd.DataFrame(user_input_data)

# Display the resulting DataFrame
print(user_input_df)

Enter User ID: 6041
Enter the number of movies you want to rate: 2
Enter movie name: Things to Do in Denver when You're Dead (1995)
Enter rating for Things to Do in Denver when You're Dead (1995): 4.5
Enter movie name: Angels and Insects (1995)
Enter rating for Angels and Insects (1995): 3
   UserID  MovieID  Rating
0    6041       81     4.5
1    6041       85     3.0


In [338]:
genres_list = ['Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Merge new user ratings with movies data
new_user_merged_df = pd.merge(user_input_df, movies_df, left_on='MovieID', right_on='MovieID')

# Explode genres and create a pivot table for the new user
new_user_genres_df = new_user_merged_df.assign(Genres=new_user_merged_df['Genres'].str.split('|')).explode('Genres').reset_index(drop=True)
new_user_ratings_pivot = new_user_genres_df.pivot_table(values='Rating', index='UserID', columns='Genres', aggfunc='mean')

# Fill NaN values with 0
new_user_ratings_pivot = new_user_ratings_pivot.fillna(0)

column_names_array = np.array(new_user_ratings_pivot.columns)

# Create an empty list to store the values
result_values = []

# Iterate through genres_list
for genre in genres_list:
    # Check if the genre is present in column_names_array
    if genre in column_names_array:
        # If present, append the corresponding value from the DataFrame
        result_values.append(new_user_ratings_pivot[genre].values[0])
    else:
        result_values.append(0)
        
new_user_genere_matrix = np.array(result_values)
user_genre_matrix = ratings_array

In [339]:
new_user_genere_matrix.shape

(18,)

In [340]:
num_users = user_genre_matrix.shape[0]
new_user_similarity_vector = np.zeros((1, num_users))

for i in range(num_users):
    # Calculate Pearson correlation coefficient between user i and user j
    similarity = pearson_similarity(user_genre_matrix[i, :],new_user_genere_matrix)
    # Store the correlation coefficient in both symmetric positions
    new_user_similarity_vector[0, i] = similarity

final_user_similarity_matrix = np.vstack([user_similarity_matrix, new_user_similarity_vector])

In [341]:
final_user_similarity_matrix.shape

(6041, 6040)

In [342]:
# Get unique MovieIDs from user_input_df
user_movie_ids = user_input_df['MovieID'].unique()

# Create a new DataFrame with all MovieIDs and merge it with user_input_df
all_movies_df = pd.DataFrame({'MovieID': movies_df['MovieID']})
user_input_with_zeros = pd.merge(all_movies_df, user_input_df, on=['MovieID'], how='left').fillna(0)

# Create a vector for the new user with ratings for specific movies and 0 for others
new_user_movie_vector = user_input_with_zeros.sort_values(by='MovieID')['Rating'].values

# Trim the new user vector to match the number of movies in the existing user-movie matrix
new_user_movie_vector = new_user_movie_vector[:user_movie_matrix.shape[1]]

final_user_movie_matrix = np.vstack([user_movie_matrix, new_user_movie_vector])

In [343]:
final_user_movie_matrix.shape

(6041, 3706)

In [344]:
user_id = user_input_df['UserID'].iloc[0]

similar_users = get_similar_users(user_id, final_user_similarity_matrix, k=10)
#print("similar users :",similar_users)

movies_watched = get_movies_watched(user_id, final_user_movie_matrix) 
#print("movies watched:",movies_watched)

unwatched_movies = get_unwatched_movies(user_id, final_user_movie_matrix, similar_users, movies_watched)  # Replace with the list of unwatched movie IDs
#print("unwatched_movies:",movies_watched)

predictions = predict_ratings_for_unwatched_movies(user_id, unwatched_movies, final_user_movie_matrix, final_user_similarity_matrix, similar_users)

top_10_movie_ids = get_top_10_rated_movies(predictions)

print("Top 10 rated movies:")
for movie_id in top_10_movie_ids:
    predicted_rating = predictions.get(movie_id, "Not Predicted")
    movie_name = movie_id_to_name.get(movie_id, "Unknown Movie")

    # Print the movie name and predicted rating
    print(f"- Movie Name: {movie_name.ljust(40)} | Predicted Rating: {predicted_rating:.2f}")

Top 10 rated movies:
- Movie Name: Bridges of Madison County, The (1995)    | Predicted Rating: 5.00
- Movie Name: Mighty Aphrodite (1995)                  | Predicted Rating: 3.90
- Movie Name: Pie in the Sky (1995)                    | Predicted Rating: 3.57
- Movie Name: Clueless (1995)                          | Predicted Rating: 3.54
- Movie Name: Once Upon a Time... When We Were Colored (1995) | Predicted Rating: 3.44
- Movie Name: Coneheads (1993)                         | Predicted Rating: 3.09
- Movie Name: Ransom (1996)                            | Predicted Rating: 3.09
- Movie Name: Net, The (1995)                          | Predicted Rating: 3.02
- Movie Name: I, Worst of All (Yo, la peor de todas) (1990) | Predicted Rating: 2.94
- Movie Name: Glass Shield, The (1994)                 | Predicted Rating: 2.84


## END 