In [1]:
import pandas as pd

Importing the data 

In [2]:
ratings_df = pd.read_csv('ratings.csv')
tags_df = pd.read_csv('tags.csv')
movies_df = pd.read_csv('movies.csv')
links_df = pd.read_csv('links.csv')

Display the first 5 rows of each DataFrame

In [3]:

print("First 5 rows of ratings_df:")
print(ratings_df.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\nFirst 5 rows of tags_df:")
print(tags_df.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\nFirst 5 rows of movies_df:")
print(movies_df.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\nFirst 5 rows of links_df:")
print(links_df.head().to_markdown(index=False, numalign="left", stralign="left"))

First 5 rows of ratings_df:
| userId   | movieId   | rating   | timestamp   |
|:---------|:----------|:---------|:------------|
| 1        | 1         | 4        | 9.64983e+08 |
| 1        | 3         | 4        | 9.64981e+08 |
| 1        | 6         | 4        | 9.64982e+08 |
| 1        | 47        | 5        | 9.64984e+08 |
| 1        | 50        | 5        | 9.64983e+08 |

First 5 rows of tags_df:
| userId   | movieId   | tag             | timestamp   |
|:---------|:----------|:----------------|:------------|
| 2        | 60756     | funny           | 1445714994  |
| 2        | 60756     | Highly quotable | 1445714996  |
| 2        | 60756     | will ferrell    | 1445714992  |
| 2        | 89774     | Boxing story    | 1445715207  |
| 2        | 89774     | MMA             | 1445715200  |

First 5 rows of movies_df:
| movieId   | title                              | genres                                      |
|:----------|:-----------------------------------|:---------------------

Get information about the columns in each DataFrame

In [4]:
print("\nratings_df Information:")
print(ratings_df.info())

print("\ntags_df Information:")
print(tags_df.info())

print("\nmovies_df Information:")
print(movies_df.info())

print("\nlinks_df Information:")
print(links_df.info())


ratings_df Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None

tags_df Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB
None

movies_df Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   N

Check for duplicates in each DataFrame

In [5]:
print("\nNumber of duplicate rows in ratings_df:", ratings_df.duplicated().sum())
print("Number of duplicate rows in tags_df:", tags_df.duplicated().sum())
print("Number of duplicate rows in movies_df:", movies_df.duplicated().sum())
print("Number of duplicate rows in links_df:", links_df.duplicated().sum())


Number of duplicate rows in ratings_df: 0
Number of duplicate rows in tags_df: 0
Number of duplicate rows in movies_df: 0
Number of duplicate rows in links_df: 0


according to the above information in the links.csv the tmdbId column has 8 missing values , In tags.csv, the timestamp column is currently of type int64, but it would be more suitable as a datetime type for further analysis. So next we will be doing exactly that.


In [6]:
import numpy as np

Drop rows with missing values in `tmdbId`


In [7]:
links_df.dropna(subset=['tmdbId'], inplace=True)

Convert `timestamp` columns to datetime


In [8]:
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
tags_df['timestamp'] = pd.to_datetime(tags_df['timestamp'], unit='s')

We will process the genres column in the movies_df DataFrame. Since the genres are pipe-separated, we'll split them into individual genres (eg . e.g., "Action|Adventure" into "Action", "Adventure") and create a new DataFrame where each row represents a movie and a single genre associated with it.

In [9]:
# Split genres into separate columns
split_genres_df = movies_df['genres'].str.split('|', expand=True)

# Stack the columns to create a Series of genres
stacked_genres = split_genres_df.stack()

# Reset index and rename columns
stacked_genres_df = stacked_genres.reset_index()
stacked_genres_df.columns = ['movieId', 'genre_index', 'genre']

# Merge with movies_df to get titles
movies_with_genres_df = pd.merge(movies_df[['movieId', 'title']], stacked_genres_df, on='movieId')

# Drop the unnecessary column 'genre_index'
movies_with_genres_df = movies_with_genres_df.drop('genre_index', axis=1)

# Display the first 10 rows
print(movies_with_genres_df.head(10).to_markdown(index=False, numalign="left", stralign="left"))

| movieId   | title                              | genre     |
|:----------|:-----------------------------------|:----------|
| 1         | Toy Story (1995)                   | Adventure |
| 1         | Toy Story (1995)                   | Children  |
| 1         | Toy Story (1995)                   | Fantasy   |
| 2         | Jumanji (1995)                     | Comedy    |
| 2         | Jumanji (1995)                     | Romance   |
| 3         | Grumpier Old Men (1995)            | Comedy    |
| 3         | Grumpier Old Men (1995)            | Drama     |
| 3         | Grumpier Old Men (1995)            | Romance   |
| 4         | Waiting to Exhale (1995)           | Comedy    |
| 5         | Father of the Bride Part II (1995) | Action    |


Build the Recommendation Engines

A. For Registered Users (Collaborative Filtering)

In [10]:
from sklearn.decomposition import TruncatedSVD
import pickle

# Aggregate ratings by userId and movieId, compute mean rating, and create pivot table
ratings_pivot = ratings_df.groupby(['userId', 'movieId'])['rating'].mean().reset_index().pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Create ratings matrix
ratings_matrix = ratings_pivot.values

# Apply Truncated SVD for matrix factorization
svd = TruncatedSVD(n_components=50)
user_factors = svd.fit_transform(ratings_matrix)
movie_factors = svd.components_
with open('model.pkl', 'wb') as file:
    pickle.dump(svd, file)


# Predict ratings for all user-movie pairs
predicted_ratings = user_factors.dot(movie_factors)

predicted_ratings

array([[ 2.28551015e+00,  2.94069336e-01,  1.27816971e+00, ...,
        -1.01902586e-02, -1.01902586e-02, -1.00039885e-01],
       [ 2.05232271e-01,  3.08313789e-02, -3.78895709e-03, ...,
         1.76256320e-02,  1.76256320e-02,  3.33952550e-02],
       [ 3.14101072e-02,  4.70936496e-03,  4.96997508e-02, ...,
        -3.09964464e-04, -3.09964464e-04,  2.17520775e-04],
       ...,
       [ 1.03277129e+00,  2.75679615e+00,  2.02183655e+00, ...,
        -1.71291141e-02, -1.71291141e-02,  1.24293597e-01],
       [ 8.76653451e-01,  5.05029719e-01,  8.30801406e-02, ...,
         8.45151857e-04,  8.45151857e-04, -3.20836172e-03],
       [ 5.28077380e+00, -1.29251414e-01, -9.75370914e-02, ...,
        -2.68716411e-02, -2.68716411e-02,  1.86380831e-02]])

In [11]:
def get_movie_recommendations(user_id, movies_with_genres_df, N=30):

    user_idx = ratings_pivot.index.get_loc(user_id)
    unrated_movies = ratings_pivot.loc[user_id][ratings_pivot.loc[user_id] == 0].index - 1
    valid_unrated_movies = list(set(unrated_movies).intersection(set(range(len(predicted_ratings[user_idx])))))
    predicted_ratings_for_user = predicted_ratings[user_idx][valid_unrated_movies]
    top_movie_indices = predicted_ratings_for_user.argsort()[::-1][:N]
    recommended_movie_ids = [valid_unrated_movies[i] + 1 for i in top_movie_indices]

    # Create DataFrame of recommendations
    recommendations_df = pd.DataFrame({'movieId': recommended_movie_ids,
                                       'predicted_rating': [predicted_ratings_for_user[i] for i in top_movie_indices]})

    # Merge with movies_df to get titles and genres
    recommendations_df = pd.merge(recommendations_df, movies_df[['movieId', 'title']], on='movieId')

    # Group by movieId and predicted_rating, aggregate genres
    recommendations_df = recommendations_df.groupby(['movieId', 'predicted_rating'])['title'].apply(lambda x: '|'.join(x)).reset_index()

    # Merge again with movies_df to get all genres for each movie
    recommendations_df = pd.merge(recommendations_df, movies_df[['movieId', 'genres']], on='movieId' )

    # Sort by predicted rating
    recommendations_df = recommendations_df.sort_values(by='predicted_rating', ascending=False)

    return recommendations_df



In [12]:

user_id = 600
recommendations = get_movie_recommendations(user_id, movies_with_genres_df)
print(f"\nRecommendations for User {user_id}:\n")
print(recommendations[['movieId', 'title', 'genres', 'predicted_rating']].round(2).to_markdown(index=False, numalign="left", stralign="left"))


Recommendations for User 600:

| movieId   | title                                       | genres                                    | predicted_rating   |
|:----------|:--------------------------------------------|:------------------------------------------|:-------------------|
| 44        | Mortal Kombat (1995)                        | Action|Adventure|Fantasy                  | 6.18               |
| 3634      | Seven Days in May (1964)                    | Thriller                                  | 6.1                |
| 907       | Gay Divorcee, The (1934)                    | Comedy|Musical|Romance                    | 5.72               |
| 1997      | Exorcist, The (1973)                        | Horror|Mystery                            | 5.46               |
| 1487      | Selena (1997)                               | Drama|Musical                             | 5.44               |
| 218       | Boys on the Side (1995)                     | Comedy|Drama                     

the next lines of code are  primarily focused on training a Singular Value Decomposition (SVD) model on movie rating data, evaluating its performance using Root Mean Square Error (RMSE), and preparing predicted ratings for later use in a recommendation system.

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import numpy as np

In [14]:

# Data Splitting and Preparation

train_ratings, test_ratings = train_test_split(ratings_df, test_size=0.2, stratify=ratings_df['userId'], random_state=42)

train_pivot = train_ratings.groupby(['userId', 'movieId'])['rating'].mean().reset_index().pivot(index='userId', columns='movieId', values='rating').fillna(0)
test_pivot = test_ratings.groupby(['userId', 'movieId'])['rating'].mean().reset_index().pivot(index='userId', columns='movieId', values='rating').fillna(0)

common_movie_ids = train_pivot.columns.intersection(test_pivot.columns)
train_pivot_filtered = train_pivot[common_movie_ids]
test_pivot_filtered = test_pivot[common_movie_ids]

# Re-create train and test matrices
train_matrix_filtered = train_pivot_filtered.values
test_matrix_filtered = test_pivot_filtered.values


In [15]:
# SVD Model Training and Prediction 
svd = TruncatedSVD(n_components=50)
svd.fit(train_matrix_filtered)

# Predict ratings on filtered test data
predicted_ratings = svd.transform(test_matrix_filtered) @ svd.components_

predicted_ratings_reshaped = predicted_ratings.reshape(test_pivot_filtered.shape)


In [16]:
# Data Formatting for Evaluation

test_df = test_pivot_filtered.stack().reset_index()
test_df.columns = ['userId', 'movieId', 'rating']

user_ids = []
movie_ids = []
predicted_ratings = []

# Iterate over rows and columns of test_pivot_filtered
for user_idx, user_id in enumerate(test_pivot_filtered.index):
    for movie_idx, movie_id in enumerate(test_pivot_filtered.columns):
        user_ids.append(user_id)
        movie_ids.append(movie_id)
        predicted_ratings.append(predicted_ratings_reshaped[user_idx, movie_idx])

# Create DataFrame for predicted ratings
predictions_df = pd.DataFrame({'userId': user_ids, 'movieId': movie_ids, 'predicted_rating': predicted_ratings})

# Merge test and predicted ratings
merged_df = pd.merge(test_df, predictions_df, on=['userId', 'movieId'])

In [17]:

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(merged_df['rating'], merged_df['predicted_rating']))


print(f"RMSE: {rmse:.4f}")



RMSE: 0.2928


The RMSE of the SVD model on the test set is 0.2927. This indicates that, on average, the model's predictions are about 0.29 units away from the actual user ratings. Given that the ratings are on a scale of 1 to 5, this RMSE suggests that the model is performing quite well.

In [18]:
def get_user_genre_preferences():
    print("Please select your preferred genres (type 'done' when finished):")
    all_genres = movies_with_genres_df['genre'].unique()
    for i, genre in enumerate(all_genres):
        print(f"{i+1}. {genre}")

    selected_genres = []
    while True:
        choice = input("Enter genre number or 'done': ")
        if choice.lower() == 'done':
            break
        try:
            genre_index = int(choice) - 1
            if 0 <= genre_index < len(all_genres):
                selected_genres.append(all_genres[genre_index-1])
            else:
                print("Invalid genre number.")
        except ValueError:
            print("Invalid input.")

    return selected_genres

In [19]:
def get_movie_recommendations_for_new_user(selected_genres, movies_with_genres_df, N=20):
    

    # Filter movies by selected genres
    filtered_movies = movies_with_genres_df[movies_with_genres_df['genre'].isin(selected_genres)]

    # Aggregate ratings for filtered movies (Corrected line)
    filtered_ratings = pd.merge(filtered_movies, ratings_df, on='movieId')  # Merge before grouping
    average_ratings = filtered_ratings.groupby('movieId')['rating'].mean()

    # Get top N movies based on average ratings
    top_movies = average_ratings.sort_values(ascending=False).head(N)

    # Create DataFrame of recommendations
    recommendations_df = pd.DataFrame({'movieId': top_movies.index, 'average_rating': top_movies.values})

    # Merge with movies_df to get titles and genres
    recommendations_df = pd.merge(recommendations_df, movies_df[['movieId', 'title', 'genres']], on='movieId')

    return recommendations_df

In [20]:
while True:
    user_input = input("Enter user ID (or 'new' for a new user): ")
    if user_input.lower() == 'new':
        selected_genres = get_user_genre_preferences()
        recommendations = get_movie_recommendations_for_new_user(selected_genres, movies_with_genres_df)
        print("\nMovie recommendations for you:\n")
    else:
        try:
            user_id = int(user_input)
            if user_id in ratings_pivot.index:
                recommendations = get_movie_recommendations(user_id, movies_with_genres_df)
                print(f"\nRecommendations for User {user_id}:\n")
            else:
                print("Invalid user ID.")
                continue  # Ask for input again
        except ValueError:
            print("Invalid input.")
            continue
    
    #Display the recommendations
    print(recommendations[['movieId', 'title', 'genres', 'predicted_rating' if 'predicted_rating' in recommendations.columns else 'average_rating']].round(2).to_markdown(index=False, numalign="left", stralign="left"))

    another_recommendation = input("Do you want another recommendation? (yes/no): ")
    if another_recommendation.lower() != 'yes':
        break

Please select your preferred genres (type 'done' when finished):
1. Adventure
2. Children
3. Fantasy
4. Comedy
5. Romance
6. Drama
7. Action
8. Crime
9. Thriller
10. Horror
11. Animation
12. Mystery
13. Sci-Fi
14. Musical
15. War
16. Documentary
17. IMAX
18. Film-Noir
19. Western
20. (no genres listed)

Movie recommendations for you:

| movieId   | title                                                   | genres                       | average_rating   |
|:----------|:--------------------------------------------------------|:-----------------------------|:-----------------|
| 4495      | Crossing Delancey (1988)                                | Comedy|Romance               | 5                |
| 99        | Heidi Fleiss: Hollywood Madam (1995)                    | Documentary                  | 5                |
| 5513      | Martin Lawrence Live: Runteldat (2002)                  | Comedy|Documentary           | 5                |
| 5490      | The Big Bus (1976)                     