In [1]:
import pandas as pd

# Load the datasets
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

# Display the first few rows of each dataset
print("Movies dataset:")
print(movies.head(), "\n")

print("Ratings dataset:")
print(ratings.head(), "\n")

print("Tags dataset:")
print(tags.head())

Movies dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy   

Ratings dataset:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931 

Tags dataset:
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quo

In [3]:
# Merge ratings and movies on 'movieId'
data = pd.merge(ratings, movies, on='movieId', how='inner')

# Display the first few rows of the merged dataset
print("Merged dataset:")
print(data.head())

Merged dataset:
   userId  movieId  rating   timestamp             title  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1  Adventure|Animation|Children|Comedy|Fantasy  
2  Adventure|Animation|Children|Comedy|Fantasy  
3  Adventure|Animation|Children|Comedy|Fantasy  
4  Adventure|Animation|Children|Comedy|Fantasy  


In [4]:
# Extract the release year from the title using regex
data['year'] = data['title'].str.extract(r'\((\d{4})\)')  # Extract year in parentheses


In [5]:
data['year'] = pd.to_numeric(data['year'], errors='coerce')  # Convert to numeric



In [6]:
# Check for missing values and drop rows with missing data
data = data.dropna()

# Display the first few rows of the updated dataset
print("Dataset after extracting year and handling missing values:")
print(data.head())

Dataset after extracting year and handling missing values:
   userId  movieId  rating   timestamp             title  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                        genres    year  
0  Adventure|Animation|Children|Comedy|Fantasy  1995.0  
1  Adventure|Animation|Children|Comedy|Fantasy  1995.0  
2  Adventure|Animation|Children|Comedy|Fantasy  1995.0  
3  Adventure|Animation|Children|Comedy|Fantasy  1995.0  
4  Adventure|Animation|Children|Comedy|Fantasy  1995.0  


In [7]:
# Split the genres into individual categories and perform one-hot encoding
genres_onehot = data['genres'].str.get_dummies('|')

# Concatenate the one-hot encoded genres back to the dataset
data = pd.concat([data, genres_onehot], axis=1)

# Display the first few rows of the updated dataset
print("Dataset after one-hot encoding genres:")
print(data.head())


Dataset after one-hot encoding genres:
   userId  movieId  rating   timestamp             title  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                        genres    year  (no genres listed)  \
0  Adventure|Animation|Children|Comedy|Fantasy  1995.0                   0   
1  Adventure|Animation|Children|Comedy|Fantasy  1995.0                   0   
2  Adventure|Animation|Children|Comedy|Fantasy  1995.0                   0   
3  Adventure|Animation|Children|Comedy|Fantasy  1995.0                   0   
4  Adventure|Animation|Children|Comedy|Fantasy  1995.0                   0   

   Action  Adventure  ...  Film-Noir  Horror  IMAX  Musical  Mystery  Romance  \
0       0          1  ...          0       0     0

In [8]:
print(f"Number of unique users: {data['userId'].nunique()}")
print(f"Number of unique movies: {data['movieId'].nunique()}")
# Filter movies with at least 5 ratings
def filter_movies_by_count(data, min_ratings=5, chunk_size=10000):
    movie_rating_counts = data['movieId'].value_counts()
    for i in range(0, len(movie_rating_counts), chunk_size):
        chunk = movie_rating_counts.iloc[i:i+chunk_size]
        chunk_popular_movies = chunk[chunk >= min_ratings].index
        yield data[data['movieId'].isin(chunk_popular_movies)]

# Apply movie filtering incrementally
filtered_movie_data = pd.concat(filter_movies_by_count(data))  # Define filtered_movie_data
print(f"Filtered movies data shape: {filtered_movie_data.shape}")

Number of unique users: 610
Number of unique movies: 9711
Filtered movies data shape: (90274, 27)


In [13]:
 #Create the user-item interaction matrix
user_item_matrix = data.pivot_table(index='userId', columns='movieId', values='rating')

 #Display the first few rows of the matrix
print("User-Item Interaction Matrix:")
print(user_item_matrix.head())


User-Item Interaction Matrix:
movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     NaN     4.0     NaN     NaN     4.0     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN  ...     NaN     NaN

In [14]:
# Fill NaN values with 0 (assuming no interaction/rating)
user_item_matrix = user_item_matrix.fillna(0)

# Display the first few rows of the updated matrix
print("User-Item Interaction Matrix after filling NaN values:")
print(user_item_matrix.head())

User-Item Interaction Matrix after filling NaN values:
movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0    

In [15]:
# Save the user-item interaction matrix to a file for future use
user_item_matrix.to_csv('user_item_matrix.csv', index=True)

print("Preprocessed user-item interaction matrix saved successfully.")

Preprocessed user-item interaction matrix saved successfully.


In [16]:
# Normalize ratings by subtracting the user mean
user_mean = user_item_matrix.mean(axis=1)
user_item_matrix_normalized = user_item_matrix.sub(user_mean, axis=0)

# Fill NaN (caused by subtraction) with 0
user_item_matrix_normalized = user_item_matrix_normalized.fillna(0)

print("User-item matrix after normalization:")
print(user_item_matrix_normalized.head())

User-item matrix after normalization:
movieId    1         2         3         4         5         6         7       \
userId                                                                          
1        3.895685 -0.104315  3.895685 -0.104315 -0.104315  3.895685 -0.104315   
2       -0.011791 -0.011791 -0.011791 -0.011791 -0.011791 -0.011791 -0.011791   
3       -0.009783 -0.009783 -0.009783 -0.009783 -0.009783 -0.009783 -0.009783   
4       -0.079086 -0.079086 -0.079086 -0.079086 -0.079086 -0.079086 -0.079086   
5        3.983524 -0.016476 -0.016476 -0.016476 -0.016476 -0.016476 -0.016476   

movieId    8         9         10      ...    193565    193567    193571  \
userId                                 ...                                 
1       -0.104315 -0.104315 -0.104315  ... -0.104315 -0.104315 -0.104315   
2       -0.011791 -0.011791 -0.011791  ... -0.011791 -0.011791 -0.011791   
3       -0.009783 -0.009783 -0.009783  ... -0.009783 -0.009783 -0.009783   
4       -0.079

In [17]:
# Normalize ratings by subtracting the user mean
user_mean = user_item_matrix.mean(axis=1)
user_item_matrix_normalized = user_item_matrix.sub(user_mean, axis=0)

# Fill NaN (caused by subtraction) with 0
user_item_matrix_normalized = user_item_matrix_normalized.fillna(0)

print("User-item matrix after normalization:")
print(user_item_matrix_normalized.head())

User-item matrix after normalization:
movieId    1         2         3         4         5         6         7       \
userId                                                                          
1        3.895685 -0.104315  3.895685 -0.104315 -0.104315  3.895685 -0.104315   
2       -0.011791 -0.011791 -0.011791 -0.011791 -0.011791 -0.011791 -0.011791   
3       -0.009783 -0.009783 -0.009783 -0.009783 -0.009783 -0.009783 -0.009783   
4       -0.079086 -0.079086 -0.079086 -0.079086 -0.079086 -0.079086 -0.079086   
5        3.983524 -0.016476 -0.016476 -0.016476 -0.016476 -0.016476 -0.016476   

movieId    8         9         10      ...    193565    193567    193571  \
userId                                 ...                                 
1       -0.104315 -0.104315 -0.104315  ... -0.104315 -0.104315 -0.104315   
2       -0.011791 -0.011791 -0.011791  ... -0.011791 -0.011791 -0.011791   
3       -0.009783 -0.009783 -0.009783  ... -0.009783 -0.009783 -0.009783   
4       -0.079