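## Building a user-item rating matrix
- Reshape the ratings data into a matrix with one row per user and one column per movie, where each cell holds that user's rating for that movie.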


In [6]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

In [15]:
# Load the raw ratings table from CSV (path is relative to this notebook's folder).
ratings_csv_path = '../../movie_dataset/ratings_small.csv'
movie_dataset = pd.read_csv(ratings_csv_path)

# Quick sanity check. `.info()` prints its summary as a side effect and evaluates
# to None, so the tuple displayed by this cell is (first two rows, None).
(
    movie_dataset.head(2),
    movie_dataset.info(),
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


(   userId  movieId  rating   timestamp
 0       1       31     2.5  1260759144
 1       1     1029     3.0  1260759179,
 None)

In [16]:
# Order the ratings by user id (ascending) and, within each user, by rating
# (descending), so every user's highest-rated rows come first.
sort_columns = ["userId", "rating"]
sort_ascending = [True, False]
movie_dataset = movie_dataset.sort_values(by=sort_columns, ascending=sort_ascending)

# Display the re-ordered frame.
movie_dataset

Unnamed: 0,userId,movieId,rating,timestamp
4,1,1172,4.0,1260759205
12,1,1953,4.0,1260759191
13,1,2105,4.0,1260759139
8,1,1339,3.5,1260759125
1,1,1029,3.0,1260759179
...,...,...,...,...
99964,671,3481,2.0,1064245565
99967,671,3897,2.0,1063503718
99984,671,5010,2.0,1066793004
99985,671,5218,2.0,1065111990


In [17]:
# Keep the first five rows of every user group. Because movie_dataset was sorted by
# userId (ascending) and rating (descending) beforehand, these are each user's
# five highest-rated movies.
top_movie_df = movie_dataset.groupby(by='userId').head(n=5)

# `.info()` prints its summary and evaluates to None, so the displayed tuple is
# (first two rows, None).
(
    top_movie_df.head(2),
    top_movie_df.info(),
)

<class 'pandas.core.frame.DataFrame'>
Index: 3355 entries, 4 to 99901
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     3355 non-null   int64  
 1   movieId    3355 non-null   int64  
 2   rating     3355 non-null   float64
 3   timestamp  3355 non-null   int64  
dtypes: float64(1), int64(3)
memory usage: 131.1 KB


(    userId  movieId  rating   timestamp
 4        1     1172     4.0  1260759205
 12       1     1953     4.0  1260759191,
 None)

In [18]:
# --- Summary statistics for the per-user top-5 subset ---

# Distinct users and distinct movies present in the subset.
unique_users_count = top_movie_df.userId.nunique()
unique_movies_count = top_movie_df.movieId.nunique()

# Rows per user and rows per movie; value_counts() returns the counts sorted
# from most frequent to least frequent.
users_ratings_count = top_movie_df['userId'].value_counts()
movies_ratings_count = top_movie_df['movieId'].value_counts()

# Report everything in one place.
print(f'There are {unique_users_count} unique users.')
print(f'There are {unique_movies_count} unique movies.')
print(f'Users with higher ratings in descending order is:\n{users_ratings_count}')
print(f'\nMovies with high number of ratings in descending order is:\n{movies_ratings_count}')


There are 671 unique users.
There are 834 unique movies.
Users with higher ratings in descending order is:
userId
1      5
442    5
444    5
445    5
446    5
      ..
226    5
227    5
228    5
229    5
671    5
Name: count, Length: 671, dtype: int64

Movies with high number of ratings in descending order is:
movieId
318     133
260     104
296      92
50       91
356      77
       ... 
373       1
281       1
1361      1
2109      1
317       1
Name: count, Length: 834, dtype: int64


In [19]:
# Build the user-item matrix: one row per userId, one column per movieId, and the
# rating as the cell value. (user, movie) pairs without a rating become NaN.
user_movie_matrix = movie_dataset.pivot(
    values='rating',
    index='userId',
    columns='movieId',
)

# Display the resulting matrix.
user_movie_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,4.0,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,4.0,,,,,,,,,,...,,,,,,,,,,


In [20]:
# Measure how sparse the user-item matrix is: cells without a rating (NaN)
# versus cells that hold a rating.
nan_count = user_movie_matrix.isnull().sum().sum()
non_nan_count = user_movie_matrix.notnull().sum().sum()

# DataFrame.size is rows * columns. It replaces np.product(shape), which was
# deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = user_movie_matrix.size

# Every cell is either NaN or not, so the two counts must add up to the total.
assert nan_count + non_nan_count == total_cells, "NaN / non-NaN counts do not cover the matrix"

print(f'There are {nan_count} NaN values and {non_nan_count} full_values in the matrix out of {total_cells}')

There are 5983282 NaN values and 100004 full_values in the matrix out of 6083286


In [21]:
# NOTE(review): this sum is not referenced by any other visible cell; it looks like
# leftover scratch arithmetic — confirm its purpose or remove the cell.
9035 + 1000

10035

In [22]:
# Persist the user-item matrix to a UTF-8 encoded CSV file in the working directory.
# NOTE(review): index=False leaves the userId row labels out of the file, so rows
# can only be identified by position when it is read back — confirm that downstream
# readers expect this, otherwise write the index as well.
user_movie_matrix.to_csv(
    path_or_buf='user_movie_matrix.csv',
    index=False,
    encoding='utf-8',
)