In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
# Skip date
df1 = pd.read_csv('combined_data_1.txt', header = None, names = ['User_Id', 'Rating','Date'])

df1['Rating'] = df1['Rating'].astype(float)

print('Dataset 1 shape: {}'.format(df1.shape))
print('-Dataset examples-')
print(df1.iloc[::5000000, :])#START:STOP:STEP (5000000)

Dataset 1 shape: (24058263, 3)
-Dataset examples-
          User_Id  Rating        Date
0              1:     NaN         NaN
5000000   2560324     4.0  2005-12-06
10000000  2271935     2.0  2005-04-11
15000000  1921803     2.0  2005-01-31
20000000  1933327     3.0  2004-11-10


In [3]:
df1.index = np.arange(0,len(df1))
# get movie count (where rating ==NaN)
movie_count = df1.isnull().sum()[1] #df.isnull() searches for NaN values across all columns, we get in [1] column-rating

# get customer count (amount of unique user_id-movie_id)
user_count = df1['User_Id'].nunique() - movie_count

# get rating count (for ratings not only unique, all values except movies ids)
rating_count = df1['User_Id'].count() - movie_count

movie_count, user_count , rating_count

(4499, 470758, 24053764)

In [4]:
#Movie ID is really a mess import! 
#Looping through dataframe to add Movie ID column WILL make the Kernel run out of memory as it is too inefficient. 
#I achieve my task by first creating a numpy array with correct length 
#then add the whole array as column into the main dataframe!

#WORKING APPROXIMATELY 5 MIN

df_nan = pd.DataFrame(pd.isnull(df1.Rating)) #new df with indexes standard and one column "Rating" with True/False values
#df1.Rating gets indexes+"Rating" column values
#pd.isnull(df1.Rating) gets indexes+"Rating" mask column values (True=NaN, False-otherwise)

df_nan = df_nan[df_nan['Rating'] == True]#new df with old indexes and one column "Rating" with only True values
df_nan = df_nan.reset_index() # adds new indexes, old-moves to new column "index"


 # zip(df_nan['index'][1:],df_nan['index'][:-1]) returns iterator of tuples

movie_np = []
movie_id = 1 #we can start with 1 and sum it in loop,because all movie_id sorted in dataset from 1 to last (increase order)

# i-current movie_id j-previous movie_id , its needed for counting the difference because
#we need to know amount of rows=amount of ratings 
# so (i-j-1)=amount of not null ratings of whole users dataset for one movie (which has movie_id)
for i,j in zip(df_nan['index'][1:],df_nan['index'][:-1]):
    # numpy approach
    temp = np.full((1,i-j-1), movie_id) # 1st arg-shape, second-value add row , because movie_np is array 
    temp
    movie_np = np.append(movie_np, temp) # adds element to the end of array
    movie_id += 1 #we can sum it in loop дшлу +1 ,because all movie_id sorted in dataset from 1 to last (increase order)
    
# Account for last record and corresponding length
# numpy approach
last_record = np.full((1,len(df1) - df_nan.iloc[-1, 0] - 1),movie_id)
movie_np = np.append(movie_np, last_record)
#finally after loop we get array movie_np size=amount of non null ratings, at each index it has corresponding 
#movie_id value for tis rating +can be concatenated with cleaned df
print('Length: {}'.format(len(movie_np)))

Length: 24053764


In [5]:
# remove those Movie ID rows from initial df (rows where rating=NaN)
df1 = df1[pd.notnull(df1['Rating'])]

df1['Movie_Id'] = movie_np.astype(int) #Add movie_id column in df as movie_np received on previous step
df1['User_Id'] = df1['User_Id'].astype(int)
print('-Dataset examples-')
print(df1.iloc[::5000000, :])

-Dataset examples-
          User_Id  Rating        Date  Movie_Id
1         1488844     3.0  2005-09-06         1
5000996    501954     2.0  2004-08-26       996
10001962   404654     5.0  2005-08-29      1962
15002876   886608     2.0  2005-09-19      2876
20003825  1193835     2.0  2003-08-13      3825


In [6]:
# Get one hot encoding of column User_id
#one_hot_users = pd.get_dummies(df1['User_Id'])
#one_hot_users
# Drop column User_Id as it is now encoded
#df1 = df1.drop('User_Id',axis = 1)
# Join the encoded df
#df1 = df1.join(one_hot_users)
#df1  

#This approach with dummies wouldnt work, because dataset is too large

In [7]:
# importing one hot encoder from sklearn 
from sklearn.preprocessing import OneHotEncoder 
encoder = OneHotEncoder()
#X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] 
#X-train matrix for encoder to learn categories
#enc.fit(X)  #process of encoder learning categories
one_hot_user_matrix = encoder.fit_transform(np.asarray(df1['User_Id']).reshape(-1,1)) 
one_hot_movie_matrix = encoder.fit_transform(np.asarray(df1['Movie_Id']).reshape(-1,1)) 
print(one_hot_movie_matrix.shape) #amount of rows=amount of ratings , amount of columns=amount of unique user_ids
#enc.transform([['female', 'from US', 'uses Safari'],['male', 'from Europe', 'uses Safari']]).toarray()
#finally transforming test matrix into one-hot-SLIGHTLY dont understand the output
#array([[1., 0., 0., 1., 0., 1.],[0., 1., 1., 0., 0., 1.]])

#df1 = df1.join(one_hot_user_matrix)
#print(df1.iloc[::5000000, :])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(24053764, 4499)


In [8]:
from scipy.sparse import coo_matrix, hstack
two_groups=hstack([one_hot_user_matrix,one_hot_movie_matrix]) #24053764x475257 matrix joined user_id and movie_id 2 groups
#import gc
#gc.collect()

In [9]:
np.who()

Name             Shape            Bytes            Type

movie_np         24053764         192430112        float64
temp             1 x 269          1076             int32
last_record      1 x 428          1712             int32

Upper bound on total bytes  =       192432900


In [10]:
Nu=1/np.sqrt(one_hot_user_matrix.sum(axis=0))


In [21]:
#import gc
#gc.collect()
#df1.drop('Date', axis=1, inplace=True)
#df_p = pd.pivot_table(df1.astype('float32'),values='Rating',index='User_Id',columns='Movie_Id')

res = df1.groupby(['User_Id', 'Movie_Id'])['Rating'].mean().astype('Sparse[int]')
res
#res.unstack(fill_value=0)
#print(res)

Movie_Id,1,2,3,4,5,6,7,8,9,10,...,4490,4491,4492,4493,4494,4495,4496,4497,4498,4499
User_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649404,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2649409,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2649421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2649426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
print(res)

User_Id  Movie_Id
6        30          3
         157         3
         173         4
         175         5
         191         2
                    ..
2649429  4056        4
         4260        3
         4306        5
         4356        4
         4432        5
Name: Rating, Length: 24053764, dtype: Sparse[int32, 0]


In [29]:
res.unstack(fill_value=0)

#nnz_inds = res.nonzero()
#nnz_inds
#keep = np.array(x.nonzero())[0]
#n_keep = len(keep)
#b = csr_matrix((np.ones(n_keep), (nnz_inds[0][keep], nnz_inds[1][keep])), shape=res.shape)

Movie_Id,1,2,3,4,5,6,7,8,9,10,...,4490,4491,4492,4493,4494,4495,4496,4497,4498,4499
User_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649404,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2649409,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2649421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2649426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
res1=res.sparse.to_coo()

In [40]:
#import gc
#gc.collect()
print(res1)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

