In [0]:
from datetime import datetime
import random
import numpy as np
import pandas as pd 
import os
from scipy import sparse
from scipy.sparse import csr_matrix 
import warnings 
warnings.filterwarnings('ignore')

In [15]:
# Flatten the four raw rating files into a single CSV, data.csv.
# In the raw files a line ending in ':' names a movie, and every following
# line (until the next such header) is a comma-separated rating for it.
# Each output row is the movie id followed by the fields of the rating line.
start = datetime.now()

files = ['combined_data_1.txt', 'combined_data_2.txt', 'combined_data_3.txt', 'combined_data_4.txt']
movie_id = None  # id taken from the most recent "<movie>:" header line
row = []

# `with` guarantees data.csv is flushed and closed even if reading fails midway.
with open('data.csv', mode='w') as data:
    for file in files:
        print("Reading ratings from {}".format(file))
        with open(file) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line would otherwise be written as a malformed "movie," row.
                    continue
                if line.endswith(':'):
                    # All below are ratings for this movie, until another movie appears.
                    movie_id = line.replace(':', '')
                else:
                    if movie_id is None:
                        # Fail with a clear message instead of a NameError.
                        raise ValueError(
                            "{}: rating line found before any movie header: {!r}".format(file, line))
                    row = [movie_id] + line.split(',')
                    data.write(','.join(row))
                    data.write('\n')
        print("Done.\n")
print('Time taken :', datetime.now() - start)

Reading ratings from combined_data_1.txt
Done.

Reading ratings from combined_data_2.txt
Done.

Reading ratings from combined_data_3.txt
Done.

Reading ratings from combined_data_4.txt
Done.

Time taken : 0:02:59.578168


In [18]:
# Load the merged ratings. `names=` supplies the column labels, so the first
# line of the file is read as data rather than as a header.
column_names = ['movie', 'user', 'rating', 'date']
df = pd.read_csv('data.csv', sep=',', names=column_names)

# Convert the date column to datetime, then order the ratings by date.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date')
df.head()

Unnamed: 0,movie,user,rating,date
56431994,10341,510180,4,1999-11-11
9056171,1798,510180,5,1999-11-11
58698779,10774,510180,3,1999-11-11
48101611,8651,510180,2,1999-11-11
81893208,14660,510180,2,1999-11-11


In [19]:
# Summary statistics of df (count, mean, std, min, quartiles, max per column).
df.describe()

Unnamed: 0,movie,user,rating
count,100480500.0,100480500.0,100480500.0
mean,9070.915,1322489.0,3.60429
std,5131.891,764536.8,1.085219
min,1.0,6.0,1.0
25%,4677.0,661198.0,3.0
50%,9051.0,1319012.0,4.0
75%,13635.0,1984455.0,4.0
max,17770.0,2649429.0,5.0


### Checking for NaN Values

In [0]:
# df.isnull() marks every missing cell; the first .sum() counts them per column
# and the second adds the columns up, giving the total number of NaN values.
# (sum(df.isnull().any()) would only count how many columns contain a NaN.)
print("No of Nan values : ", int(df.isnull().sum().sum()))

### Checking for Duplicates


In [20]:
# Flag rows whose (movie, user, rating) triple already appeared earlier in df.
dup_bool = df.duplicated(['movie','user','rating'])
# Series.sum() counts the True flags in vectorized code; the built-in sum()
# would iterate over the flags one by one in Python.
dups = int(dup_bool.sum())
print("There are {} duplicate rating entries in the data".format(dups))

There are 0 duplicate rating entries in the data..


In [21]:
# Headline sizes of the dataset: rating rows, distinct users, distinct movies.
n_ratings = df.shape[0]
n_users = np.unique(df.user).size
n_movies = np.unique(df.movie).size

print("\nTotal no of ratings :", n_ratings)
print("Total No of Users   :", n_users)
print("Total No of movies  :", n_movies)


Total no of ratings : 100480507
Total No of Users   : 480189
Total No of movies  : 17770


# Train Test Split

In [0]:
# Positional 80/20 split: the first 80% of df's rows go to train.csv and the
# remainder to test.csv. Assumes df is still in the order it should be split in.
split_at = int(df.shape[0] * 0.80)
df.iloc[:split_at].to_csv("train.csv", index=False)
df.iloc[split_at:].to_csv("test.csv", index=False)

# Creating Sparse Data

In [0]:
# Reload both splits from disk. `date` is parsed to datetime in both frames so
# that train_df and test_df end up with matching column dtypes.
train_df = pd.read_csv("train.csv", parse_dates=['date'])
test_df = pd.read_csv("test.csv", parse_dates=['date'])

## Creating Training Sparse Matrix

In [28]:
# Build the training rating matrix in CSR form (rows indexed by user id,
# columns by movie id, values are ratings) and save it to disk.
# No shape is passed, so scipy infers it from the largest ids present.
start = datetime.now()
train_coords = (train_df.user.values, train_df.movie.values)
train_sparse_matrix = sparse.csr_matrix((train_df.rating.values, train_coords))
sparse.save_npz("train_sparse_matrix.npz", train_sparse_matrix)
print(datetime.now() - start)

# Sparsity: percentage of cells in the matrix that hold no rating.
us, mv = train_sparse_matrix.shape
elem = train_sparse_matrix.count_nonzero()
train_fill_ratio = elem / (us * mv)

print("Sparsity Of Train matrix : {} % ".format((1 - train_fill_ratio) * 100))

0:00:14.442633
Sparsity Of Train matrix : 99.8292709259195 % 


## Test Sparse Matrix

In [29]:
# Build the test rating matrix in CSR form (rows indexed by user id, columns
# by movie id, values are ratings) and save it to disk.
# NOTE(review): no shape is passed, so scipy infers it from the largest ids in
# test_df; the result may not have the same shape as the train matrix.
start = datetime.now()
test_coords = (test_df.user.values, test_df.movie.values)
test_sparse_matrix = sparse.csr_matrix((test_df.rating.values, test_coords))
sparse.save_npz("test_sparse_matrix.npz", test_sparse_matrix)
print(datetime.now() - start)

# Sparsity: percentage of cells in the matrix that hold no rating.
us, mv = test_sparse_matrix.shape
elem = test_sparse_matrix.count_nonzero()
test_fill_ratio = elem / (us * mv)

print("Sparsity Of Test matrix : {} % ".format((1 - test_fill_ratio) * 100))

0:00:03.106193
Sparsity Of Test matrix : 99.95731772988694 % 
