# Preprocessing And Compressed Bit Vectors Evaluation

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import *
from scipy import *
from sklearn.preprocessing import LabelEncoder

In [2]:
## Load dataset Movielens 100K or 1 million rating or bookrating(1million)

# Dataset source:Movielens 100k: https://grouplens.org/datasets/movielens/100k/
# Movielens 1 million: https://grouplens.org/datasets/movielens/1m/
# Book crossing: https://grouplens.org/datasets/book-crossing/

header_list = ["userid", "movieid", "movieRating", "timestamp"]
data = pd.read_csv('/content/drive/MyDrive/SJSU/classes/FALL_2021/CMPE_295B/295B_Project_Team_material/Dataset/movie1mratings.dat', sep='::', names=header_list)
# data = pd.read_csv('/content/drive/MyDrive/SJSU/classes/FALL_2021/CMPE_295B/295B_Project_Team_material/Dataset/movie100kratings.data', sep='\t')
nRow, nCol = data.shape
print(f'There are {nRow} rows and {nCol} columns')
data.head()

  


There are 1000209 rows and 4 columns


Unnamed: 0,userid,movieid,movieRating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
data.dtypes

userid         int64
movieid        int64
movieRating    int64
timestamp      int64
dtype: object

In [4]:
## final dataset sample
sample_data = data.drop(['timestamp'],axis=1)
sample_data.head(10)
#contain 1,000,209 anonymous ratings of approximately 3,900 movies made by 6,040 MovieLens

Unnamed: 0,userid,movieid,movieRating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4


In [5]:
frequency = sample_data.groupby('userid').count()
print(len(frequency))

test_users = []
for i in range(len(frequency)):
    fre = frequency.iloc[i]
    break
    if (fre[0] > 100):
        test_users.append(i+1)
# print(len(test_users))

6040


In [6]:
test_userid = []
test_movieid = []
test_movieRating = []

In [7]:
print(sample_data.userid.max())


6040


create testing set

In [8]:
# if the user has rated more than "50" movies then add him to test dataset
for user in range(1,6040): #max userid value from prev step
    filter_df = sample_data.query("userid == " + str(user))
    if (len(filter_df) < 50):
        continue
    # add 20% of that particular user data to the testdata set 
    portion = int(len(filter_df)*(0.2))
    random_indices = set(np.random.choice(len(filter_df), portion))
    for i in random_indices:
        row = filter_df.iloc[i]
        test_userid.append(row.userid)
        test_movieid.append(row.movieid)
        test_movieRating.append(row.movieRating)
 

In [9]:
print(len(test_userid))
print(len(test_movieid))
print(len(test_movieRating))

169962
169962
169962


In [10]:
test_df_dict = {"userid": test_userid, "movieid":test_movieid, "movieRating":test_movieRating}
convert_dict = {
    "userid" :      int,
    "movieid" :     int,
    "movieRating"  :   int
    }
test_df = pd.DataFrame(test_df_dict)
test_df = test_df.astype(convert_dict)
print(test_df.head())
test_df.shape

   userid  movieid  movieRating
0       1     1961            5
1       1      919            4
2       1     1029            5
3       1     2687            3
4       1     2018            4


(169962, 3)

In [11]:
test_df.to_csv("test_ratings.csv",index=False)
# test dataset csv

In [12]:
# data redundancy
# adding unique userid,movieid tuple to the test_tuple_set
#  trying to avoid users who have rated same movie twice
test_tuple_set = set()
print(len(test_userid))
for i in range(len(test_userid)):
    userid_movieid_tuple = (test_userid[i], test_movieid[i])
    if userid_movieid_tuple in test_tuple_set:
        print(userid_movieid_tuple)
        print(":)")
    test_tuple_set.add(userid_movieid_tuple)
print(len(test_tuple_set))
print(len(test_df))

169962
169962
169962


create Training set

In [13]:
def drop_rating(row):
    print(len(row))
    t = (row.userid, row.movieid)
    if t in test_tuple_set:
        row.movieRating = 0.0
    return row

In [14]:
# removing test data values/ratings from the training dataset and making them zero not deleting 
for i in range(len(sample_data)):
    row = sample_data.iloc[i]
    t = (row.userid, row.movieid)
    if t in test_tuple_set:
        sample_data.at[i,'movieRating'] = 0.0


In [15]:
sample_data.head(10)

Unnamed: 0,userid,movieid,movieRating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,0


In [16]:
sample_data.to_csv("train_ratings.csv", index=False)

In [17]:
len(sample_data.movieid.unique())
# total unique movies/items
# sample_data = whole data

3706

In [18]:
# converting unique movies numpy ndarray to list
unique_movies = sample_data.movieid.unique()
type (list(unique_movies))
print(unique_movies)

[1193  661  914 ... 2845 3607 2909]


 ***ASSIGN* CONTINUOUS INDEX TO movieid**

In [19]:
# adding index to movie id thru a dict
original_movie_ids = []
mapped_movie_ids = []
movie_dict = {}
count = 1
for movie_id in unique_movies:
    original_movie_ids.append(movie_id)
    mapped_movie_ids.append(count)
    movie_dict[movie_id] = count
    count += 1

In [20]:
print(len(original_movie_ids))
print(len(mapped_movie_ids))

3706
3706


In [21]:
# creating a dataframe with colomn1 as uniquemovieids and column2 as mappedmovieids(aka index) , dataframe index vs movie index
movies_map_df_dict = {"original_movie_ids": original_movie_ids,
                      "mapped_movie_ids":mapped_movie_ids}
movies_map_df = pd.DataFrame(movies_map_df_dict)
print(movies_map_df.head())
movies_map_df.to_csv("mapped_movie_ids.csv",index=False)

   original_movie_ids  mapped_movie_ids
0                1193                 1
1                 661                 2
2                 914                 3
3                3408                 4
4                2355                 5


In [22]:
# adding a column(mapped_movie_id) to orginal wholedataset dataframe sample_data
# len(sample_data)
for i in range(len(sample_data)):
  mapped_movie_id = movie_dict[int(sample_data.iloc[i][1])]
#     print(sample_data.iloc[i][1] ,mapped_movie_id )
  sample_data.loc[sample_data.index[i], 'mapped_movie_id'] = int(mapped_movie_id)
    

KeyboardInterrupt: ignored

In [None]:
sample_data.head()

In [None]:
users = np.array(sample_data['userid'])
items = np.array(sample_data['movieid'])
ratings = np.array(sample_data['movieRating'])
mapped_movie_ids = np.array(sample_data['mapped_movie_id'])

In [None]:
print(users[3],items[3], ratings[3])

244 51 0


In [None]:
sample_data.movieid.max()

1682

In [None]:
len(users), len(items), len(ratings), len(mapped_movie_ids)

(100000, 100000, 100000, 100000)

In [None]:
len(sample_data.userid.unique())
# unique users in wholedataset

943

In [None]:
len(sample_data.movieid.unique())
# unique movies in wholedataset

1682

In [None]:
# sample_data.to_sparse(fill_value=0)

Create Utility Matrix

In [None]:
# converting original dataframe sample_data to a matrix representation/utility matrix with the help of csr_matrix func of scipy
utility_csr = csr_matrix((ratings, (users , mapped_movie_ids.astype(int))))

In [None]:
utility_matrix = csr_matrix((ratings, (users, mapped_movie_ids.astype(int)))).toarray() # Users x Items 
utility_matrix_t = utility_matrix.T
utility_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0],
       [0, 5, 5, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
print(utility_matrix[users[4]][items[4]])

0


In [None]:
# we have an extra expendable row(likely first row), column in both user_utility_matrix and item_utility_matrix
# shud be 943x1682 / 1682x943 , somewhere csr_matrix func is adding an extra row
print(utility_matrix.shape)
print(utility_matrix_t.shape)

(944, 1683)
(1683, 944)


In [None]:
np.savetxt('users_m.txt', utility_matrix, fmt="%d") # All rows as a USERS
np.savetxt('items_m.txt', utility_matrix_t, fmt="%d") # All rows as a ITEMS

In [None]:
test_data = pd.read_csv('test_ratings.csv')
test_users = test_data.userid.unique()
print(len(test_users))
test_users_list = test_users.tolist()
with open ('test_users.txt', 'w') as fo:
     fo.write(','.join(str(i) for i in test_users_list))
# unique test users in testdataset(test_ratings.csv) which is used in javacode for generating recommendations of nearestKneighbors efficiently with compressed bit vectors

568


In [None]:
## passing(copying) the above 3 txt files
# user_m txt file matrix and 
# item_m file matrix, and 
# test_users.txt (unique test users in testdataset)
## into java code(intelliJ) for generating recommendation through compressed bit vectors

In [None]:
# below four cells revist

In [None]:
mapped_movie_ids_dict = {}
for key in movie_dict:
    value = movie_dict[key]
    mapped_movie_ids_dict[value] = key

In [None]:
def get_original_movie_ids(mapped_movie_ids):
    original_ids = []
    for movie_id in mapped_movie_ids:
        if int(movie_id) in mapped_movie_ids_dict:
            original_ids.append(mapped_movie_ids_dict[int(movie_id)])
        else:
            original_ids.append(-1)
    return original_ids

In [None]:
temp_str = "6,15,18,19,24,25,26,27,31,32,33,231,291,321,462,470,471,473,477,620"

In [None]:
bsi_mapped_movie_ids = temp_str.split(",")
bsi_original_movie_ids = get_original_movie_ids(bsi_mapped_movie_ids)
print(bsi_original_movie_ids)

[474, 29, 274, 1042, 118, 1, 546, 95, 246, 98, 193, 520, 558, 97, 870, 44, 686, 729, 566, 372]


In [None]:
# now we evaluate compressed bit vector recommendations ( java code txt files)

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import *
from scipy import *
from sklearn.preprocessing import LabelEncoder

In [None]:
test_data = pd.read_csv('test_ratings.csv')
nRow, nCol = test_data.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 15944 rows and 3 columns


In [None]:
mapped_movieID_df = pd.read_csv('mapped_movie_ids.csv')

In [None]:
test_data

Unnamed: 0,userid,movieid,movieRating
0,1,214,4
1,1,102,2
2,1,69,3
3,1,52,4
4,1,38,3
...,...,...,...
15939,943,393,2
15940,943,72,2
15941,943,24,4
15942,943,28,4


In [None]:
mapped_movieID_df.head()

Unnamed: 0,original_movie_ids,mapped_movie_ids
0,242,1
1,302,2
2,377,3
3,51,4
4,346,5


In [None]:
# create dictionaries for movieID mapping
mapped_movieID_dict = {}
# mapped_movieID_df.size
for i in range(len(mapped_movieID_df)):
    row = mapped_movieID_df.loc[i]
#     print(row)
    mapped_movieID_dict[row.original_movie_ids] = row.mapped_movie_ids

In [None]:
original_movieID_dict = {}
for i in range(len(mapped_movieID_df)):
    row = mapped_movieID_df.loc[i]
#     print(row)
    original_movieID_dict[row.mapped_movie_ids] = row.original_movie_ids

In [None]:
# unique users in test data for evaluation
test_users = test_data.userid.unique()
test_users_list = test_users.tolist()
print(test_users_list)
print(len(test_users_list))
# used in the java file to generate test_recommendation_bsi.txt and bsi_recommendations_for_all_user.txt

[1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 21, 22, 23, 24, 25, 26, 28, 37, 38, 41, 42, 43, 44, 48, 49, 52, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 69, 70, 72, 73, 75, 76, 77, 79, 81, 82, 83, 84, 85, 87, 89, 90, 91, 92, 94, 95, 96, 97, 99, 100, 101, 102, 104, 106, 109, 110, 113, 115, 116, 117, 118, 119, 121, 122, 123, 125, 128, 130, 135, 138, 141, 144, 145, 148, 151, 152, 154, 157, 158, 159, 160, 161, 164, 167, 168, 174, 176, 177, 178, 180, 181, 183, 184, 186, 187, 188, 189, 190, 193, 194, 195, 197, 198, 200, 201, 206, 207, 210, 213, 214, 215, 216, 217, 218, 221, 222, 223, 224, 226, 227, 230, 232, 233, 234, 235, 236, 239, 243, 244, 246, 248, 249, 250, 251, 253, 254, 255, 256, 257, 262, 263, 264, 267, 268, 269, 270, 271, 272, 274, 275, 276, 277, 279, 280, 283, 286, 287, 288, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 301, 303, 305, 307, 308, 311, 312, 313, 314, 315, 316, 318, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 334, 336, 338, 339, 342, 343, 

In [None]:
def get_mapped_movieids(original_movieIds):
    mapped_movieIds= []
    for _id in original_movieIds:
        mapped_movieIds.append(mapped_movieID_dict[_id])
    return mapped_movieIds

In [None]:
# get Get movies which is rated (>3.0)/(change depending on dataset) by user in test data for BSI
def get_test_rec_movieIds(userId):
    user_df = test_data[test_data.userid == userId]
    user_movie_df = user_df[user_df.movieRating > 3.0]
    original_movieIds = user_movie_df.movieid.unique()
    mapped_movieIds = get_mapped_movieids(original_movieIds)
    return mapped_movieIds


In [None]:
def find_common(original_ids, rec_ids):
    original_ids_set = set(original_ids)
    common_count = 0
    if rec_ids[-1] == '':
        rec_ids.pop()
    for _id in rec_ids:
        if int(_id) in original_ids_set:
            common_count += 1
    return common_count

In [None]:
mapped_movieID_df.head()

Unnamed: 0,original_movie_ids,mapped_movie_ids
0,242,1
1,302,2
2,377,3
3,51,4
4,346,5


In [None]:
def getOrginalMoviesIds(mapped_movie_ids):
    original_movieIds= []
    for _id in mapped_movie_ids:
        original_movieIds.append(original_movieID_dict[int(_id)])
    return original_movieIds


In [None]:
print(getOrginalMoviesIds(['3','4']))

[377, 51]


In [None]:
### Calculate Precision and Recall

def getPrecision(total_count, common_count):
    if(total_count ==0):
        return 0
    return (common_count/total_count)

def getRecall(total_count, common_count):
    if(total_count ==0):
        return 1
    return (common_count/total_count)

# def hitratio(total_count, hits):
#     if(hits ==0):
#         return 1
#     return (hits/total_count)

def getf1score(precision, recall):
    if(precision == 0 and recall == 0 ):
        return 0
    return ((2 * precision * recall )/ ( precision + recall ))

In [None]:
### Evalute Bsi algorithm for all test users
## After executing the java code, copy the output
# copy test_recommendation_bsi.txt and bsi_recommendations_for_all_users.txt from the java IntelliJ env into colab jupyterNotebook env

In [None]:
# get_test_rec_movieIds
count = 0
input_file = open('test_recommendation_bsi.txt','r') # file pulled from java code. CBV

userIds = []
actual_movies_watched = []
recomendations = []
common_movies = []
similar_usres_k = []
precision = []
recall = []
f1score = []

while (True):
    line = input_file.readline()
    if not line:
        break
    count +=1
    
    values = line.split(", ")
    userid = values[0]
    userK = values[1] # neighborhood size
    no_recomendations = values[2] # no of recommendations per user
    recomendations_for_user = values[3:] # actual recommendations
    
    recomendations_for_user.pop() ## removing last null(\n) value
    
    actual_movies_wated_by_users = get_test_rec_movieIds(int(userid)) # movies in test data  # actaul movies watched by user which are greater > 3
    common_count = find_common(actual_movies_wated_by_users, recomendations_for_user) # movies in algo's predictions
    
#     print(userId,userK,no_recomendations)
#     print(common_count)

    userIds.append(userid)
    similar_usres_k.append(int(userK))
    actual_movies_watched.append(len(actual_movies_wated_by_users)) 
    recomendations.append(int(no_recomendations))
    common_movies.append(common_count)
    
    total_count_for_precision = int(no_recomendations) # actual results
    total_count_for_recall = len(actual_movies_wated_by_users)
    
    temp_Precision = getPrecision(total_count_for_precision, common_count)
    precision.append(temp_Precision)
    temp_recall = getRecall(total_count_for_recall, common_count)
    recall.append(temp_recall)
    f1score.append(getf1score(temp_Precision, temp_recall))
    
print(count)
# 568(test_users) * userK(4 len of list) * itemK(5 len of list) = 11360 test_recommendation_bsi.txt rows count

2840


In [None]:
# storing precision and recall in new dict and dataframe
result_df_dict = {"userId":userIds,"actual_movies_watched":actual_movies_watched,
                  "recomendations":recomendations,"common_movies":common_movies,
                  "similar_usres_k":similar_usres_k, "precision": precision, "recall": recall, "f1_score": f1score}

In [None]:
result_bsi_df = pd.DataFrame(result_df_dict)
result_bsi_df.head(25)
result_bsi_df = result_bsi_df.astype({"userId": int})
result_bsi_df

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,similar_usres_k,precision,recall,f1_score
0,1,28,5,1,5,0.20,0.035714,0.060606
1,1,28,10,1,5,0.10,0.035714,0.052632
2,1,28,25,3,5,0.12,0.107143,0.113208
3,1,28,50,3,5,0.06,0.107143,0.076923
4,1,28,100,7,5,0.07,0.250000,0.109375
...,...,...,...,...,...,...,...,...
2835,943,22,5,1,5,0.20,0.045455,0.074074
2836,943,22,10,1,5,0.10,0.045455,0.062500
2837,943,22,25,1,5,0.04,0.045455,0.042553
2838,943,22,50,2,5,0.04,0.090909,0.055556


In [None]:
result_bsi_df.sort_values('precision', ascending=False)

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,similar_usres_k,precision,recall,f1_score
1400,450,71,5,5,5,1.0,0.070423,0.131579
1435,457,45,5,4,5,0.8,0.088889,0.160000
160,56,21,5,4,5,0.8,0.190476,0.307692
940,311,34,5,4,5,0.8,0.117647,0.205128
2205,716,30,5,4,5,0.8,0.133333,0.228571
...,...,...,...,...,...,...,...,...
755,255,6,5,0,5,0.0,0.000000,0.000000
751,254,6,10,0,5,0.0,0.000000,0.000000
750,254,6,5,0,5,0.0,0.000000,0.000000
2080,665,14,5,0,5,0.0,0.000000,0.000000


In [None]:
result_bsi_df.sort_values('recall', ascending=False)

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,similar_usres_k,precision,recall,f1_score
329,101,0,100,0,5,0.00,1.0,0.000000
534,183,0,100,0,5,0.00,1.0,0.000000
533,183,0,50,0,5,0.00,1.0,0.000000
1814,568,4,100,4,5,0.04,1.0,0.076923
532,183,0,25,0,5,0.00,1.0,0.000000
...,...,...,...,...,...,...,...,...
1377,440,7,25,0,5,0.00,0.0,0.000000
2360,774,5,5,0,5,0.00,0.0,0.000000
2361,774,5,10,0,5,0.00,0.0,0.000000
1376,440,7,10,0,5,0.00,0.0,0.000000


In [None]:
result_bsi_df.sort_values('f1_score', ascending=False)

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,similar_usres_k,precision,recall,f1_score
241,206,5,10,4,5,0.4,0.800000,0.533333
1008,839,4,5,2,5,0.4,0.500000,0.444444
240,206,5,5,2,5,0.4,0.400000,0.400000
506,403,6,5,2,5,0.4,0.333333,0.363636
408,329,6,5,2,5,0.4,0.333333,0.363636
...,...,...,...,...,...,...,...,...
761,608,25,10,0,5,0.0,0.000000,0.000000
760,608,25,5,0,5,0.0,0.000000,0.000000
382,314,24,5,0,5,0.0,0.000000,0.000000
383,314,24,10,0,5,0.0,0.000000,0.000000


In [None]:
result_bsi_df_5 = result_bsi_df.loc[result_bsi_df['recomendations']==5]
result_bsi_df_10 = result_bsi_df.loc[result_bsi_df['recomendations']==10]
result_bsi_df_25 = result_bsi_df.loc[result_bsi_df['recomendations']==25] # final run
result_bsi_df_50 = result_bsi_df.loc[result_bsi_df['recomendations']==50]
result_bsi_df_100 = result_bsi_df.loc[result_bsi_df['recomendations']==100]

In [None]:
result_bsi_df_5.mean()

userId                   461.948944
actual_movies_watched     15.485915
recomendations             5.000000
common_movies              0.677817
similar_usres_k            5.000000
precision                  0.135563
recall                     0.049196
f1_score                   0.062842
dtype: float64

In [None]:
result_bsi_df_5.max()

userId                   943.000000
actual_movies_watched     71.000000
recomendations             5.000000
common_movies              5.000000
similar_usres_k            5.000000
precision                  1.000000
recall                     1.000000
f1_score                   0.444444
dtype: float64

In [None]:
result_bsi_df_10.mean()

userId                   461.948944
actual_movies_watched     15.485915
recomendations            10.000000
common_movies              1.193662
similar_usres_k            5.000000
precision                  0.119366
recall                     0.082985
f1_score                   0.086344
dtype: float64

In [None]:
result_bsi_df_10.max()

userId                   943.000000
actual_movies_watched     71.000000
recomendations            10.000000
common_movies              6.000000
similar_usres_k            5.000000
precision                  0.600000
recall                     1.000000
f1_score                   0.533333
dtype: float64

In [None]:
result_bsi_df_25.mean()

userId                   461.948944
actual_movies_watched     15.485915
recomendations            25.000000
common_movies              2.466549
similar_usres_k            5.000000
precision                  0.098662
recall                     0.163445
f1_score                   0.111076
dtype: float64

In [None]:
result_bsi_df_25.max()

userId                   943.000000
actual_movies_watched     71.000000
recomendations            25.000000
common_movies             12.000000
similar_usres_k            5.000000
precision                  0.480000
recall                     1.000000
f1_score                   0.421053
dtype: float64

In [None]:
result_bsi_df_50.mean()

userId                   461.948944
actual_movies_watched     15.485915
recomendations            50.000000
common_movies              3.996479
similar_usres_k            5.000000
precision                  0.079930
recall                     0.265683
f1_score                   0.113770
dtype: float64

In [None]:
result_bsi_df_100.mean()

userId                   461.948944
actual_movies_watched     15.485915
recomendations           100.000000
common_movies              6.186620
similar_usres_k            5.000000
precision                  0.061866
recall                     0.401279
f1_score                   0.102104
dtype: float64

In [None]:
###

In [None]:
userId = []
recommendations = []
input_file_1 = open('bsi_recommendations_for_all_users.txt','r')
while (True):
    line = input_file_1.readline()
    if not line:
        break
    count +=1
    values = line.split(", ")
    user_id = values[0]
    recomendations_for_user = values[1:]
    recomendations_for_user.pop()
    actual_movie_ids = getOrginalMoviesIds(recomendations_for_user)
    userId.append(user_id)
    recommendations.append(actual_movie_ids)
    
print("Finished")

Finished


In [None]:
result_all_user_df_dict = {"userID": userId, "recommendations": recommendations}

In [None]:
result_all_user_df = pd.DataFrame(result_all_user_df_dict)

In [None]:
result_all_user_df # one user is missing 685/686 rows are 942 # 3 user missing for 1 mil dataset , no rec for missing user

Unnamed: 0,userID,recommendations
0,1,"[1197, 2797, 2762, 2028, 2916, 1210, 1213, 161..."
1,2,"[1287, 2797, 2321, 1270, 527, 1022, 150, 1, 26..."
2,3,"[1193, 2804, 919, 2918, 2797, 1270, 527, 1097,..."
3,4,"[3408, 2355, 1197, 2791, 2797, 1270, 527, 2762..."
4,5,"[1193, 2804, 2797, 2321, 1270, 527, 2762, 260,..."
...,...,...
6032,6036,"[1197, 919, 2918, 2791, 2797, 1270, 527, 1097,..."
6033,6037,"[1197, 2918, 2791, 1270, 2762, 1, 2916, 1213, ..."
6034,6038,"[1197, 2804, 2918, 2791, 1270, 1097, 1, 260, 2..."
6035,6039,"[2355, 2918, 2762, 608, 1213, 368, 110, 589, 3..."


# Collaborative Filtering pyspark ( ALS Baseline)

In [23]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 31 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 56.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=c9ba558d7fd9c1a294064826ffb9d0a8ec7be28bd40ffb856657031ad9105e00
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [24]:
import os
import numpy as np
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pandas as pd
from pyspark.mllib.recommendation import ALS
import math
import pyspark.sql
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.evaluation import RegressionEvaluator
import time


In [25]:
# Calling spark session to register application
spark = SparkSession \
    .builder \
    .appName("Recom") \
    .config("spark.recom.demo", "1") \
    .getOrCreate()
# lambda word: (word, 1)

In [26]:
ratings_df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("train_ratings.csv")

In [27]:
ratings_df.show()
ratings_df.count()

+------+-------+-----------+
|userid|movieid|movieRating|
+------+-------+-----------+
|     1|   1193|          5|
|     1|    661|          3|
|     1|    914|          3|
|     1|   3408|          4|
|     1|   2355|          5|
|     1|   1197|          3|
|     1|   1287|          5|
|     1|   2804|          5|
|     1|    594|          4|
|     1|    919|          0|
|     1|    595|          5|
|     1|    938|          4|
|     1|   2398|          4|
|     1|   2918|          4|
|     1|   1035|          5|
|     1|   2791|          4|
|     1|   2687|          0|
|     1|   2018|          0|
|     1|   3105|          0|
|     1|   2797|          4|
+------+-------+-----------+
only showing top 20 rows



1000209

In [28]:
ratings_df = ratings_df.drop('') # reduntant> no values are dropped
ratings_df.count()

1000209

### Drop test data from training set

In [29]:
ratings_df = ratings_df[ratings_df.movieRating != 0.0]
ratings_df.summary
ratings_df.count()

830247

In [30]:
(trainingData,validationData,testData) = ratings_df.randomSplit([0.6,0.2,0.2],5) # randomSplit(weights, seed)

In [31]:
trainingData.show()
print(trainingData.count())
print(validationData.count())
print(testData.count())

+------+-------+-----------+
|userid|movieid|movieRating|
+------+-------+-----------+
|     1|      1|          5|
|     1|     48|          5|
|     1|    260|          4|
|     1|    527|          5|
|     1|    531|          4|
|     1|    588|          4|
|     1|    595|          5|
|     1|    608|          4|
|     1|    661|          3|
|     1|    745|          3|
|     1|    783|          4|
|     1|   1022|          5|
|     1|   1028|          5|
|     1|   1097|          4|
|     1|   1193|          5|
|     1|   1197|          3|
|     1|   1207|          4|
|     1|   1270|          5|
|     1|   1287|          5|
|     1|   1566|          4|
+------+-------+-----------+
only showing top 20 rows

497472
166700
166075


In [32]:
validation_for_predict = validationData.select('userid','movieid')
test_for_predict = testData.select('userid','movieid')

In [33]:
seed = 5 
iterations = 10
regularization_parameter = 0.1 
ranks = [4, 8, 12] 

In [34]:
min_error = 1000
for rank in ranks:
    model = ALS.train(ratings_df, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    
    #converting prediction into key value pair like key=(userId,movieId) and value = rating
    predictions = model.predictAll(validation_for_predict.rdd).map(lambda r: ((r[0], r[1]), r[2]))
    
    #joining predicted rating and original ratings to calculate error
    rates_and_preds = validationData.rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()) 

    print ('For rank',rank, "the RMSE is ", error)
    if error < min_error:
        min_error = error
        best_rank = rank

print ("The best model was trained with rank", best_rank)

For rank 4 the RMSE is  0.850316954280477
For rank 8 the RMSE is  0.8302567395217925
For rank 12 the RMSE is  0.8179931389419712
The best model was trained with rank 12


In [35]:
predictions_test = model.predictAll(test_for_predict.rdd).map(lambda r: ((r[0], r[1]), r[2]))

In [36]:
predictions_test.take(5)

[((4904, 1304), 4.795474940347013),
 ((4904, 2116), 4.032788194947812),
 ((4904, 1320), 3.9355773460942776),
 ((4904, 588), 4.29802423718202),
 ((4904, 2068), 4.13739412892656)]

### Get Recomendations For User

This method returns original movieId where CBV algorithm returns mapped movieId

In [37]:
def getRecommendations(user,ratings_df,trainDf,model, k): # 
    
    userDf = ratings_df.filter(ratings_df.userid == user) # for eg user = 1 ...getting all movies(rows) user1 has rated 

    mov = ratings_df.select('movieid').subtract(userDf.select('movieid')) # mov dataframe wont have whatever user1 has rated
    
    # measure below two only

    pred_rat = model.predictAll(mov.rdd.map(lambda x: (user, x[0]))).collect()
    
    recommendations = sorted(pred_rat, key=lambda x: x[2], reverse=True)[:k]
    
    return recommendations

In [38]:
model.predict(1,1084) # make sure product id is in the model

4.1485223959910416

In [39]:
user = 1
# how many recommendations you want
k= 10

# Call getRecommendations method
derived_rec = getRecommendations(user, ratings_df, trainingData, model, k)

print ("Movies recommended for:",user)
movie_ids = []
for i in range(len(derived_rec)):
    movie_ids.append(derived_rec[i][1])
#     print (i+1,derived_rec[i][1])

print(movie_ids)
#     movies_df.filter(movies_df.movieId==derived_rec[i][1]).select('title').show()

Movies recommended for: 1
[572, 3233, 989, 3172, 318, 919, 953, 858, 787, 3469]


In [None]:
temp_str = "70, 235, 316, 333, 423, 441, 457, 480, 552, 553, 590, 4006, 72378, 538, 36, 300, 344, 380, 531, 371"

In [None]:
bsi_movie_ids = temp_str.split(", ") # not using bsi_movie_ids
print(bsi_movie_ids)
print(movie_ids)

['70', '235', '316', '333', '423', '441', '457', '480', '552', '553', '590', '4006', '72378', '538', '36', '300', '344', '380', '531', '371']
[1512, 1449, 408, 1495, 390, 611, 1467, 1463, 511, 657]


In [40]:
test_data = pd.read_csv('test_ratings.csv')
test_users = test_data.userid.unique()
print(len(test_users))
test_users_list = test_users.tolist()
userKs = [5] # 10,20,25 final run
itemKs = [5,10,25,50,100] # 5, 10, 25, 50 ,100
len(test_users_list)

# 568(test_users) * userK(4 len of list) * itemK(5 len of list) = 11360 test_recommendation_bsi.txt rows 

4296


4296

In [41]:
# def recomendMoviesForAllUsers(test_users, itemKs,testData,trainingData,model):
#     output_file = open("test_recommendations_als_5_10.txt", "a")
#     for u in range(len(test_users)):
#         user = test_users[u]
#         for itemk in itemKs:
#             derived_rec = getRecommendations(user,testData,trainingData,model,itemk)
            
#             output_file.write(str(user)+", ")
#             output_file.write(str(5)+", ")
#             for i in range(5):
#                 output_file.write(str(derived_rec[i][1])+", ")
#             output_file.write("\n")
                
#             output_file.write(str(user)+", ")
#             output_file.write(str(itemk)+", ")
#             for i in range(len(derived_rec)):
#                 output_file.write(str(derived_rec[i][1])+", ")
#             output_file.write("\n")
## below snippet final run many itemKs 
def recomendMoviesForAllUsers(test_users, itemKs,testData,trainingData,model):
    output_file = open("test_recommendations_als.txt", "a")
    for u in range(len(test_users)):
      user = test_users[u]
      for userk in userKs:
        for itemk in itemKs:
            derived_rec = getRecommendations(user,testData,trainingData,model,itemk)    
            output_file.write(str(user)+", ")
            # output_file.write(str(userk)+", ")
            output_file.write(str(itemk)+", ")
            for i in range(len(derived_rec)):
                output_file.write(str(derived_rec[i][1])+", ")
            output_file.write("\n")

In [None]:
start = time.time()
derived_rec = recomendMoviesForAllUsers(test_users_list,itemKs,testData,trainingData,model)
end = time.time()
print(end - start)


In [None]:
###  Get movies which is rated (>3.0) by user in test data for ALS

def get_test_rec_movieIds_als(userId):
    user_df = test_data[test_data.userid == userId]
    user_movie_df = user_df[user_df.movieRating > 3.0]
    original_movieIds = user_movie_df.movieid.unique()
    return list(original_movieIds)

In [None]:
# get_test_rec_movieIds
count = 0
input_file = open('test_recommendations_als.txt','r')

userIds = []
actual_movies_watched = []
recomendations = []
common_movies = []
precision = []
recall = []
f1score = []

while (True):
    line = input_file.readline()
    if not line:
        break
    count +=1
    values = line.split(", ")
    userId = values[0]
    no_recomendations = values[1]
    recomendations_for_user = values[2:]
    
    recomendations_for_user.pop() ## removing last null(\n) value
    
    actual_movies_watched_by_users = get_test_rec_movieIds_als(int(userId)) # movies in test data
#     print(count)
    common_count = find_common(actual_movies_watched_by_users, recomendations_for_user) # movies in algo's predictions
    
#     print(userId,userK,no_recomendations)
#     print(common_count)

    userIds.append(userId)
    actual_movies_watched.append(len(actual_movies_watched_by_users)) 
    recomendations.append(int(no_recomendations))
    common_movies.append(common_count)
    
    total_count_for_precision = int(no_recomendations)
    total_count_for_recall = len(actual_movies_watched_by_users)
    
    temp_Precision = getPrecision(total_count_for_precision, common_count)
    precision.append(temp_Precision)
    temp_recall = getRecall(total_count_for_recall, common_count)
    recall.append(temp_recall)
    f1score.append(getf1score(temp_Precision, temp_recall))
    
print(count)
# test_recommendations_als_5_10.txt row length

2800


In [None]:
result_df_dict = {"userId":userIds,"actual_movies_watched":actual_movies_watched,
                  "recomendations":recomendations,"common_movies":common_movies,
                   "precision": precision, "recall": recall, "f1_score": f1score}

In [None]:
result_als_df = pd.DataFrame(result_df_dict)

In [None]:
result_als_df

Unnamed: 0,userId,actual_movies_watched,recomendations,common_movies,precision,recall,f1_score
0,1,28,5,0,0.00,0.000000,0.000000
1,1,28,10,0,0.00,0.000000,0.000000
2,1,28,25,1,0.04,0.035714,0.037736
3,1,28,50,3,0.06,0.107143,0.076923
4,1,28,100,4,0.04,0.142857,0.062500
...,...,...,...,...,...,...,...
2795,943,22,5,1,0.20,0.045455,0.074074
2796,943,22,10,1,0.10,0.045455,0.062500
2797,943,22,25,3,0.12,0.136364,0.127660
2798,943,22,50,4,0.08,0.181818,0.111111


In [None]:
result_als_df = result_als_df.astype({"userId": int})


In [None]:
result_als_df_5 = result_als_df.loc[result_als_df['recomendations']==5]
result_als_df_10 = result_als_df.loc[result_als_df['recomendations']==10]
result_als_df_25 = result_als_df.loc[result_als_df['recomendations']==25]
result_als_df_50 = result_als_df.loc[result_als_df['recomendations']==50]
result_als_df_100 = result_als_df.loc[result_als_df['recomendations']==100]

In [None]:
result_als_df_5.mean()

userId                   467.183929
actual_movies_watched     15.428571
recomendations             5.000000
common_movies              0.298214
precision                  0.059643
recall                     0.025373
f1_score                   0.029301
dtype: float64

In [None]:
result_als_df_10.mean()

userId                   467.183929
actual_movies_watched     15.428571
recomendations            10.000000
common_movies              0.576786
precision                  0.057679
recall                     0.047135
f1_score                   0.045170
dtype: float64

In [None]:
result_als_df_25.mean()

userId                   467.183929
actual_movies_watched     15.428571
recomendations            25.000000
common_movies              1.425000
precision                  0.057000
recall                     0.114265
f1_score                   0.068479
dtype: float64

In [None]:
result_als_df_50.mean()

userId                   467.183929
actual_movies_watched     15.428571
recomendations            50.000000
common_movies              2.683929
precision                  0.053679
recall                     0.206189
f1_score                   0.079115
dtype: float64

In [None]:
result_als_df_100.mean()

userId                   467.183929
actual_movies_watched     15.428571
recomendations           100.000000
common_movies              4.783929
precision                  0.047839
recall                     0.344057
f1_score                   0.080057
dtype: float64