In [1]:
import numpy as np
import pandas as pd
import math
from scipy.sparse import csr_matrix, csc_matrix, lil_matrix
from sklearn.decomposition import TruncatedSVD

In [14]:
# Run parameters come from the first row of config.csv:
#   id -> the user to recommend for, k -> how many nearest users to report.
config = pd.read_csv('config.csv').iloc[0]
user, k_value = config['id'], config['k']

In [3]:
# Load the whole dataset into memory, one list entry per line of the file.
with open(config['dataset_file']) as inFile:
    appreciate_data = list(inFile)

In [4]:
# Note: user ids are ints and item ids are strings in the dataset, and the same
# user may have liked the same image several times at different timestamps.
# Storing the likes in sets collapses those repeats.
temp_all_items = set()
temp_user_likes = dict()
for line in appreciate_data:
    # split() with no arguments already ignores leading/trailing whitespace.
    fields = line.split()
    if not fields:
        # Blank line (e.g. a trailing newline at the end of the file):
        # nothing to parse, and indexing it would raise IndexError.
        continue
    user_id = int(fields[0])
    item_id = fields[1]

    temp_all_items.add(item_id)
    temp_user_likes.setdefault(user_id, set()).add(item_id)

In [5]:
# Every distinct item id, in sorted order.
all_items = sorted(temp_all_items)

# Each user's liked items as a sorted list (the temporary structure holds sets).
user_likes = {
    uid: sorted(liked)
    for uid, liked in temp_user_likes.items()
}

In [6]:
# Map each item id to its position in the sorted item list; that position is
# used as the item's column index.
item_map = {item: position for position, item in enumerate(all_items)}

In [7]:
# Boolean user x item matrix: one row per user (in user_likes iteration order),
# one column per item (column index taken from item_map).
m, n = len(user_likes), len(all_items)
lil = lil_matrix((m, n), dtype=bool)
for row, liked_items in enumerate(user_likes.values()):
    for item in liked_items:
        lil[row, item_map[item]] = True

In [8]:
# SVD: reduce every user row to 2000 components. The random state is fixed so
# repeated runs use the same seed.
svd = TruncatedSVD(
    n_components=2000,
    n_iter=5,
    random_state=42,
)
mat = svd.fit_transform(lil)

In [9]:
def cosine_similarity(vec1, vec2):
    """Return the cosine *distance* (1 - cosine similarity) between two vectors.

    The value is 0.0 for vectors pointing the same way, 1.0 for orthogonal
    vectors and 2.0 for opposite vectors, so sorting in ascending order puts
    the most similar vectors first.

    Parameters:
        vec1, vec2: indexable sequences of numbers; vec2 must have at least
            len(vec1) elements (only the first len(vec1) are used).

    Returns:
        1.0 - cos(vec1, vec2). If either vector has zero length the cosine is
        undefined; 1.0 is returned (treated as "no similarity") instead of
        dividing by zero.
    """
    dot = 0.0
    sq1 = 0.0
    sq2 = 0.0
    for i in range(len(vec1)):
        dot += vec1[i] * vec2[i]
        sq1 += vec1[i] ** 2
        sq2 += vec2[i] ** 2

    # The denominator is |vec1| * |vec2|; it must involve BOTH squared norms.
    norm = math.sqrt(sq1) * math.sqrt(sq2)
    if norm == 0.0:
        return 1.0

    cos_sim = dot / norm
    # Return 1 - cos_sim because callers sort in ascending order
    # (smaller distance means higher similarity).
    return 1.0 - cos_sim


In [10]:
# Author's note: because the dimension is reduced in this problem, euclidean
# distance should work better than cosine; this was also verified against the
# items matched using the jaccard index.
def euclidean_distance(vec1, vec2):
    """Return the straight-line (L2) distance between vec1 and vec2.

    Only the first len(vec1) components of vec2 are read.
    """
    squared_sum = 0.0
    for idx, left in enumerate(vec1):
        squared_sum += (left - vec2[idx]) ** 2

    return math.sqrt(squared_sum)


In [11]:
def manhattan_distance(vec1, vec2):
    """Return the L1 distance: the sum of absolute component differences.

    Only the first len(vec1) components of vec2 are read.
    """
    total = 0.0
    for idx, left in enumerate(vec1):
        total += abs(left - vec2[idx])

    return total

In [12]:
# First find the reduced vector of the user for whom we need to recommend
# images. Row i of `mat` belongs to the i-th key of `user_likes`, because the
# matrix rows were filled while iterating that same dict.
for r, candidate_id in enumerate(user_likes):
    if candidate_id == user:
        user_vector = mat[r]
        break
else:
    # Fail loudly here: otherwise the next loop hits a NameError on a fresh
    # kernel, or silently reuses a stale `user_vector` from an earlier run.
    raise ValueError(
        'user id {!r} from the config does not appear in the dataset'.format(user)
    )

# Distance from the target user to every other user, as (distance, user id).
lst = []
for r, candidate_id in enumerate(user_likes):
    # Rows whose vector is identical to the target's are skipped; this drops
    # the target user itself (and any user with exactly the same vector).
    if (mat[r] == user_vector).all():
        continue
    dist = euclidean_distance(user_vector, mat[r])
    # dist = cosine_similarity(user_vector, mat[r])
    # dist = manhattan_distance(user_vector, mat[r])
    lst.append((dist, candidate_id))

# Ascending sort: nearest users first (ties are ordered by user id).
lst.sort()

In [15]:
# Choose the top k nearest users and report, for each of them, the items they
# liked that the target user has not liked yet. Each output line is
# "<item id> <user id>"; the chosen user ids are also printed.

# Build the lookup set once instead of scanning the list for every item.
already_liked = set(user_likes[user])

# Slicing (rather than indexing range(k)) copes with k larger than the number
# of candidates; a negative k selects nothing, as range() would.
nearest = lst[:max(int(k_value), 0)]

# `with` guarantees the file is closed even if a write fails part-way.
with open(config['output_file'], 'w') as outFile:
    for _dist, neighbour_id in nearest:
        print(neighbour_id)
        for item in user_likes[neighbour_id]:
            if item not in already_liked:
                outFile.write(item + ' ' + str(neighbour_id) + '\n')

2452817
1480429
1525134
148706
1459742
