**Image based Score Prediction**

**Installing and importing all the required libraries**

In [0]:
!pip install lshash3 --upgrade
!pip install img2vec_pytorch
!pip install numpy

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from google.colab import drive
from google.colab import files
import json
import ast
import numpy as np
from img2vec_pytorch import Img2Vec
from PIL import Image
import os
from lshash.lshash import LSHash
import operator
import math
import pandas as pd

In [314]:
drive.mount('/content/drive')


def _read_literal(path):
  """Read a text file holding one Python literal and return (raw_text, parsed_object).

  The whole file is read in one call instead of concatenating it line by line,
  which avoids building a new, ever-longer string for every line.
  """
  with open(path) as f:
    text = f.read()
  return text, ast.literal_eval(text)


#This file is a dictionary of dictionaries: {user_id: {business_id: [image_id, ...]}}
s, json_data = _read_literal('/content/drive/My Drive/Columbia Photos/dict1.txt')

#Ratings, indexed later in the notebook as all_ratings[user_id][business_id]
s3, all_ratings = _read_literal('/content/drive/My Drive/mainDataForAP.txt')

#Indexed later in the notebook as last_ratings[user_id] (compared against the predicted rating)
s4, last_ratings = _read_literal('/content/drive/My Drive/businessIdListForAP.txt')

#Getting all image IDs from our subset (flattened in one pass, in file order)
s1 = [
    image_id
    for per_business in json_data.values()
    for image_ids in per_business.values()
    for image_id in image_ids
]

#this dictionary is for the last business the user has rated (key is user id and the value is the business id of the last business he has rated)
s2, json_data1 = _read_literal('/content/drive/My Drive/Columbia Photos/businessIdList-2.txt')

#business2imglist: business ID -> list of images of that business
#img2business:     image ID    -> business ID the image belongs to
#Both are filled in the same traversal; when a key appears under several users the last occurrence wins.
business2imglist={}
img2business={}
for per_business in json_data.values():
  for business_id, image_ids in per_business.items():
    business2imglist[business_id] = image_ids
    for image_id in image_ids:
      img2business[image_id] = business_id

#dictionary of user and all the businesses he rated, leaving out the last business
user2businesslist = {
    user_id: [business_id for business_id in per_business
              if business_id != json_data1[user_id]]
    for user_id, per_business in json_data.items()
}

#user -> images of all previously rated businesses (i.e. excluding the last one)
user2oldimages = {
    user_id: [image_id
              for business_id in rated
              for image_id in business2imglist[business_id]]
    for user_id, rated in user2businesslist.items()
}

#user -> images of the last business the user rated
user2lastimgs = {
    user_id: business2imglist[last_business]
    for user_id, last_business in json_data1.items()
}

users=list(user2businesslist.keys())
businesses=list(business2imglist.keys())

#All of the dictionaries defined above are made for ease of use, so that the dictionary of dictionaries does not have to be traversed each time

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Getting the embeddings of all the images in the subset and saving them**

In [0]:
#The embeddings are stored in a dictionary with key as the image_id (which is also the file name of the image)
img2vec = Img2Vec()

#Folder that holds the <image_id>.jpg files
IMAGE_DIR = '/content/drive/My Drive/Columbia Photos/imgs/'

emb={}
for filename in os.listdir(IMAGE_DIR):
    if filename.endswith(".jpg"):
        #'with' closes the file handle as soon as the vector has been extracted,
        #instead of leaving one open handle per image until garbage collection.
        with Image.open(os.path.join(IMAGE_DIR, filename)) as img:
            #convert('RGB') leaves RGB images unchanged; it only matters for grayscale/CMYK files.
            #NOTE(review): assumes Img2Vec expects 3-channel input - confirm against the img2vec_pytorch README.
            emb[filename[:-4]]=img2vec.get_vec(img.convert('RGB'), tensor=False)
        

In [0]:
#Write the image_id -> embedding dictionary to a .npy file in the working directory
embeddings_filename = 'embeddings.npy'
np.save(embeddings_filename, emb)

#Download that file from the Colab runtime to the local drive
files.download(embeddings_filename)

**Loading the saved embeddings (instead of having to re-run the embedding generator each time)**

In [0]:
#As generating the embeddings will take long we can now load the generated embeddings instead
#loading the embeddings file from drive
#The file stores a dict, so pickling must be allowed; pass a real boolean
#(the string 'TRUE' only worked because any non-empty string is truthy).
#NOTE: unpickling can execute arbitrary code - only load files you created yourself.
emb = np.load('/content/drive/My Drive/embeddings.npy',allow_pickle=True).item()

**The following set of chunks get the predicted rating of the last business of the users based on image similarity**

In [0]:
#Defining both our hyper-parameters here
k1= 6   #neighbours kept per image of the last business
k2= 6   #neighbours kept per user after grouping by business
#User's last business closest neighbours: user_id -> [(business_id, mean distance), ...]
user_lb_cn={}
c=0
#(last-business image, old image) pairs whose distance is exactly 0
matches=[]
lsh = LSHash(6, 8)
for k in json_data1:
  candidates=[]

  #For each image of the last business the user rated, rank the images of the
  #user's previously rated businesses by euclidean distance
  for n in user2lastimgs[k]:
    scored=[]
    for j in user2oldimages[k]:
      dist=lsh.euclidean_dist(emb[n].flatten(),emb[j].flatten())
      if dist==0:
        #The exact same image uploaded for different businesses is noise in the
        #dataset: record the pair and leave it out of the ranking
        matches.append((n,j))
        continue
      scored.append((img2business[j],dist))

    #Keep the k1 (hyper-parameter 1) closest businesses for this image
    scored.sort(key=operator.itemgetter(1))
    candidates.extend(scored[:k1])

  #Several images of the last business can be close to images of the same
  #previously rated business, so group by business and average the distances
  grouped={}
  for business_id,dist in candidates:
    if business_id not in grouped:
      grouped[business_id]=[]
    grouped[business_id].append(dist)
  averaged=[(business_id,sum(dists)/len(dists)) for business_id,dists in grouped.items()]

  #Keep the overall k2 (hyper-parameter 2) closest businesses for this user
  averaged.sort(key=operator.itemgetter(1))
  user_lb_cn[k]=averaged[:k2]


In [0]:
last_busi_ratings_all_embs={}
def predict(user_id):
  """Predict a user's rating for their last business from its nearest neighbours.

  Reads the (business_id, distance) pairs in user_lb_cn[user_id] and returns the
  average of the user's ratings for those businesses, weighted by 1/distance.
  Returns 0 when the user has no neighbours. The result is also stored in
  last_busi_ratings_all_embs[user_id].
  """
  neighbours=user_lb_cn[user_id]

  #Normalising constant: sum of the inverse distances
  total_weight=0
  for _,distance in neighbours:
    total_weight+=1/distance

  #Each neighbour contributes its rating scaled by its share of the total weight
  estimate=0
  for business_id,distance in neighbours:
    estimate+=((1/distance)*all_ratings[user_id][business_id])/total_weight

  last_busi_ratings_all_embs[user_id]=estimate
  return estimate


In [322]:
def rmse():
  """Return the RMSE of predict() over every user in json_data1.

  For each user the prediction is compared with last_ratings[user_id]; the
  squared differences are averaged over len(json_data1) and square-rooted.
  """
  squared_error_total=0
  for user_id in json_data1:
    residual=predict(user_id)-last_ratings[user_id]
    squared_error_total+=math.pow(residual,2)

  return math.sqrt(squared_error_total/len(json_data1))

#Evaluate the multiple-embeddings model on every user's last business.
#rmse() calls predict() for each user, which also fills last_busi_ratings_all_embs.
val=rmse()
print("RMSE on hyper-parameter1 as",k1,"and hyper-parameter2 as",k2,"is:",val)

RMSE on hyper-parameter1 as 6 and hyper-parameter2 as 6 is: 1.4640925642508344


**Modification 1:** <br>
Instead of using multiple embeddings for each business, this modification uses a single average embedding vector per business. Please refer to the accompanying report for more details.

In [0]:
#Average image embeddings for each business

img2vec1 = Img2Vec()

#business_id -> mean of the flattened embeddings of all its images
emb1={}

for k,v in business2imglist.items():
  tmp=[]
  for h in v:
    #'with' closes the file handle as soon as the vector has been extracted,
    #instead of leaving one open handle per image until garbage collection.
    with Image.open('/content/drive/My Drive/Columbia Photos/imgs/'+h+'.jpg') as img:
      #convert('RGB') leaves RGB images unchanged; it only matters for grayscale/CMYK files.
      #NOTE(review): assumes Img2Vec expects 3-channel input - confirm against the img2vec_pytorch README.
      embs=img2vec1.get_vec(img.convert('RGB'), tensor=False)
    tmp.append(embs.flatten())
  #Getting the mean embeddings of each business
  #(a business with an empty image list raises ZeroDivisionError here)
  emb1[k]=sum(tmp)/len(tmp)
 

In [0]:
#Write the business_id -> average embedding dictionary to a .npy file in the working directory
avg_embeddings_filename = 'embeddings1.npy'
np.save(avg_embeddings_filename, emb1)

#Download that file from the Colab runtime to the local drive
files.download(avg_embeddings_filename)

**Loading the saved embedding for the new model**

In [0]:
#loading the embeddings file from drive
#The file stores a dict, so pickling must be allowed; pass a real boolean
#(the string 'TRUE' only worked because any non-empty string is truthy).
#NOTE: unpickling can execute arbitrary code - only load files you created yourself.
emb1 = np.load('/content/drive/My Drive/embeddings1.npy',allow_pickle=True).item()

**The following set of chunks get the predicted ratings of the last business of the users based on our modified model**

In [0]:
#LSHash instance; the functions defined below only call its euclidean_dist() helper
lsh1= LSHash(6,8)

def business_item_item():
  """Measure how far each user's last business is from their earlier ones.

  Returns a dict mapping every user in user2businesslist to a list of
  (business_id, distance) pairs, one per previously rated business, where
  distance is the euclidean distance between that business's average embedding
  and the average embedding of the user's last rated business (json_data1).
  """
  return {
      user_id: [
          (business_id,
           lsh1.euclidean_dist(emb1[business_id], emb1[json_data1[user_id]]))
          for business_id in rated_businesses
      ]
      for user_id, rated_businesses in user2businesslist.items()
  }


In [0]:
last_busi_ratings={}
def predict1(user_id,dictt):
  """Predict a user's rating for their last business from business-level distances.

  Args:
    user_id: key into dictt and all_ratings.
    dictt: dict mapping user_id to a list of (business_id, distance) pairs.

  Returns:
    The average of the user's ratings for the listed businesses, weighted by
    1/distance; 0 when no usable neighbour exists. The value is also stored in
    last_busi_ratings[user_id].

  Neighbours at distance 0 are skipped, as get_score does. Without this guard
  1/distance is a division by zero: a ZeroDivisionError for Python floats, or an
  inf weight and a NaN prediction for NumPy scalars.
  """
  neighbours=[(business_id,dist) for business_id,dist in dictt[user_id] if dist!=0]

  #Normalising constant: sum of the inverse distances
  w=0
  for _,dist in neighbours:
    w=w+(1/dist)

  #Each neighbour contributes its rating scaled by its share of the total weight
  r=0
  for business_id,dist in neighbours:
    r=r+((1/dist)*all_ratings[user_id][business_id])/w

  last_busi_ratings[user_id]=r
  return r

In [0]:
def rmse1(dictt):
  """Return the RMSE of predict1(user_id, dictt) over every user in json_data1.

  For each user the prediction is compared with last_ratings[user_id]; the
  squared differences are averaged over len(json_data1) and square-rooted.
  """
  squared_error_total=0
  for user_id in json_data1:
    residual=predict1(user_id,dictt)-last_ratings[user_id]
    squared_error_total+=math.pow(residual,2)

  return math.sqrt(squared_error_total/len(json_data1))

In [328]:
#Distances between each user's last business and their previously rated businesses
dictionary=business_item_item()
#rmse1() calls predict1() for each user, which also fills last_busi_ratings
#(that dictionary is read again when the ensemble dataframe is built)
vb=rmse1(dictionary)
print("RMSE after the modification is:",vb)

RMSE after the modification is: 1.4355458015229903


We can see that after the modification, the model performed better than all the variants of the previous models. So now we will use this model to generate predictions for all the businesses for each user and fill the entire user business matrix.

The next cells predict scores for all user-business pairs (not only for the last business each user rated) based on image similarity. Predicting ratings for every user and every business is outside our objective statement, so we will not be using this dictionary here; however, it can be used in future work should there be a need to predict ratings other than the one for the last business.

In [0]:
def get_score(uid,bid):
  """Predict the rating user `uid` would give business `bid`.

  Every business in user2businesslist[uid] is compared with `bid` through the
  euclidean distance of their average embeddings (emb1). Businesses at distance
  0 are ignored. The result is the average of the user's ratings for the
  remaining businesses, weighted by 1/distance, or 0 if none remain.
  """
  neighbours=[]
  for rated_id in user2businesslist[uid]:
    gap=lsh1.euclidean_dist(emb1[rated_id],emb1[bid])
    if gap!=0:
      neighbours.append((rated_id,gap))

  #Normalising constant: sum of the inverse distances
  weight_total=0
  for _,gap in neighbours:
    weight_total+=1/gap

  #Each neighbour contributes its rating scaled by its share of the total weight
  score=0
  for rated_id,gap in neighbours:
    score+=((1/gap)*all_ratings[uid][rated_id])/weight_total

  return score

In [0]:
#This will predict the ratings for all the businesses not only the last business
def business_item_item_forall():
  lsh1= LSHash(6,8)
  new_dict={}

  for k2,v2 in user2businesslist.items():
    new_dict[k2]={}

  for u1 in users:
    for b1 in businesses:
      if b1 in user2businesslist[u1]:
        new_dict[u1][b1]= all_ratings[u1][b1]
      else:
        new_dict[u1][b1]=-1

  for u in users:
    for b in businesses:
      if new_dict[u][b]==-1: #Predicting scores for all the businesses the user has not rated
        new_dict[u][b]= get_score(u,b)

  return new_dict


In [0]:
#Nested dict {user_id: {business_id: score}} covering every user and every business
full_matrix=business_item_item_forall()

Convert all the scores into a DataFrame. The DataFrame has businesses (business IDs) as rows and users (user IDs) as columns; each cell holds the score our model predicted for that user and business (or the user's actual rating where one exists). The DataFrame is then saved as a pickle.

In [0]:
#Outer keys of full_matrix (users) become columns, inner keys (businesses) become the index
full_matrix_pickle_path = "/content/drive/My Drive/full_matrix.pkl"
dff = pd.DataFrame(data=full_matrix)
#Saving Dataframe as pickle for any possible future use
dff.to_pickle(full_matrix_pickle_path)

**Ensemble model: (ALS + Image based prediction (Using average embeddings))**

Here we use the predictions ALS gave for the last business each user rated in conjunction with the predictions our model gave. We do not, however, give the two models' predictions equal weight. Instead, we take the actual ratings the users gave to their last business and fit a regression model with one variable for the rating our model predicted and another variable for the rating ALS predicted. The coefficients found by the regression are then used as the weights given to the predicted scores.

In [0]:
#One row per user: (user, last business, rating predicted by the average-embedding model, actual rating)
#last_busi_ratings is filled by predict1(), so the evaluation cell above must have run first

lio=[
    (user_id,business_id,last_busi_ratings[user_id],last_ratings[user_id])
    for user_id,business_id in json_data1.items()
]

prediction_columns=['user_id', 'business_id','predicted_rating','actual_rating']
df1=pd.DataFrame(lio, columns=prediction_columns)

#Dataframe having the predictions from ALS for the test-set
dfg=pd.read_csv("/content/drive/My Drive/ALSPredictions3.csv")

#Keep only the (user, business) pairs present in both frames
merge_keys=['user_id', 'business_id']
merged=pd.merge(df1, dfg, on=merge_keys, how='inner')

In [335]:
#Fitting Regression to get the weights to give to the ensemble model

#Features: "predicted_rating" is the image-model prediction built in df1.
#"prediction" and the target "rating" are not df1 columns, so they come from the ALS CSV.
X = merged[["predicted_rating","prediction"]]

y = merged[["rating"]]

#Fixed random_state keeps the 80/20 split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor = LinearRegression()  
regressor.fit(X_train, y_train)

print("Learned intercept parameter")
#To retrieve the intercept:
print(regressor.intercept_)

print("\nLearned regression paramters 1st one for image model and 2nd one for ALS")
#For retrieving the slope:
print(regressor.coef_)

#This is printing the RMSE for the predictions on the regression test_set
#mean_squared_error is documented as (y_true, y_pred); the value is the same either way
predictions_ensemble = regressor.predict(X_test)
print ('testing rmse:', math.sqrt(mean_squared_error(y_test, predictions_ensemble)))

#This is printing the RMSE we have achieved after using the ensemble with the predicted parameters on the last business
#NOTE(review): X includes the rows the regressor was fitted on, so this figure is partly in-sample;
#the 'testing rmse' above is the held-out estimate.
print("\nRMSE After Using ensmble to predict the ratings of the last business")
predictions_last_ratings = regressor.predict(X)
print ('Ensemble rmse:', math.sqrt(mean_squared_error(y, predictions_last_ratings)))

Learned intercept parameter
[2.43098674]

Learned regression paramters 1st one for image model and 2nd one for ALS
[[0.32550042 0.02885347]]
testing rmse: 1.3031321716065227

RMSE After Using ensmble to predict the ratings of the last business
Ensemble rmse: 1.3308631140825242


We can see that this ensemble model performed better than both the ALS baseline and the bias baseline. (The code for all the baselines is in a different notebook.)

The following code generates dataframes (with columns actual rating, predicted rating) for each model we explored using images.

In [0]:
#Generating dataframe for ensemble

#regressor.predict returned one row per sample; keep the first (only used) entry of each row
lis=[row[0] for row in predictions_last_ratings]

#Getting original ratings as a list
nlis=merged['rating'].values.tolist()

actual_rating = np.array(y)
ensemble_columns=['actual rating', 'predicted rating']
ensemble_data={'actual rating': nlis, 'predicted rating': lis}
df_ensemble = pd.DataFrame(ensemble_data, columns=ensemble_columns)

#Generating dataframe for multiple embeddings based method
#Pairs of (true rating, rating predicted by predict()) for every user predict() was called on
tmp_tup=[
    (last_ratings[user_id],predicted)
    for user_id,predicted in last_busi_ratings_all_embs.items()
]

df_all_emb=pd.DataFrame(tmp_tup, columns=['actual_rating', 'predicted_rating'])

#Generating dataframe for average embeddings based method
avg_emb_columns=['actual_rating','predicted_rating']
df_avg_emb=df1[avg_emb_columns]

In [0]:
#Write each model's (actual, predicted) table to Drive as CSV, without the index column
for result_frame, csv_path in (
    (df_ensemble, "/content/drive/My Drive/Columbia Photos/ensemble_values.csv"),
    (df_all_emb, "/content/drive/My Drive/Columbia Photos/all_embs_values.csv"),
    (df_avg_emb, "/content/drive/My Drive/Columbia Photos/avg_embs_values.csv"),
):
  result_frame.to_csv(csv_path,index=False)

All the files used in this code can be viewed by clicking on the following link. https://drive.google.com/open?id=1Rc841hGS9v2ZMsM1BQSGAcoYLJqy5BuE