In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import csv
from sklearn import linear_model
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy

In [2]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
import implicit 

from sklearn.preprocessing import MinMaxScaler

## Task 1: Cook prediction

Step 1 - Data Import & Preparation

Step 2 - Cross-Validation

Step 3 - Model Fitting

Step 4 - Prediction

In [3]:
#-------------------------
# 1. LOAD AND PREP DATA
#-------------------------
def readData(path):
    """Read a gzip-compressed CSV file into a header row and its data rows.

    Args:
        path: Location of the gzip-compressed CSV file.

    Returns:
        A ``(head, rows)`` tuple. ``head`` is the first CSV record as a list of
        strings; ``rows`` is a list holding every remaining record, each also a
        list of strings (no type conversion is performed).

    Raises:
        StopIteration: If the file contains no records at all (no header row).
    """
    # The context manager guarantees the gzip handle is closed even if parsing
    # fails part-way; newline='' lets the csv module do its own newline
    # handling, as the csv documentation recommends for file objects.
    with gzip.open(path, 'rt', newline='') as f:
        reader = csv.reader(f)
        head = next(reader)
        rows = list(reader)
    return head, rows
        
# Path to the gzip-compressed training interactions; reused below by pd.read_csv.
p = "assignment1/trainInteractions.csv.gz"
# header: list of column names; data1: list of rows, every field still a string.
header, data1 = readData(p)
# Display the column names as the cell output.
header

['user_id', 'recipe_id', 'date', 'rating']

In [4]:
# Keep only columns 0, 1 and 3 of every raw row (user_id, recipe_id, rating
# per the header shown above); the date column is dropped. Values stay strings.
train = [
    [row[0], row[1], row[3]]
    for row in data1
]

In [5]:
# Load the same interactions file as a DataFrame (pandas parses the numeric
# columns, unlike the string rows held in data1).
data2 = pd.read_csv(p)
# First 250,000 rows by position.
knn_train = data2.iloc[:250000]
# Show every interaction in that subset for one example user.
knn_train[knn_train['user_id'] == 88348277]

Unnamed: 0,user_id,recipe_id,date,rating
0,88348277,3969194,2004-12-23,5
1642,88348277,90253693,2004-09-03,5
2440,88348277,54352178,2004-10-09,5
6953,88348277,821715,2004-11-15,0
7890,88348277,68579114,2005-04-06,5
...,...,...,...,...
244437,88348277,67101061,2008-03-30,3
244801,88348277,79579247,2004-12-30,0
247493,88348277,16101522,2004-08-29,5
247660,88348277,14764305,2004-11-11,5


In [6]:
# Lookup structures built from the raw string rows in data1. All keys are the
# string ids exactly as they appear in the CSV (no int conversion).
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
ratingDict = {} # To retrieve a rating for a specific user/item pair

for d in data1:
    # Column 0 is the user id, column 1 the recipe id, column 3 the rating.
    user,item = d[0], d[1]
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    # Ratings are stored as ints; a repeated (user, item) pair keeps the last one seen.
    ratingDict[(user,item)] = int(d[3])

In [7]:
# Create a numeric user_id and recipe_id column
# 'user'/'recipe' are categorical views of the raw ids; 'userID'/'recipeID'
# hold the integer category codes, later used as sparse-matrix indices.
data2['user'] = data2['user_id'].astype("category")
data2['recipe'] = data2['recipe_id'].astype("category")
data2['userID'] = data2['user'].cat.codes
data2['recipeID'] = data2['recipe'].cat.codes

# userCode / recipeCode: category code -> raw id.
# uc / rc: the inverse mappings, raw id -> category code.
# They are rebuilt from the same columns as above, so the codes are expected
# to agree with data2['userID'] / data2['recipeID'].
userCode = dict(enumerate(data2.user_id.astype("category").cat.categories))
uc = {v: k for k, v in userCode.items()}
recipeCode = dict(enumerate(data2.recipe_id.astype("category").cat.categories))
rc = {v: k for k, v in recipeCode.items()}

In [8]:
# Build two sparse rating matrices from the encoded ids: one with recipes as
# rows and users as columns, and its user-by-recipe counterpart. Cell values
# are the ratings cast to float.
sparse_item_user = sparse.csr_matrix((data2['rating'].astype(float), (data2['recipeID'], data2['userID'])))
sparse_user_item = sparse.csr_matrix((data2['rating'].astype(float), (data2['userID'], data2['recipeID'])))

# Initialize the als model and fit it using the sparse recipe-user matrix
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

# Calculate the confidence by multiplying it by alpha value.
alpha_val = 40
data_conf = (sparse_item_user * alpha_val).astype('double')

# Fit the model
# NOTE(review): the matrix passed here is item x user. Which orientation
# `fit` expects depends on the installed `implicit` release - confirm, since
# the wrong one would swap the meaning of user_factors and item_factors.
model.fit(data_conf)



  0%|          | 0/20 [00:00<?, ?it/s]

In [9]:
def recommendPop(user_id, sparse_user_item, user_vecs, item_vecs, query_item, pop_list,
                 score_threshold=0.7, heavy_user_min_items=3500):
    """Predict whether a known user would cook a given recipe.

    Relies on the module-level lookups ``uc`` (raw user id -> matrix row),
    ``rc`` (raw recipe id -> matrix column) and ``itemsPerUser`` (string user
    id -> set of string recipe ids).

    Args:
        user_id (str | int): Raw user id; it must be present in ``uc`` after
            ``int()`` conversion. When passed as a string it is also used
            as-is to look up ``itemsPerUser``.
        sparse_user_item (csr_matrix): User x recipe rating matrix.
        user_vecs (csr_matrix): Trained user x factors matrix.
        item_vecs (csr_matrix): Trained recipe x factors matrix.
        query_item (str): Raw recipe id to score. It is compared against
            ``pop_list`` unchanged and against ``rc`` after ``int()``.
        pop_list (collection): Recipe ids considered popular.
        score_threshold (float): Minimum weighted, min-max scaled score for a
            positive prediction. Defaults to 0.7.
        heavy_user_min_items (int): For recipes absent from ``rc``, predict 1
            when the user has at least this many recipes in ``itemsPerUser``.
            Defaults to 3500.

    Returns:
        int: 1 if the user is predicted to cook the recipe, otherwise 0.

    Raises:
        KeyError: If ``user_id`` is not a key of ``uc``.
        ValueError: If ``user_id`` or ``query_item`` cannot be converted to int.
    """
    # Resolve the user first so an unknown user fails fast with KeyError.
    u = uc[int(user_id)]

    query_code = int(query_item)
    if query_code not in rc:
        # Recipe never seen in training: fall back to a user-activity rule.
        # .get() avoids inserting an empty set into the defaultdict per query.
        if len(itemsPerUser.get(user_id, ())) >= heavy_user_min_items:
            return 1
        return 0

    # Popular recipes are always predicted positive, so the (expensive)
    # scoring below can be skipped for them without changing the result.
    if query_item in pop_list:
        return 1

    i = rc[query_code]

    # Per-recipe weight for this user: stored rating + 1, so recipes with no
    # stored interaction get weight 1.
    # NOTE(review): previously-rated recipes are therefore boosted rather
    # than masked out - confirm this is the intended behaviour.
    user_interactions = sparse_user_item[u, :].toarray().reshape(-1) + 1

    # Raw affinity of this user for every recipe: dot product of the user's
    # factor vector with all recipe factor vectors.
    rec_vector = user_vecs[u, :].dot(item_vecs.T).toarray()

    # Scale the scores into [0, 1] so the threshold is comparable across users.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    recommend_vector = user_interactions * rec_vector_scaled

    return 1 if recommend_vector[i] >= score_threshold else 0

In [10]:
# cross_validate(algo, knn_train, measures=['RMSE'], cv=5, verbose=True)

In [11]:
# Wrap the fitted model's user and item factor arrays as CSR matrices so that
# recommendPop can slice a row and call .dot(...).toarray() on them.
user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

def would_cook1(stub_path="assignment1/stub_Made.txt",
                predictions_path="assignment1/predictions_Made.txt",
                popularity_divisor=1.5):
    """Write a 0/1 "would cook" prediction for every user-recipe pair in the stub file.

    Uses the module-level ``train`` rows, the ``uc`` user lookup, the fitted
    ``sparse_user_item`` / ``user_vecs`` / ``item_vecs`` matrices and
    ``recommendPop``.

    Users absent from ``uc`` are predicted from recipe popularity alone; all
    other users are delegated to ``recommendPop``.

    Args:
        stub_path (str): File of ``user-recipe`` pairs, one per line, with a
            header line starting with ``user_id``.
        predictions_path (str): Output file; receives the header line followed
            by one ``user-recipe,label`` line per pair.
        popularity_divisor (float): The popular set is the most-cooked recipes
            that together account for just over ``len(train) / popularity_divisor``
            interactions. Defaults to 1.5.

    Returns:
        None. The predictions are written to ``predictions_path``.
    """
    # Count how often each recipe appears in the training interactions.
    recipeCount = defaultdict(int)
    for _user, recipe, _rating in train:
        recipeCount[recipe] += 1
    totalCooked = len(train)

    # Most-cooked first; ties are ordered by recipe id, descending.
    mostPopular = sorted(
        ((count, recipe) for recipe, count in recipeCount.items()),
        reverse=True,
    )

    # Collect top recipes until they cover more than the chosen share of all
    # interactions.
    limit = totalCooked / popularity_divisor
    return1 = set()
    covered = 0
    for count, recipe in mostPopular:
        covered += count
        return1.add(recipe)
        if covered > limit:
            break

    # Open the input first so a missing stub does not truncate an existing
    # predictions file; both handles are closed even if a line fails to parse.
    with open(stub_path) as stub, open(predictions_path, 'w') as predictions:
        for l in stub:
            if l.startswith("user_id"):
                # Header line is copied through unchanged.
                predictions.write(l)
                continue

            pair = l.strip()
            if not pair:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            u, i = pair.split('-')

            if int(u) not in uc:
                # New user: no learned factors, fall back to popularity.
                label = 1 if i in return1 else 0
            else:
                # Existing user: combine the learned score with popularity.
                result = recommendPop(u, sparse_user_item, user_vecs, item_vecs, i, return1)
                label = 0 if result == 0 else 1

            predictions.write(u + '-' + i + "," + str(label) + "\n")

In [None]:
# Generate the predictions file for the stub pairs (writes to disk; returns None).
would_cook1()