In [23]:
!pip3 install --upgrade google-cloud-firestore > /dev/null 2>/dev/null
!pip3 install firebase-admin google-cloud-firestore

Dropdown(description='Select approach', options=('Item-based (correlation)', 'Item-based (cosine)', 'User-base…

In [1]:
import datetime

from google.cloud import storage
from google.cloud import firestore

from firebase_admin import credentials, firestore
from firebase_admin import db
import firebase_admin

import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.neighbors import NearestNeighbors

import warnings
warnings.filterwarnings('ignore')


cred = credentials.ApplicationDefault()
app = firebase_admin.initialize_app(cred)
store = firestore.client()

def get_collection_asDF(colection_name):
    doc_ref = store.collection(colection_name)
    try:
        docs = doc_ref.get()
        data = [doc.to_dict() for doc in docs]        
        df = pd.DataFrame(data)
        return df
    except google.cloud.exceptions.NotFound:
        print(u'Missing data')
        
books = get_collection_asDF('books')
ratings = get_collection_asDF('ratings')
users = get_collection_asDF('users')        
        
books.drop(['imagine'],axis=1,inplace=True)
books['an'] = books['an'].replace(0,np.nan)
books['an'].fillna(books['an'].mean(), inplace=True)
books['an'] = books['an'].astype(np.int32)    

users.drop(['nume','parola','isAdmin'],axis=1,inplace=True)
users.varsta.loc[(users.varsta >90) | (users.varsta<5)] = np.nan
users.varsta = users.varsta.fillna(users.varsta.mean())
users.varsta = users.varsta.astype(np.int32)
users.id = users.id.astype(np.int32)

ratings_new = ratings[ratings.isbn.isin(books.isbn)]
ratings = ratings[ratings.utilizator.isin(users.id)]
ratings_explicit = ratings_new[ratings_new.scor != 0]
ratings_implicit = ratings_new[ratings_new.scor == 0]
users_exp_ratings = users[users.id.isin(ratings_explicit.utilizator)]
users_imp_ratings = users[users.id.isin(ratings_implicit.utilizator)]

counts1 = ratings_explicit['utilizator'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['utilizator'].isin(counts1[counts1 >= 1].index)]
counts = ratings_explicit['scor'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['scor'].isin(counts[counts >= 1].index)]

ratings_matrix = ratings_explicit.pivot(index='utilizator', columns='isbn', values='scor')
ratings_matrix.fillna(0, inplace = True)
ratings_matrix = ratings_matrix.astype(np.int32)

model_knn_users = NearestNeighbors(metric = 'cosine', algorithm = 'brute') 
model_knn_users.fit(ratings_matrix)

model_knn_items = NearestNeighbors(metric = 'cosine', algorithm = 'brute') 
model_knn_items.fit(ratings_matrix.T)


def save(pipeline,model,BUCKET_NAME):
    
    joblib.dump(pipeline, model)

    # Upload the model to GCS
    bucket = storage.Client().bucket(BUCKET_NAME)
    blob = bucket.blob('{}/{}'.format(
        datetime.datetime.now().strftime('book_%Y%m%d_%H%M'),
        model))
    blob.upload_from_filename(model)
    
BUCKET_NAME =  "cc-homework-3-272910-aiplatform"
save(model_knn_users,'model_users.joblib',BUCKET_NAME)
save(model_knn_items,'model_items.joblib',BUCKET_NAME)
save(ratings_matrix,'ratings_matrix.joblib',BUCKET_NAME)
save(books,'books.joblib',BUCKET_NAME)



In [16]:
import os

import re
import numpy as np
import pandas as pd
import joblib


class MyPredictor(object):

    def __init__(self, model_users, model_items, ratings_matrix, books):

        self._model_users = model_users
        self._model_items = model_items
        self._ratings_matrix = ratings_matrix
        self._books = books

    def find_k_similar_users(self, user_id, k=10):
        loc = self._ratings_matrix.index.get_loc(user_id)
        distances, indices = self._model_users.kneighbors(self._ratings_matrix.iloc[loc, :].values.reshape(1, -1),
                                                          n_neighbors=k + 1)
        similarities = 1 - distances.flatten()

        return similarities, indices

    def predict_user_based(self, user_id, item_id, _ratings_matrix, k=10):
        prediction = 0
        user_loc = _ratings_matrix.index.get_loc(user_id)
        item_loc = _ratings_matrix.columns.get_loc(item_id)
        similarities, indices = self.find_k_similar_users(user_id, k)
        mean_rating = _ratings_matrix.iloc[user_loc, :].mean()
        sum_wt = np.sum(similarities) - 1
        product = 1
        wtd_sum = 0

        for i in range(0, len(indices.flatten())):
            if indices.flatten()[i] == user_loc:
                continue
            else:
                ratings_diff = _ratings_matrix.iloc[indices.flatten()[i], item_loc] - np.mean(
                    _ratings_matrix.iloc[indices.flatten()[i], :])
                product = ratings_diff * (similarities[i])
                wtd_sum = wtd_sum + product

        prediction = int(np.nan_to_num(round(mean_rating + (wtd_sum / sum_wt))))
#         print('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id, item_id, prediction))

        return prediction

    def find_k_similar_items(self, item_id, k=10):
        ratings = self._ratings_matrix.T
        loc = ratings.index.get_loc(item_id)

        distances, indices = self._model_items.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors=k + 1)
        similarities = 1 - distances.flatten()

        return similarities, indices

    def predict_item_based(self, user_id, item_id, _ratings_matrix, k=10):
        prediction = wtd_sum = 0
        item_id = int(item_id)
        user_loc = _ratings_matrix.index.get_loc(user_id)
        item_loc = _ratings_matrix.columns.get_loc(item_id)
        similarities, indices = self.find_k_similar_items(item_id, k)
        sum_wt = np.sum(similarities) - 1
        product = 1
        for i in range(0, len(indices.flatten())):
            if indices.flatten()[i] == item_loc:
                continue
            else:
                product = _ratings_matrix.iloc[user_loc, indices.flatten()[i]] * (similarities[i])
                wtd_sum = wtd_sum + product
        prediction = int(np.nan_to_num(round(wtd_sum / sum_wt)))

        if prediction <= 0:
            prediction = 1
        elif prediction > 10:
            prediction = 10

#         print('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id, item_id, prediction))

        return prediction

    def recommend(self, user_id, k=10, item=True):
        if (user_id not in self._ratings_matrix.index.values) or type(user_id) is not int:
            print("User id should be a valid integer from this list :\n\n {} ".format(
                re.sub('[\[\]]', '', np.array_str(self._ratings_matrix.index.values))))
        else:

            prediction = []

            for i in range(self._ratings_matrix.shape[1]):
                if self._ratings_matrix[self._ratings_matrix.columns[i]][user_id] != 0:  # not rated already
                    if item:
                        prediction.append(
                            self.predict_item_based(user_id, self._ratings_matrix.columns[i], self._ratings_matrix))
                    else:
                        prediction.append(
                            self.predict_user_based(user_id, self._ratings_matrix.columns[i], self._ratings_matrix))
                else:
                    prediction.append(-1)

            prediction = pd.Series(prediction)
            prediction = prediction.sort_values(ascending=False)
            recommended = prediction[:k]
            return [str(self._books.isbn[recommended.index[i]]) for i in range(len(recommended))]


    def predict(self, instances, **kwargs):
        """Performs custom prediction.

        Preprocesses inputs, then performs prediction using the trained
        scikit-learn model.

        Args:
            instances: A list of prediction input instances.
            **kwargs: A dictionary of keyword args provided as additional
                fields on the predict request body.

        Returns:
            A list of outputs containing the prediction results.
        """
        
        if 'k' in kwargs.keys():
            k =int(kwargs['k'])
        else: 
            k= 10
            
        if 'item' in kwargs.keys():
            item = bool(kwargs['item'])
        else:
            item = True
                            
        outputs = [self.recommend(i,k,item) for i in instances ] 
        return outputs

    @classmethod
    def from_path(cls, model_dir):

        model_users_path = os.path.join(model_dir, 'model_users.joblib')
        print(model_users_path)
        model_users = joblib.load(model_users_path)
        print('load users')
        
        
        model_items_path = os.path.join(model_dir, 'model_items.joblib')
        print(model_items_path)
        model_items = joblib.load(model_items_path)
        print('load items')
        
        ratings_matrix_path = os.path.join(model_dir, 'ratings_matrix.joblib')
        print(ratings_matrix_path)
        ratings_matrix = joblib.load(ratings_matrix_path)
        print('load ratings')
        
        books_path = os.path.join(model_dir, 'books.joblib')
        print(books_path)
        books =  joblib.load(books_path)
        print('load books')
        
        return cls(model_users, model_items, ratings_matrix, books)


In [17]:
pred = MyPredictor(model_knn_users,model_knn_items,ratings_matrix,books)

In [13]:
pred.find_k_similar_users(8)

(array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([[ 0, 70, 72, 67, 68, 69, 65, 75, 74, 71, 73]]))

In [18]:
pred.recommend(8,k=4,item=True)

['140003180', '380717018', '38076654', '1567184294']

In [59]:
pred.predict([8,12],k=4,item=False)

[[140003180, 380717018, 38076654, 1567184294],
 [385418493, 440241413, 312970242, 316769487]]