In [1]:
import os
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.utils import shuffle
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
class Recommend:
    def __init__(self, 
                 transaction_df,
                 features_df):
        self.transaction_df = transaction_df
        self.features_df = features_df
        self.model = NearestNeighbors(n_neighbors=12)
        self.model.fit(self.features_df.iloc[:, 1:-1].to_numpy())
        
    
    def predict(self, centroid):
        _, predictions = self.model.kneighbors(centroid.reshape(1, -1))
        return predictions
    
    def find_centroid(self, article_list):
        all_features = []
        for article in article_list:
            if article in self.features_df["article_id"].tolist():
                block = self.features_df[self.features_df["article_id"] == article].iloc[:, 1:-1]
#                 print(block)
                if block.shape[0] > 0:
                    all_features.append(block.to_numpy())

        all_features = np.concatenate(np.array(all_features))
        return np.sum(all_features, axis=0) / all_features.shape[0]
            
            
            
    def get_items_from_index(self, index_list):
        
        predictions_string = ""
        for idx in index_list:
            try:
                predictions_string += str(int(feature_df.iloc[idx]["article_id"])) + " "
            except:
                pass
        return predictions_string.strip()

    def generate_recommendations(self, n_customers=5, feature_type=""):
        # make predictions for customers in sample_sub_df

        transaction_df = self.transaction_df.drop(['t_dat', 'price', 'sales_channel_id'], axis=1)
        
        customer_group = transaction_df.groupby("customer_id")
        
        customer_count = 0
        results = []
        
        for name, group in tqdm(customer_group):
            if customer_count < n_customers:
                if len(group) > 1:
                    customer_count += 1
                    try:
                        group_len = len(group)
                        if group_len == 2:
                            test_sample = group["article_id"].sample(n=1)
#                             print(test_sample)
                            group.drop(test_sample.index, inplace=True)
                            centroid = self.find_centroid(test_sample.tolist())
                            recommendations = self.predict(centroid)
                            recommendations = recommendations.tolist()[0]
                            recommendations_string = self.get_items_from_index(recommendations[1:2])

                            ground_truth_string = " ".join(group["article_id"].astype(str).tolist())

                            results.append([name, recommendations_string, ground_truth_string])

                        else:
                            if group_len > 12:
                                group_len = 12
                            n_samples = int(0.4*group_len)
                            n_recomm = group_len - n_samples
                            test_sample = group["article_id"].sample(n=n_samples)
#                             print(test_sample)
                            group.drop(test_sample.index, inplace=True)
                            centroid = self.find_centroid(test_sample.tolist())
                            recommendations = self.predict(centroid)
                            recommendations = recommendations.tolist()[0]
                            recommendations_string = self.get_items_from_index(recommendations[1:n_recomm+1])

                            ground_truth = group.sample(n=n_recomm)
                            ground_truth_string = " ".join(ground_truth["article_id"].astype(str).tolist())
                            results.append([name, recommendations_string, ground_truth_string])
                    except:
                        pass
        
        results_df = pd.DataFrame(results, columns=['customer_id', 'recommendations', 'ground_truth'])
        results_df.to_csv("recommendations_"+ feature_type +".csv")
        return results_df
        
        
            
            
class evaluate:
    def __init__(self, results_df, articles_df, feature_df):
        self.results_df = results_df
        self.articles_df = articles_df
        self.feature_df = feature_df
        self.articles_df = self.articles_df[["article_id", "index_group_name"]]
        
    def calc_metrics(self, predictions, recommendations):    
        per_customer_index_name_count = 0
        per_customer_cosine_similarity_count = 0
        
        for i in range(len(predictions)):
            try:
                check_index_group_name_pred = self.articles_df[articles_df["article_id"] == int(predictions[i])]["index_group_name"].tolist()
                check_index_group_name_recom = self.articles_df[articles_df["article_id"] == int(recommendations[i])]["index_group_name"].tolist()
                if check_index_group_name_pred[0] == check_index_group_name_recom[0]:
                    per_customer_index_name_count += 1

                img_pred = self.feature_df[self.feature_df["article_id"] == int(predictions[i])].iloc[:, 1:-1].to_numpy()
                img_recom = self.feature_df[self.feature_df["article_id"] == int(recommendations[i])].iloc[:, 1:-1].to_numpy()
                if cosine_similarity(img_pred, img_recom) > 0.9:
                    per_customer_cosine_similarity_count += 1
            except:
                pass

        return per_customer_index_name_count/len(predictions), per_customer_cosine_similarity_count/len(predictions)
    
    def get_metrics(self):
        valid_count = 0
        all_customers_index_metric = 0
        all_customers_cosine_simialrity = 0
        
        for (idx, row) in self.results_df.iterrows():
            preds = row["ground_truth"].split(" ")
            recoms = row["recommendations"].split(" ")

            if len(preds) == len(recoms):
                valid_count += 1
                index_metric, similarity = self.calc_metrics(preds, recoms)
                all_customers_index_metric += index_metric
                all_customers_cosine_simialrity += similarity
                
                
        return all_customers_cosine_simialrity/valid_count, all_customers_index_metric/valid_count            

In [3]:
## bottleneck, run this cell only once before calling the object. Don't run unless runtime fails
articles_path = "C:/Users/ojubh\/Desktop/SEMESTER 2/Deep Learning/Final_Project/data/articles.csv"
transaction_path = "C:/Users/ojubh\/Desktop/SEMESTER 2/Deep Learning/Final_Project/data/transactions_train.csv"
features_path = "C:/Users/ojubh\/Desktop/SEMESTER 2/Deep Learning/Final_Project/data/text_embeddings_final.csv"

transaction_df = pd.read_csv(transaction_path)
feature_df = pd.read_csv(features_path)
articles_df = pd.read_csv(articles_path)




In [23]:
recommend = Recommend(transaction_df, feature_df)
recommendations = recommend.generate_recommendations(n_customers=2000, feature_type="text")

100%|██████████| 1362281/1362281 [02:54<00:00, 7796.85it/s] 


In [20]:
cosine_metrics, category_metric = evaluate(recommendations, articles_df, feature_df).get_metrics()

In [5]:
image_embds = pd.read_csv("C:/Users/ojubh\/Desktop/SEMESTER 2/Deep Learning/Final_Project/data/visual_embeddings_final.csv")
text_embds = pd.read_csv("C:/Users/ojubh\/Desktop/SEMESTER 2/Deep Learning/Final_Project/data/text_embeddings_final.csv")
merged = image_embds.merge(text_embds, on="article_id", how="inner")
merged.drop(['Unnamed: 0_y'], axis=1, inplace=True)
article_id = merged.pop("article_id")
merged.insert(513, "article_id", article_id)