In [None]:
import itertools as it

import numpy as np
import torch
import torch.nn as nn
import itertools as it
import matplotlib.pyplot as plt
import os
import operator
import pandas as pd
import random

from tqdm.auto import trange
from cornac.models.recommender import Recommender
from cornac.utils.common import scale
from cornac.exception import ScoreException
from cornac.data import Dataset
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import ndcg_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

from sklearn.preprocessing import MinMaxScaler

# Collaborative filtering Approach

In [None]:
#@title Default title text

# Small constant added inside logarithms so that log(0) is never evaluated.
EPS = 1e-10

# Activation modules selectable by name through the `act_fn` argument of BiVAE.
ACT = {
    "sigmoid": nn.Sigmoid(),
    "tanh": nn.Tanh(),
    "elu": nn.ELU(),
    "relu": nn.ReLU(),
    "relu6": nn.ReLU6(),
}


class BiVAE(nn.Module):
    """Bilateral variational auto-encoder with one encoder per side.

    The user encoder maps a row of the user-item matrix to the mean and
    standard deviation of the user factors (``theta``); the item encoder does
    the same for a row of the transposed matrix and the item factors
    (``beta``). Reconstruction is the sigmoid of the inner product of the two
    factor matrices.

    Parameters
    ----------
    k: int
        Dimension of the latent factors.
    user_encoder_structure: list of int
        Layer sizes of the user encoder; the first entry is its input size
        (the number of items).
    item_encoder_structure: list of int
        Layer sizes of the item encoder; the first entry is its input size
        (the number of users).
    act_fn: str
        Key into ``ACT`` naming the activation placed after each hidden layer.
    likelihood: str
        One of ``"bern"``, ``"gaus"``, ``"pois"``.
    cap_priors: dict
        ``{"user": bool, "item": bool}``; a true entry adds a linear prior
        encoder for that side.
    feature_dim: dict
        Input sizes of the prior encoders, read only for sides enabled in
        ``cap_priors``.
    user_batch_size, item_batch_size: int
        Accepted for interface compatibility; not used by this class.
    """

    def __init__(
        self,
        k,
        user_encoder_structure,
        item_encoder_structure,
        act_fn,
        likelihood,
        cap_priors,
        feature_dim,
        user_batch_size,
        item_batch_size,
    ):
        super(BiVAE, self).__init__()

        # The user encoder consumes one value per item and the item encoder
        # one value per user, so the first layer sizes give the matrix shape.
        n_items = user_encoder_structure[0]
        n_users = item_encoder_structure[0]

        # Posterior means, filled in by the training loop.
        self.mu_theta = torch.zeros((n_users, k))  # n_users*k
        self.mu_beta = torch.zeros((n_items, k))  # n_items*k

        # Sampled factors. theta is re-initialised in place right after being
        # drawn, so only beta keeps the small-normal initialisation.
        self.theta = torch.randn(n_users, k) * 0.01
        self.beta = torch.randn(n_items, k) * 0.01
        torch.nn.init.kaiming_uniform_(self.theta, a=np.sqrt(5))

        self.likelihood = likelihood
        self.act_fn = ACT.get(act_fn, None)
        if self.act_fn is None:
            raise ValueError("Supported act_fn: {}".format(ACT.keys()))

        self.cap_priors = cap_priors
        if self.cap_priors.get("user", False):
            self.user_prior_encoder = nn.Linear(feature_dim.get("user"), k)
        if self.cap_priors.get("item", False):
            self.item_prior_encoder = nn.Linear(feature_dim.get("item"), k)

        # User Encoder (Xavier-uniform weights, zero biases)
        self.user_encoder = self._build_encoder(
            user_encoder_structure, self.act_fn, nn.init.xavier_uniform_
        )
        self.user_mu = nn.Linear(user_encoder_structure[-1], k)  # mu
        self.user_std = nn.Linear(user_encoder_structure[-1], k)

        # Item Encoder (Kaiming-uniform weights, zero biases)
        self.item_encoder = self._build_encoder(
            item_encoder_structure,
            self.act_fn,
            lambda weight: nn.init.kaiming_uniform_(
                weight, mode='fan_in', nonlinearity='relu'
            ),
        )
        self.item_mu = nn.Linear(item_encoder_structure[-1], k)  # mu
        self.item_std = nn.Linear(item_encoder_structure[-1], k)

    @staticmethod
    def _build_encoder(structure, activation, init_weight):
        """Stack Linear+activation pairs for consecutive sizes in `structure`.

        The initialiser is applied to the linear layer itself before it is
        registered. Checking ``encoder[-1]`` after both modules are added
        would inspect the activation and therefore never initialise anything.
        """
        encoder = nn.Sequential()
        for i in range(len(structure) - 1):
            fc = nn.Linear(structure[i], structure[i + 1])
            init_weight(fc.weight)
            nn.init.constant_(fc.bias, 0.0)
            encoder.add_module("fc{}".format(i), fc)
            encoder.add_module("act{}".format(i), activation)
        return encoder

    def to(self, device):
        """Move the module and its plain-tensor factor buffers to `device`."""
        # theta/beta/mu_* are ordinary tensors, not registered parameters or
        # buffers, so nn.Module.to() would leave them behind.
        self.beta = self.beta.to(device=device)
        self.theta = self.theta.to(device=device)
        self.mu_beta = self.mu_beta.to(device=device)
        self.mu_theta = self.mu_theta.to(device=device)
        return super(BiVAE, self).to(device)

    def encode_user_prior(self, x):
        """Map user features to the prior mean of theta."""
        h = self.user_prior_encoder(x)
        return h

    def encode_item_prior(self, x):
        """Map item features to the prior mean of beta."""
        h = self.item_prior_encoder(x)
        return h

    def encode_user(self, x):
        """Return (mu, std) of the user posterior; std is squashed into (0, 1)."""
        h = self.user_encoder(x)
        return self.user_mu(h), torch.sigmoid(self.user_std(h))

    def encode_item(self, x):
        """Return (mu, std) of the item posterior; std is squashed into (0, 1)."""
        h = self.item_encoder(x)
        return self.item_mu(h), torch.sigmoid(self.item_std(h))

    def decode_user(self, theta, beta):
        """Reconstruct user rows: sigmoid(theta @ beta^T)."""
        h = theta.mm(beta.t())
        return torch.sigmoid(h)

    def decode_item(self, theta, beta):
        """Reconstruct item rows: sigmoid(beta @ theta^T)."""
        h = beta.mm(theta.t())
        return torch.sigmoid(h)

    def reparameterize(self, mu, std):
        """Draw mu + eps * std with eps sampled from a standard normal."""
        eps = torch.randn_like(mu)
        return mu + eps * std

    def forward(self, x, user=True, beta=None, theta=None):
        """Encode a batch, sample its factors and reconstruct it.

        With ``user=True`` `x` holds user rows and `beta` must be supplied;
        otherwise `x` holds item rows and `theta` must be supplied.

        Returns
        -------
        tuple
            (sampled factors, reconstruction, posterior mean, posterior std).
        """
        if user:
            mu, std = self.encode_user(x)
            theta = self.reparameterize(mu, std)
            return theta, self.decode_user(theta, beta), mu, std
        else:
            mu, std = self.encode_item(x)
            beta = self.reparameterize(mu, std)
            return beta, self.decode_item(theta, beta), mu, std

    def loss(self, x, x_, mu, mu_prior, std, kl_beta, labels=None, relevance_scores=None):
        """Mean over the batch of ``kl_beta * KL - log-likelihood``.

        `labels` and `relevance_scores` are accepted for compatibility with
        existing callers and are not used in the computation.

        Raises
        ------
        ValueError
            If ``self.likelihood`` is not one of the supported names.
        """
        # Likelihood: callables so that only the selected one is evaluated.
        ll_choices = {
            "bern": lambda: x * torch.log(x_ + EPS) + (1 - x) * torch.log(1 - x_ + EPS),
            "gaus": lambda: -(x - x_) ** 2,
            "pois": lambda: x * torch.log(x_ + EPS) - x_,
        }

        ll_fn = ll_choices.get(self.likelihood, None)
        if ll_fn is None:
            raise ValueError("Supported likelihoods: {}".format(ll_choices.keys()))

        ll = torch.sum(ll_fn(), dim=1)

        # KL term; EPS keeps the logarithm finite should std underflow to 0.
        kld = -0.5 * (1 + 2 * torch.log(std + EPS) - (mu - mu_prior).pow(2) - std.pow(2))
        kld = torch.sum(kld, dim=1)

        return torch.mean(kl_beta * kld - ll)

In [None]:
#@title Default title text

def learn(
    bivae,
    train_set,
    n_epochs,
    user_batch_size,
    item_batch_size,
    user_learn_rate,
    item_learn_rate,
    beta_kl,
    verbose,
    plot_loss = False,
    device=torch.device("cpu"),
    dtype=torch.float32,
):
    """Train `bivae` by alternating item-side and user-side updates.

    Each epoch first updates the item encoder on every item batch (holding
    ``bivae.theta`` fixed), then the user encoder on every user batch (holding
    ``bivae.beta`` fixed). After each optimiser step the batch is re-encoded
    and the sampled factors and posterior means are written back into the
    model. After the last epoch the posterior means are recomputed once more
    for all items and users.

    Parameters
    ----------
    bivae: BiVAE
        Model to train; modified in place.
    train_set: object
        Must provide ``matrix`` (a SciPy sparse user-item matrix),
        ``item_iter`` / ``user_iter`` batch iterators and, when CAP priors are
        enabled, ``user_feature`` / ``item_feature``.
    n_epochs: int
        Number of passes over the data.
    user_batch_size, item_batch_size: int
        Batch sizes for the user and item iterators.
    user_learn_rate, item_learn_rate: float
        Adam learning rates for the two encoders.
    beta_kl: float
        Weight of the KL term in the loss.
    verbose: bool
        Show the progress bar when True.
    plot_loss: bool
        Plot the per-epoch user and item losses after training when True.
    device, dtype:
        Torch device and dtype used for the batches.

    Returns
    -------
    BiVAE
        The same `bivae` instance.
    """

    def _dense(sparse_rows):
        # .toarray() instead of the `.A` shorthand, which newer SciPy
        # releases no longer provide on sparse matrices.
        return torch.tensor(sparse_rows.toarray(), dtype=dtype, device=device)

    user_params = it.chain(
        bivae.user_encoder.parameters(),
        bivae.user_mu.parameters(),
        bivae.user_std.parameters(),
    )

    item_params = it.chain(
        bivae.item_encoder.parameters(),
        bivae.item_mu.parameters(),
        bivae.item_std.parameters(),
    )

    use_user_prior = bivae.cap_priors.get("user", False)
    use_item_prior = bivae.cap_priors.get("item", False)

    if use_user_prior:
        user_params = it.chain(user_params, bivae.user_prior_encoder.parameters())
        user_features = train_set.user_feature.features[: train_set.num_users]

    if use_item_prior:
        item_params = it.chain(item_params, bivae.item_prior_encoder.parameters())
        item_features = train_set.item_feature.features[: train_set.num_items]

    u_optimizer = torch.optim.Adam(params=user_params, lr=user_learn_rate)
    i_optimizer = torch.optim.Adam(params=item_params, lr=item_learn_rate)

    x = train_set.matrix.copy()
    x.data = np.ones_like(x.data)  # Binarize data
    tx = x.transpose().tocsr()  # CSR so that item rows can be sliced cheaply

    progress_bar = trange(1, n_epochs + 1, disable=not verbose)

    u_loss_list = []
    i_loss_list = []
    for _ in progress_bar:

        # item side
        i_sum_loss = 0.0
        i_count = 0
        for i_ids in train_set.item_iter(item_batch_size, shuffle=False):
            i_batch = _dense(tx[i_ids, :])

            # Reconstructed batch
            beta, i_batch_, i_mu, i_std = bivae(i_batch, user=False, theta=bivae.theta)

            i_mu_prior = 0.0  # zero mean for standard normal prior if not CAP prior
            if use_item_prior:
                i_batch_f = torch.tensor(item_features[i_ids], dtype=dtype, device=device)
                i_mu_prior = bivae.encode_item_prior(i_batch_f)

            i_loss = bivae.loss(i_batch, i_batch_, i_mu, i_mu_prior, i_std, beta_kl, i_batch, beta)
            i_optimizer.zero_grad()
            i_loss.backward()
            i_optimizer.step()

            i_sum_loss += i_loss.item()
            i_count += len(i_batch)

            # Re-encode with the updated weights; no graph is needed because
            # only the values are stored.
            with torch.no_grad():
                beta, _, i_mu, _ = bivae(i_batch, user=False, theta=bivae.theta)

            bivae.beta.data[i_ids] = beta.data
            bivae.mu_beta.data[i_ids] = i_mu.data

        # user side
        u_sum_loss = 0.0
        u_count = 0
        for u_ids in train_set.user_iter(user_batch_size, shuffle=False):
            u_batch = _dense(x[u_ids, :])

            # Reconstructed batch
            theta, u_batch_, u_mu, u_std = bivae(u_batch, user=True, beta=bivae.beta)

            u_mu_prior = 0.0  # zero mean for standard normal prior if not CAP prior
            if use_user_prior:
                u_batch_f = torch.tensor(user_features[u_ids], dtype=dtype, device=device)
                u_mu_prior = bivae.encode_user_prior(u_batch_f)

            u_loss = bivae.loss(u_batch, u_batch_, u_mu, u_mu_prior, u_std, beta_kl, u_batch, theta)
            u_optimizer.zero_grad()
            u_loss.backward()
            u_optimizer.step()

            u_sum_loss += u_loss.item()
            u_count += len(u_batch)

            with torch.no_grad():
                theta, _, u_mu, _ = bivae(u_batch, user=True, beta=bivae.beta)
            bivae.theta.data[u_ids] = theta.data
            bivae.mu_theta.data[u_ids] = u_mu.data

        progress_bar.set_postfix(
            loss_i=(i_sum_loss / i_count), loss_u=(u_sum_loss / (u_count))
        )

        u_loss_list.append(u_sum_loss / u_count)
        i_loss_list.append(i_sum_loss / i_count)

    with torch.no_grad():
        # infer mu_beta
        for i_ids in train_set.item_iter(item_batch_size, shuffle=False):
            i_batch = _dense(tx[i_ids, :])
            beta, _, i_mu, _ = bivae(i_batch, user=False, theta=bivae.theta)
            bivae.mu_beta.data[i_ids] = i_mu.data

        # infer mu_theta
        for u_ids in train_set.user_iter(user_batch_size, shuffle=False):
            u_batch = _dense(x[u_ids, :])
            theta, _, u_mu, _ = bivae(u_batch, user=True, beta=bivae.beta)
            bivae.mu_theta.data[u_ids] = u_mu.data

    # plotting losses
    if plot_loss:
        epochs = list(range(1, n_epochs + 1))
        plt.figure(figsize=(10, 5))
        plt.plot(epochs, u_loss_list, label = 'u_loss')
        plt.plot(epochs, i_loss_list, label = 'i_loss')
        plt.xlabel('epoch')
        plt.ylabel('mean loss per row')
        plt.legend()
        plt.show()
    return bivae

## Creating the recommender

In [None]:
#@title Default title text

class BiVAECF(Recommender):
    """Bilateral Variational AutoEncoder for Collaborative Filtering.
    Parameters
    ----------
    name: string, optional, default: 'BiVAECF'
        The name of the recommender model.
    k: int, optional, default: 10
        The dimension of the stochastic user ``theta'' and item ``beta'' factors.
    user_encoder_structure: list, default: [20]
        The number of neurons per hidden layer of the user encoder. The number of
        items is prepended as the input size when the model is built.
    item_encoder_structure: list, default: [20]
        The number of neurons per hidden layer of the item encoder. The number of
        users is prepended as the input size when the model is built.
    act_fn: str, default: 'tanh'
        Name of the activation function used between hidden layers of the auto-encoder.
        Supported functions: ['sigmoid', 'tanh', 'elu', 'relu', 'relu6']
    likelihood: str, default: 'pois'
        The likelihood function used for modeling the observations.
        Supported choices:
        bern: Bernoulli likelihood
        gaus: Gaussian likelihood
        pois: Poisson likelihood
    n_epochs: int, optional, default: 100
        The number of training epochs.
    user_batch_size: int, optional, default: 100
        The batch size used on the user side.
    item_batch_size: int, optional, default: 100
        The batch size used on the item side.
    user_learning_rate: float, optional, default: 0.001
        The Adam learning rate for the user encoder.
    item_learning_rate: float, optional, default: 0.001
        The Adam learning rate for the item encoder.
    beta_kl: float, optional, default: 1.0
        The weight of the KL terms as in beta-VAE.
    cap_priors: dict, optional, default: {"user":False, "item":False}
        When {"user":True, "item":True}, CAP priors are used (see BiVAE paper for details),\
        otherwise the standard Normal is used as a Prior over the user and item latent variables.
    trainable: boolean, optional, default: True
        When False, the model is not trained and Cornac assumes that the model is already \
        pre-trained.
    verbose: boolean, optional, default: False
        When True, some running logs are displayed.
    seed: int, optional, default: None
        Random seed for parameters initialization.
    use_gpu: boolean, optional, default: True
        If True and your system supports CUDA then training is performed on GPUs.
    plot_loss: boolean, optional, default: False
        When True, the per-epoch user and item losses are plotted after training.
    References
    ----------
    * Quoc-Tuan Truong, Aghiles Salah, Hady W. Lauw. " Bilateral Variational Autoencoder for Collaborative Filtering."
    ACM International Conference on Web Search and Data Mining (WSDM). 2021.
    """

    def __init__(
        self,
        name="BiVAECF",
        k=10,
        user_encoder_structure=[20],
        item_encoder_structure = [20],
        act_fn="tanh",
        likelihood="pois",
        n_epochs=100,
        user_batch_size=100,
        item_batch_size = 100,
        user_learning_rate=0.001,
        item_learning_rate=0.001,
        beta_kl=1.0,
        cap_priors={"user": False, "item": False},
        trainable=True,
        verbose=False,
        seed=None,
        use_gpu=True,
        plot_loss = False,
    ):
        Recommender.__init__(self, name=name, trainable=trainable, verbose=verbose)
        self.k = k
        # Copy the containers so instances never alias the shared default
        # objects (or a caller's list/dict); this also accepts tuples.
        self.user_encoder_structure = list(user_encoder_structure)
        self.item_encoder_structure = list(item_encoder_structure)
        self.act_fn = act_fn
        self.likelihood = likelihood
        self.user_batch_size = user_batch_size
        self.item_batch_size = item_batch_size
        self.n_epochs = n_epochs
        self.user_learning_rate = user_learning_rate
        self.item_learning_rate = item_learning_rate
        self.beta_kl = beta_kl
        self.cap_priors = dict(cap_priors)
        self.seed = seed
        self.use_gpu = use_gpu
        self.plot_loss = plot_loss

    def fit(self, train_set, val_set=None):
        """Fit the model to observations.
        Parameters
        ----------
        train_set: :obj:`cornac.data.Dataset`, required
            User-Item preference data as well as additional modalities.
        val_set: :obj:`cornac.data.Dataset`, optional, default: None
            User-Item preference data for model selection purposes (e.g., early stopping).
        Returns
        -------
        self : object
        """
        Recommender.fit(self, train_set, val_set)

        self.device = (
            torch.device("cuda:0")
            if (self.use_gpu and torch.cuda.is_available())
            else torch.device("cpu")
        )

        if self.trainable:
            feature_dim = {"user": None, "item": None}
            if self.cap_priors.get("user", False):
                if train_set.user_feature is None:
                    raise ValueError(
                        "CAP priors for users is set to True but no user features are provided"
                    )
                else:
                    feature_dim["user"] = train_set.user_feature.feature_dim

            if self.cap_priors.get("item", False):
                if train_set.item_feature is None:
                    raise ValueError(
                        "CAP priors for items is set to True but no item features are provided"
                    )
                else:
                    feature_dim["item"] = train_set.item_feature.feature_dim

            if self.seed is not None:
                torch.manual_seed(self.seed)
                torch.cuda.manual_seed(self.seed)

            # A fresh network is built on every call to fit(): the encoder
            # input sizes are taken from the shape of this train_set, so a
            # network built for another dataset could not be reused.
            num_users, num_items = train_set.matrix.shape
            self.bivae = BiVAE(
                k=self.k,
                user_encoder_structure=[num_items] + self.user_encoder_structure,
                item_encoder_structure=[num_users] + self.item_encoder_structure,
                act_fn=self.act_fn,
                likelihood=self.likelihood,
                cap_priors=self.cap_priors,
                feature_dim=feature_dim,
                user_batch_size=self.user_batch_size,
                item_batch_size=self.item_batch_size,
            ).to(self.device)

            learn(
                self.bivae,
                self.train_set,
                n_epochs=self.n_epochs,
                user_batch_size=self.user_batch_size,
                # The item side uses its own batch size, not the user one.
                item_batch_size=self.item_batch_size,
                user_learn_rate=self.user_learning_rate,
                item_learn_rate=self.item_learning_rate,
                beta_kl=self.beta_kl,
                verbose=self.verbose,
                device=self.device,
                plot_loss=self.plot_loss,
            )

        elif self.verbose:
            print("%s is trained already (trainable = False)" % (self.name))

        return self

    def score(self, user_idx, item_idx=None):
        """Predict the scores/ratings of a user for an item.
        Parameters
        ----------
        user_idx: int, required
            The index of the user for whom to perform score prediction.
        item_idx: int, optional, default: None
            The index of the item for which to perform score prediction.
            If None, scores for all known items will be returned.
        Returns
        -------
        res : A scalar or a Numpy array
            Relative scores that the user gives to the item or to all known items
        Raises
        ------
        ScoreException
            If the user (or, when given, the item) is unknown to the training set.
        """

        if item_idx is None:
            if self.train_set.is_unk_user(user_idx):
                raise ScoreException(
                    "Can't make score prediction for (user_id=%d)" % user_idx
                )

            # Scores are decoded from the posterior means, not from samples.
            theta_u = self.bivae.mu_theta[user_idx].view(1, -1)
            beta = self.bivae.mu_beta
            known_item_scores = (
                self.bivae.decode_user(theta_u, beta).cpu().numpy().ravel()
            )

            return known_item_scores
        else:
            if self.train_set.is_unk_user(user_idx) or self.train_set.is_unk_item(
                item_idx
            ):
                raise ScoreException(
                    "Can't make score prediction for (user_id=%d, item_id=%d)"
                    % (user_idx, item_idx)
                )
            theta_u = self.bivae.mu_theta[user_idx].view(1, -1)
            beta_i = self.bivae.mu_beta[item_idx].view(1, -1)
            pred = self.bivae.decode_user(theta_u, beta_i).cpu().numpy().ravel()

            # Map the sigmoid output from [0, 1] onto the training rating range.
            pred = scale(
                pred, self.train_set.min_rating, self.train_set.max_rating, 0.0, 1.0
            )

            return pred

In [None]:
# Directory holding the input CSVs; later cells reuse `file_path`.
file_path = '/kaggle/input/user-data/'
data_file = os.path.join(file_path, "atrad_user_history.csv")

# skiprows=1 discards the first line of the file; `names` supplies the
# column labels instead.
df = pd.read_csv(
    data_file,
    skiprows=1,
    names=["userID", "itemID"],
    dtype={'userID': np.int32},
)
# Keep only the part of each item identifier that precedes the first ".".
df['itemID'] = df['itemID'].map(lambda symbol: symbol.split(".")[0])
# Every row is given the same constant rating of 1.
df = df.assign(rating=1)
df.head(2)

In [None]:
# Collapse repeated rows so each (user, item) interaction appears once.
df = df.drop_duplicates()
# Snapshot of the de-duplicated history taken before `df` is filtered by
# activity; recommend_cb reads it to look up any user's items.
df_all = df.copy()

In [None]:
def filter_dataset(dataframe, min_items, min_interactions):
    """Drop rare items, then drop users left with too few rows.

    Rows are first restricted to items that occur at least `min_items` times;
    on that result, rows are restricted to users that occur at least
    `min_interactions` times. The two steps are applied in that order and are
    not repeated.
    """
    item_rows = dataframe.groupby("itemID")["itemID"].transform("size")
    by_item = dataframe[item_rows >= min_items]

    user_rows = by_item.groupby("userID")["userID"].transform("size")
    return by_item[user_rows >= min_interactions]

In [None]:
# Keep every item (min_items = 1) but only users with at least 4 rows.
# NOTE: this overwrites `df`; re-running the cell filters the already
# filtered frame again.
df = filter_dataset(
    df,
    min_items = 1,
    min_interactions = 4
)

In [None]:
def get_user_rated_item_count(df):
    """Return one row per user with the number of non-null `itemID` entries.

    The result has the columns ``userID`` and ``interaction_count``.
    """
    return df.groupby(['userID'], as_index=False).agg(
        interaction_count=('itemID', 'count')
    )

In [None]:
# Per-user interaction counts on the filtered data; calculate_final_rating
# reads this frame as a global when blending the two predictions.
interaction_count_df = get_user_rated_item_count(df)
interaction_count_df.head()

In [None]:
# Distribution of activity: how many users share each interaction count.
interaction_count_df.groupby("interaction_count").count()

In [None]:
# Display the per-user interaction counts themselves.
interaction_count_df['interaction_count']

In [None]:
# top k items to recommend
TOP_K = 10

# Model parameters
LATENT_DIM = 70
USER_ENCODER_DIMS = [200,100]
ITEM_ENCODER_DIMS = [500,250,100]
ACT_FUNC = "elu"
LIKELIHOOD = "bern"
NUM_EPOCHS = 1000
USER_BATCH_SIZE = 256
ITEM_BATCH_SIZE = 256
# The two encoders are trained with separate Adam learning rates.
USER_LEARNING_RATE = 0.001
ITEM_LEARNING_RATE = 0.0005
PLOT_LOSS = False
BETA_KL = 0.9

bivae = BiVAECF(
    k=LATENT_DIM,
    user_encoder_structure = USER_ENCODER_DIMS,
    item_encoder_structure = ITEM_ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    user_batch_size=USER_BATCH_SIZE,
    item_batch_size = ITEM_BATCH_SIZE,
    user_learning_rate=USER_LEARNING_RATE,
    # Previously USER_LEARNING_RATE was passed here, which left
    # ITEM_LEARNING_RATE defined but never used.
    item_learning_rate=ITEM_LEARNING_RATE,
    plot_loss = PLOT_LOSS,
    seed=69,
    beta_kl=BETA_KL,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)

# Content-based approach

In [None]:
# Item-item similarity table read from the same input directory as the
# interaction history (`file_path` is set in the data-loading cell).
cos_sim_data_file = os.path.join(file_path,"cos_sim.csv")
cos_sim_data = pd.read_csv(cos_sim_data_file)

In [None]:
# Index rows by symbol and label the column axis 'symbol' as well, so that
# cos_sim_data.loc[s] returns the similarity row for symbol s.
# NOTE: not idempotent. After one run 'symbol' is no longer a column, so
# running this cell again raises KeyError unless the load cell is re-run.
cos_sim_data = cos_sim_data.set_index('symbol').rename_axis('symbol', axis=1)
cos_sim_data

In [None]:
def give_recommendations(stock_business_code,k,print_recommendation):
    """Return the `k` entries ranked right after the top one for a symbol.

    The row of the global `cos_sim_data` for `stock_business_code` is sorted
    in descending order and positions 1..k are returned as a
    ``{symbol: similarity}`` dict. Position 0 is skipped, presumably because
    it is the symbol's similarity with itself (verify against the data).
    When `print_recommendation` equals True the ranked symbols are printed.
    """
    ranked = cos_sim_data.loc[stock_business_code].sort_values(ascending=False)
    neighbours = ranked.iloc[1:k + 1]

    symbols = neighbours.index.tolist()
    result = dict(zip(symbols, neighbours.values.tolist()))

    if print_recommendation == True:
        print('The prefered stock is : {} \n'.format(stock_business_code))
        for position, stock in enumerate(symbols, start=1):
            print('The number %i recommended stock is this one: %s \n' % (position, stock))
    return result

In [None]:
def top_k_recommendations(k,train_):
    """Content-based top-`k` list per user, derived from the user's items.

    For every (user, item) row of `train_` the `k` nearest items of that item
    are fetched with `give_recommendations`. All candidates of a user are
    pooled, each candidate keeps its highest similarity, and the `k` best are
    returned.

    Parameters
    ----------
    k: int
        Number of neighbours fetched per item and number of rows kept per user.
    train_: pandas.DataFrame
        Must contain the columns ``userID`` and ``itemID``.

    Returns
    -------
    pandas.DataFrame
        Columns ``userID`` (int), ``itemID`` and ``content_prediction``.
    """
    # Collect the rows in a list and build the frame once; appending with
    # .loc[len(df)] copies the frame on every row.
    rows = []
    for user_id, security in zip(train_['userID'], train_['itemID']):
        recomm = give_recommendations(security, k, False)
        rows.append([user_id, security, list(recomm.keys()), list(recomm.values())])

    recommendations = pd.DataFrame(
        rows, columns=['userID', 'Security', 'itemID', 'content_prediction']
    )

    # One row per (user, source item, candidate item).
    exploded = recommendations.explode(['itemID', 'content_prediction'])
    # Sorting by score first makes drop_duplicates keep each candidate's
    # best score per user.
    best_per_candidate = (
        exploded
        .sort_values(by=['content_prediction'], ascending=False)
        .drop_duplicates(['userID', 'itemID'], keep='first')
    )
    top_k = (
        best_per_candidate
        .sort_values(['userID', 'content_prediction'], ascending=[True, False])
        .groupby('userID')
        .head(k)
    )

    # drop/assign return new frames, so nothing is written into the slice
    # produced by head().
    return (
        top_k
        .drop(columns='Security')
        .assign(userID=lambda frame: frame['userID'].astype(int))
    )

# Cross-validation for the hybrid recommender

In [None]:
from sklearn.model_selection import KFold
import statistics
from sklearn.model_selection import StratifiedKFold

# Split the data into four folds using stratified KFold

def cross_val_rec_sys(df, model, n_splits=4, random_state=None):
    """Evaluate the hybrid recommender with stratified k-fold cross-validation.

    The interactions are shuffled and split into `n_splits` folds stratified
    on ``userID``, so every user's rows are spread across the folds. Each fold
    fits `model` on the training part and scores the blended predictions with
    `evaluate_recsys`.

    Parameters
    ----------
    df: pandas.DataFrame
        Interactions with at least the columns ``userID`` and ``itemID``.
        StratifiedKFold needs at least `n_splits` rows per user.
    model:
        Recommender passed to `evaluate_recsys`; it is refitted on every fold.
    n_splits: int, default 4
        Number of folds.
    random_state: int or None, default None
        Seed for the initial shuffle; None keeps the shuffle unseeded.

    Returns
    -------
    tuple
        (mean NDCG, per-fold NDCGs, cornac train sets, train frames,
        cornac test sets, test frames, blended predictions of the last fold).
    """
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    NDCGs = []

    skf = StratifiedKFold(n_splits=n_splits)

    y = df['userID']
    X = df.drop(['userID'], axis = 1)

    train_sets = []
    train_s = []
    test_sets = []
    test_s = []

    for train_indices, test_indices in skf.split(X, y):
        train_ = df.iloc[train_indices]
        test_ = df.iloc[test_indices]

        # Only users and items seen in training can be scored.
        test_ = test_[test_["userID"].isin(train_["userID"].unique())]
        test_ = test_[test_["itemID"].isin(train_["itemID"].unique())]

        train_set = Dataset.from_uir(train_.itertuples(index=False), seed= 69)
        test_set = Dataset.from_uir(test_.itertuples(index=False), seed= 69)

        # evaluate_recsys returns exactly (ndcg, results); unpacking more
        # names than that raises ValueError.
        ndcg_score, results = evaluate_recsys(model, train_set, train_, test_)
        print(ndcg_score)
        NDCGs.append(ndcg_score)

        train_sets.append(train_set)
        train_s.append(train_)
        test_sets.append(test_set)
        test_s.append(test_)
    return statistics.mean(NDCGs), NDCGs, train_sets, train_s, test_sets,test_s,results

In [None]:
def calculate_final_rating(df):
    """Blend collaborative and content scores row by row.

    For each row the weight ``r = 1 / (1 + exp(-n / 5))`` is computed from the
    user's interaction count ``n`` (looked up in the global
    `interaction_count_df`), and the rating is
    ``r * cf_prediction + (1 - r) * content_prediction``. A larger count thus
    shifts weight towards the collaborative score.

    Returns a list with one rating per row of `df`, in row order.
    """
    counts = interaction_count_df
    blended = []
    columns = zip(df['userID'], df['cf_prediction'], df['content_prediction'])
    for user_id, cf_score, cb_score in columns:
        n = counts[counts['userID'] == user_id]['interaction_count'].values[0]
        weight = 1 / (1 + np.exp(-n / 5))
        blended.append(weight * cf_score + (1 - weight) * cb_score)
    return blended

In [None]:
def merge_df(cf_predictions,content_predictions):
    """Outer-join the two prediction frames and add the blended score.

    The frames are merged on the columns they share. Missing values in the
    result are filled with 0.0 (for pairs scored by only one source this sets
    the other score to 0.0). The value from `calculate_final_rating` is stored
    in a new ``prediction`` column.
    """
    combined = cf_predictions.merge(content_predictions, how='outer').fillna(0.0)
    combined['prediction'] = calculate_final_rating(combined)
    return combined

In [None]:
def evaluate_recsys(model, train_set, train_, test_, n_content_candidates=287, top_k=None):
    """Fit `model` on one fold and score the CB, CF and blended rankings.

    Parameters
    ----------
    model:
        Recommender exposing ``fit``; refitted on `train_set`.
    train_set:
        Training data in the form expected by ``model.fit``.
    train_, test_: pandas.DataFrame
        Training and held-out interactions with ``userID`` / ``itemID`` columns.
    n_content_candidates: int, default 287
        Number of content-based candidates requested per item and kept per
        user (previously hard-coded).
    top_k: int or None, default None
        Cut-off for NDCG; None uses the notebook-level ``TOP_K``.

    Returns
    -------
    tuple
        (NDCG@k of the blended ranking, blended prediction frame).
    """
    k = TOP_K if top_k is None else top_k

    model.fit(train_set)
    cb_preds = top_k_recommendations(n_content_candidates, train_)
    cf_preds = predict_ranking(model, train_, usercol='userID', itemcol='itemID', remove_seen=True)

    # Bring both score columns onto [0, 1] so they can be blended. Each
    # scaler is fitted over all users' scores at once.
    content_predictions = cb_preds.assign(
        content_prediction=MinMaxScaler().fit_transform(
            np.array(cb_preds['content_prediction']).reshape(-1, 1)
        ).ravel()
    )
    all_predictions = cf_preds.assign(
        prediction=MinMaxScaler().fit_transform(
            np.array(cf_preds['prediction']).reshape(-1, 1)
        ).ravel()
    )

    eval_ndcg_cb = ndcg_at_k(test_, content_predictions, col_user='userID', col_item ='itemID',col_prediction='content_prediction', k=k)
    print('eval_ndcg_cb: {}'.format(eval_ndcg_cb))

    eval_ndcg_cf = ndcg_at_k(test_, all_predictions, col_user='userID', col_item ='itemID',col_prediction='prediction', k=k)
    print('eval_ndcg_cf: {}'.format(eval_ndcg_cf))

    # merge_df expects the collaborative score under 'cf_prediction'.
    all_predictions = all_predictions.rename(columns={'prediction': 'cf_prediction'})
    results = merge_df(all_predictions, content_predictions)

    eval_ndcg = ndcg_at_k(test_, results, col_user='userID', col_item ='itemID',col_prediction='prediction', k=k)

    return eval_ndcg,results

In [None]:
def hit_rate(model, train_set, train_, test_, k):
    """Fraction of users whose sampled held-out item is ranked in the top `k`.

    For every user of `train_set` that has at least one held-out item known
    to the training set, one such item is drawn at random (the global RNG is
    seeded with 0 first). The model ranks all items the user has not
    interacted with in `train_`, and a hit is counted when the drawn item is
    among the first `k`.

    Users without any usable held-out item are skipped and left out of the
    denominator; before, they made ``random.choice`` fail on an empty sequence.

    Parameters
    ----------
    model:
        Object with ``rank(user_idx, item_indices)`` whose first return value
        is the ranked item indices.
    train_set:
        Object exposing ``user_ids``, ``item_ids`` and ``item_indices``. A
        user's index is taken to be its position in ``user_ids``.
    train_, test_: pandas.DataFrame
        Training and held-out interactions with ``userID`` / ``itemID`` columns.
    k: int
        Length of the recommendation list.

    Returns
    -------
    float
        hits / evaluated users, or 0.0 when no user could be evaluated.
    """
    item_ids = list(train_set.item_ids)
    item_indices = list(train_set.item_indices)

    # Group the interactions per user once instead of filtering both frames
    # for every user.
    seen_by_user = {}
    for user_id, item_id in zip(train_['userID'], train_['itemID']):
        seen_by_user.setdefault(user_id, set()).add(item_id)
    held_out_by_user = {}
    for user_id, item_id in zip(test_['userID'], test_['itemID']):
        held_out_by_user.setdefault(user_id, set()).add(item_id)

    hits = 0
    evaluated_users = 0
    random.seed(0)
    # enumerate() yields the same index list(user_ids).index(user_id) would,
    # without a linear search per user.
    for user, user_id in enumerate(train_set.user_ids):
        held_out_items = held_out_by_user.get(user_id, set())
        # Built in item order so the random draw picks by the same position
        # as a filter over the item table would.
        hold_out_indices = [
            index for item_id, index in zip(item_ids, item_indices)
            if item_id in held_out_items
        ]
        if not hold_out_indices:
            continue
        hold_item = random.choice(hold_out_indices)

        seen_items = seen_by_user.get(user_id, set())
        unknown_items_indices = np.array(
            [
                index for item_id, index in zip(item_ids, item_indices)
                if item_id not in seen_items
            ],
            dtype=np.int64,
        )

        # generate recommendations
        ranked_items = model.rank(user, unknown_items_indices)[0]
        top_recommendations = list(ranked_items)[:k]

        evaluated_users += 1
        if any(item == hold_item for item in top_recommendations):
            hits += 1

    if evaluated_users == 0:
        return 0.0
    return hits / evaluated_users


In [None]:
# Run the cross-validation on the filtered interactions. `results` holds the
# blended predictions of the last fold only.
avg_ndcg, history, train_sets, train_s, test_sets, test_s,results = cross_val_rec_sys(df, bivae)
print('avg ndcg : {}'.format(avg_ndcg))
print('ndcgs : {}'.format(history))

In [None]:
# Persist the last fold's blended predictions to the working directory.
results.to_csv('results.csv',index=False)

In [None]:
# Use the fold at index 3 for the interactive recommender below. With the
# four folds used by cross_val_rec_sys this is the last fold, i.e. the one
# `bivae` was fitted on most recently.
train_set = train_sets[3]
train_ = train_s[3]
test_ = test_s[3]
# Lookup table between raw item IDs and the indices reported by train_set.
item_id_to_indices_df = pd.DataFrame()
item_id_to_indices_df['itemID'] = train_set.item_ids
item_id_to_indices_df['itemIndice'] = train_set.item_indices

In [None]:
# NOTE: despite its name this holds the raw item IDs of the selected fold.
all_items_indices = train_set.item_ids
def recommender(user_id, k = 10):
    """Return the raw IDs of the `k` best unseen items for `user_id`.

    The user's index is its position in ``train_set.user_ids``. Items the
    user already has in `train_` are excluded, the remaining ones are ranked
    by the fitted global `bivae` model, and the first `k` are mapped back to
    raw item IDs.

    Raises
    ------
    ValueError
        If `user_id` is not among ``train_set.user_ids``.
    """
    # grab user indice
    user = list(train_set.user_ids).index(user_id)

    # filter out the items the user interacted with during training
    train_items = train_[train_['userID'] == user_id]['itemID'].values
    unknown_items = list(set(all_items_indices) - set(train_items))
    unknown_items_indices = item_id_to_indices_df[item_id_to_indices_df['itemID'].isin(unknown_items)].itemIndice.values

    # generate recommendations; rank() returns the ranked indices first
    ranked_items = bivae.rank(user, unknown_items_indices)[0]
    top_recommendations = list(ranked_items)[:k]

    # Build the index -> ID mapping once instead of scanning the lookup
    # table for every recommended item.
    index_to_item_id = dict(
        zip(item_id_to_indices_df['itemIndice'], item_id_to_indices_df['itemID'])
    )
    return [index_to_item_id[item] for item in top_recommendations]

## Final recommendations

In [None]:
def recommend_cb(userID, k):
    """Content-based fallback: up to `k` symbols similar to the user's items.

    For every item the user has in the global `df_all`, the `k` entries
    ranked after the top one in that item's `cos_sim_data` row are pooled
    into one dict (a later item overwrites the score of a symbol seen
    before). The pool is passed through `remove_duplicates`, ordered by
    descending score, and the first `k` symbols are returned.
    """
    owned_items = df_all.loc[df_all['userID'] == userID, 'itemID'].tolist()

    pooled = {}
    for symbol in owned_items:
        ranked = cos_sim_data.loc[symbol].sort_values(ascending=False)
        neighbours = ranked.iloc[1:k + 1]
        pooled.update(zip(neighbours.index.tolist(), neighbours.values.tolist()))

    deduplicated = remove_duplicates(pooled)
    by_score = sorted(deduplicated.items(), key=operator.itemgetter(1), reverse=True)
    return [symbol for symbol, _ in by_score[:k]]

In [None]:
def remove_duplicates(rec_dict):
    """Return a copy of `rec_dict` keeping only the first key for each value.

    Entries are visited in insertion order; an entry is dropped when its
    value equals the value of an entry that was already kept. Note that the
    comparison is on values, so distinct keys sharing a value collapse to one.
    """
    kept = {}
    for key in rec_dict:
        value = rec_dict[key]
        if value in kept.values():
            continue
        kept[key] = value
    return kept

In [None]:
sec_gics = os.path.join(file_path,"fullcompanylist.csv")

security_file_withgics = pd.read_csv(sec_gics)
security_file_withgics = security_file_withgics.drop(['Unnamed: 0', 'Unnamed: 0.1', 'buisnesssummary', 'address'], axis=1)
# Normalise the symbol column in one vectorised pass: strip surrounding
# parentheses, then keep the part before the first ".". The earlier row loop
# called DataFrame.replace on the whole frame once per row, which copied the
# frame every iteration and could also rewrite matching cells in other columns.
security_file_withgics['symbol'] = (
    security_file_withgics['symbol'].str.strip('()').str.split('.').str[0]
)
security_file_withgics = security_file_withgics.drop_duplicates()
security_file_withgics = security_file_withgics.reset_index(drop=True)
security_file_withgics

In [None]:
def view_recommendation(recommendations):
    """Look up each recommended symbol in the global `security_file_withgics`.

    For every symbol the first matching row is taken; the rows are returned
    as a DataFrame with the columns ``name``, ``symbol`` and ``gics``. A
    symbol without a matching row raises IndexError.
    """
    matched_rows = [
        tuple(
            security_file_withgics.loc[
                security_file_withgics['symbol'] == symbol
            ].values[0]
        )
        for symbol in recommendations
    ]
    return pd.DataFrame(matched_rows, columns=['name', 'symbol', 'gics'])

In [None]:
def recommend_hybrid(userID,k):
    """Return a table of `k` recommended stocks for `userID`.

    Users present in the filtered interaction frame `df` get their list from
    `recommender`; all other users fall back to the content-based
    `recommend_cb`. The chosen source is printed before the table is built.
    """
    print('Top {} recommended stocks for user {}'.format(k,userID))
    if (userID in df['userID'].values):
        print('Recommendations from weighted hybrid system:')
        return view_recommendation(recommender(userID, k))
    else:
        print('Recommendations from content based system:')
        return view_recommendation(recommend_cb(userID, k))

In [None]:
# Example: the ten recommended stocks for the user with ID 3.
final_recommendations = recommend_hybrid(3,10)
final_recommendations