In [None]:
import pandas as pd
import numpy as np
import math
from scipy.sparse.linalg import svds
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import NMF
from numpy import linalg as LA
from sklearn.metrics import mean_squared_error
from math import sqrt


# **Import the articles file, which contains the information about every article the users have interacted with**

# In[512]:


# Load the article file, keep only the rows whose event is a share, and
# remove the unwanted columns (selected by position).
articles_df = (
    pd.read_csv('article.csv')
    .loc[lambda frame: frame['eventType'] == 'CONTENT SHARED']
    .pipe(lambda frame: frame.drop(columns=frame.columns[[0, 1, 3, 4, 5, 6, 7, 8, 11, 12]]))
)
articles_df.head()


# In[513]:


# True when at least one remaining cell is missing
articles_df.isna().to_numpy().any()


# **Import the user interaction file, which records how each user interacted with each article**

# In[5]:


# Keep the untouched CSV under its own name. The pruning cell below selects
# columns by position, so it must always start from the raw frame: re-running
# it against an already-pruned `df` would index the wrong columns or raise.
interactions_raw = pd.read_csv('User_Interaction.csv')


# In[6]:


# Drop the unused columns (by position) from the raw frame
df = interactions_raw.drop(columns=interactions_raw.columns[[0, 4, 5, 6, 7]])
df.head()


# In[514]:


# (rows, columns) of the interaction table; the author recorded about 72312 rows
df.shape


# In[515]:


# Number of distinct users (the author recorded 1895)
len(df.personId.unique())


# In[516]:


# Number of distinct articles users interacted with (the author recorded 3171)
len(df.contentId.unique())


# In[517]:


# The different kinds of interaction users had with the articles
df['eventType'].unique()


# In[11]:


# Most frequent interaction type
df['eventType']. value_counts(). idxmax()


# In[12]:


# Least frequent interaction type
df['eventType']. value_counts(). idxmin()


# In[13]:


# Number of rows per interaction type
df['eventType']. value_counts()


# **Assign a weight to each interaction type according to its priority (the higher the weight, the higher the priority)**

# In[518]:


# Weight table: an event type with a higher priority gets a larger weight.
eventType_weight = dict([
    ('VIEW', 1.0),
    ('LIKE', 2.0),
    ('BOOKMARK', 2.5),
    ('FOLLOW', 3.0),
    ('COMMENT CREATED', 4.0),
])

# Attach the weight of each row's event type (an unlisted type raises KeyError).
df['eventWeight'] = df['eventType'].apply(lambda event_name: eventType_weight[event_name])


# In[519]:


df.head(n=10)


# In[16]:


# Number of interaction rows for every (user, article) pair
count_df_content = df.groupby(['personId', 'contentId']).size()
count_df_content


# In[17]:


# Number of distinct articles each user interacted with
count_df = count_df_content.groupby(level='personId').size()
count_df.head(n=10)


# In[18]:


# Only users with enough interactions are kept, to avoid the cold-start problem


# In[19]:


# Ids of the users who interacted with at least 5 distinct articles
interaction_threshold_users = count_df.loc[count_df >= 5].reset_index().loc[:, ['personId']]
interaction_threshold_users.head(n=10)


# In[20]:


# How many users meet the threshold
len(interaction_threshold_users)


# In[21]:


# Inner join on personId: keeps only the interaction rows of the users above
df_new = df.merge(interaction_threshold_users, on='personId')


# In[22]:


df_new.head(n=10)


# In[23]:


df_new.shape


# In[520]:


# Using Log transformation, we can fix the skewness of the data


# In[25]:


def smooth_user_preference(x):
    """Dampen an interaction total ``x`` by returning the base-2 log of ``1 + x``."""
    shifted_total = 1 + x  # the +1 offset maps a total of 0 to a score of 0
    return math.log(shifted_total, 2)
    
# Sum the event weights of every (user, article) pair, then compress each
# total with log2(1 + total) so very active pairs do not dominate.
df_log = (
    df_new
    .groupby(['personId', 'contentId'])['eventWeight']
    .agg('sum')
    .map(smooth_user_preference)
    .reset_index()
)


# In[26]:


df_log.head(n=10)


# In[27]:


df_log.shape


# In[522]:


# Split the (user, article, strength) rows into training and validation sets.
# Stratifying on personId distributes each user's rows proportionally between
# the two parts. The fixed random_state makes the split, and every matrix and
# recommendation derived from it, reproducible across kernel restarts.
df_train, df_test = train_test_split(
    df_log,
    stratify=df_log['personId'],
    test_size=0.2,
    random_state=42,
)
print("Training data", len(df_train))
print("Testing data", len(df_test))


# In[524]:


# Reshape the training rows into a user x article table of interaction
# strengths; pairs without a recorded interaction become 0.
df_pivot = df_train.pivot(values='eventWeight', index='personId', columns='contentId').fillna(value=0)


# In[30]:


df_pivot.head(n=20)


# In[31]:


pivot_matrix = df_pivot.to_numpy()


# In[32]:


pivot_matrix


# In[557]:


# User ids in the row order of the pivot table
df_list = df_pivot.index.tolist()


# In[558]:


# Compressed sparse row representation of the pivot table
sparse_matrix_df = csr_matrix(pivot_matrix)


# In[559]:


sparse_matrix_df


# In[560]:


sparse_matrix_df.toarray()


# # Matrix Factorization
# 
# **Three factorization approaches are used for this dataset:**
# 1) Singular Value Decomposition (SVD)
# 
# 2) Non-negative Matrix Factorization (NMF)
# 
# 3) Stochastic Gradient Descent (SGD)

# # SVD

# In[530]:


# The number of latent factors was chosen by trial and error

No_OF_FACTORS = 45

# Truncated SVD: factor the user x article matrix into U, the singular values
# and Vt. The 1-D singular values keep their own name so the np.diag cell
# below can be re-run safely. Rebinding `sigma` to np.diag(sigma) would flip
# it between a vector and a matrix on every re-run.
U, singular_values, Vt = svds(sparse_matrix_df, k=No_OF_FACTORS)


# In[531]:


U.shape


# In[532]:


Vt.shape


# In[533]:


# Square matrix with the singular values on its diagonal
sigma = np.diag(singular_values)
sigma.shape


# In[534]:


# Multiplying the three factors back together gives the predicted matrix
predicted_df = np.dot(np.dot(U, sigma), Vt)
predicted_df.shape


# In[243]:


# Min-max scale the reconstructed scores using the matrix-wide minimum and maximum
predicted_df_norm = np.divide(
    predicted_df - predicted_df.min(),
    predicted_df.max() - predicted_df.min(),
)


# In[244]:


predicted_df_norm


# In[245]:


# Label the scores with the pivot table's axes, then flip so that rows are
# articles and columns are users
pred_df = pd.DataFrame(predicted_df_norm, index=df_pivot.index, columns=df_pivot.columns).T
pred_df


# # Recommendation Model for SVD

# In[535]:


class CFRecommender:
    """Serve top-N article recommendations from a precomputed score table.

    Parameters
    ----------
    cf_predictions_df : pandas.DataFrame
        Predicted scores with one row per article (index named ``contentId``)
        and one column per user id.
    items_df : pandas.DataFrame, optional
        Article metadata with ``contentId``, ``title`` and ``url`` columns.
        When omitted, recommendations carry only the score and the article id.
    """

    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def recommend_items(self, user_id, items_to_ignore=None, topn=None):
        """Return the highest-scoring articles for ``user_id``.

        Parameters
        ----------
        user_id : hashable
            Column label of the user in the predictions table; an unknown id
            raises ``KeyError``.
        items_to_ignore : iterable, optional
            Article ids to leave out of the result (for example, articles the
            user has already interacted with). ``None`` means none are excluded.
        topn : int, optional
            Maximum number of rows to return; ``None`` returns every candidate.

        Returns
        -------
        pandas.DataFrame
            Rows ordered by descending ``weight``. Columns are
            ``weight, contentId, title, url`` when metadata is available and
            ``weight, contentId`` otherwise.
        """
        ignored = [] if items_to_ignore is None else list(items_to_ignore)

        # One row per article: its id and this user's predicted score.
        user_scores = self.cf_predictions_df[user_id].rename('weight').reset_index()

        # Filter first, then sort once; the result is already in final order.
        candidates = user_scores[~user_scores['contentId'].isin(ignored)]
        recommendations_df = candidates.sort_values('weight', ascending=False)
        if topn is not None:
            recommendations_df = recommendations_df.head(topn)

        if self.items_df is None:
            # No metadata to join; return the bare ranking instead of failing.
            return recommendations_df[['weight', 'contentId']].reset_index(drop=True)

        # A left join keeps the ranking order and every recommended article.
        enriched = recommendations_df.merge(self.items_df, how='left', on='contentId')
        return enriched[['weight', 'contentId', 'title', 'url']]
    
# Recommender backed by the SVD score table and the article metadata
cf_recommender_model = CFRecommender(cf_predictions_df=pred_df, items_df=articles_df)


# In[537]:


# Ten highest-scoring articles for the given user
cf_recommender_model.recommend_items(user_id=-9016528795238256703, topn=10)


# # NMF

# In[257]:


# Same number of latent components as the SVD model. The fixed random_state
# makes the factorization, and therefore the recommendations, reproducible.
nmf_model = NMF(n_components=45, random_state=42)


# In[265]:


nmf_model.fit(sparse_matrix_df)


# In[266]:


Theta = nmf_model.transform(sparse_matrix_df)  # user factors, one row per user
M = nmf_model.components_.T                    # article factors, one row per article

# Predicted user x article scores: a single product instead of multiplying the
# transposed factors and transposing the result back.
NMF_pred = Theta.dot(M.T)


# In[267]:


NMF_pred


# In[268]:


# Min-max scale the scores using the matrix-wide minimum and maximum
NMF_pred_norm = (NMF_pred - NMF_pred.min()) / (NMF_pred.max() - NMF_pred.min())
NMF_pred_norm


# In[269]:


# Label with the pivot table's axes, then flip so rows are articles and columns are users
nmf_pred_df = pd.DataFrame(NMF_pred_norm, columns=df_pivot.columns, index=df_pivot.index).transpose()


# In[270]:


class NMFRecommender:
    """Serve top-N article recommendations from NMF-predicted scores.

    Parameters
    ----------
    nmf_predictions_df : pandas.DataFrame
        Predicted scores with one row per article (index named ``contentId``)
        and one column per user id.
    items_df : pandas.DataFrame, optional
        Article metadata with ``contentId``, ``title`` and ``url`` columns.
        When omitted, recommendations carry only the score and the article id.
    """

    def __init__(self, nmf_predictions_df, items_df=None):
        self.nmf_predictions_df = nmf_predictions_df
        self.items_df = items_df

    def recommend_items(self, user_id, items_to_ignore=None, topn=None):
        """Return the highest-scoring articles for ``user_id``.

        Parameters
        ----------
        user_id : hashable
            Column label of the user in the predictions table; an unknown id
            raises ``KeyError``.
        items_to_ignore : iterable, optional
            Article ids to leave out of the result (for example, articles the
            user has already interacted with). ``None`` means none are excluded.
        topn : int, optional
            Maximum number of rows to return; ``None`` returns every candidate.

        Returns
        -------
        pandas.DataFrame
            Rows ordered by descending ``weight``. Columns are
            ``weight, contentId, title, url`` when metadata is available and
            ``weight, contentId`` otherwise.
        """
        ignored = [] if items_to_ignore is None else list(items_to_ignore)

        # One row per article: its id and this user's predicted score.
        user_scores = self.nmf_predictions_df[user_id].rename('weight').reset_index()

        # Filter first, then sort once; the result is already in final order.
        candidates = user_scores[~user_scores['contentId'].isin(ignored)]
        recommendations_df = candidates.sort_values('weight', ascending=False)
        if topn is not None:
            recommendations_df = recommendations_df.head(topn)

        if self.items_df is None:
            # No metadata to join; return the bare ranking instead of failing.
            return recommendations_df[['weight', 'contentId']].reset_index(drop=True)

        # A left join keeps the ranking order and every recommended article.
        enriched = recommendations_df.merge(self.items_df, how='left', on='contentId')
        return enriched[['weight', 'contentId', 'title', 'url']]
    
# Recommender backed by the NMF score table and the article metadata
nmf_recommender_model = NMFRecommender(nmf_predictions_df=nmf_pred_df, items_df=articles_df)


# In[271]:


# Ten highest-scoring articles for the given user
nmf_recommender_model.recommend_items(user_id=-9016528795238256703, topn=10)


# # SGD

# In[561]:


# Percentage of user/article cells that hold a non-zero value, i.e. the
# matrix fill rate (a small percentage means the matrix is very sparse).
sparsity = (
    float(len(sparse_matrix_df.nonzero()[0]))
    / (sparse_matrix_df.shape[0] * sparse_matrix_df.shape[1])
    * 100
)
print('{:.2f}%'.format(sparsity))


# In[562]:


def copy_training_matrix(eventWeight):
    """Return an independent copy of the interaction matrix for SGD training.

    The helper has its own name so it does not replace scikit-learn's
    ``train_test_split`` imported at the top of the notebook. The earlier
    train/validation split cell calls that import, and would break on re-run
    if the name were rebound to this one-argument copy function.
    """
    return eventWeight.copy()


# In[563]:


train = copy_training_matrix(sparse_matrix_df)


# In[564]:


pred_df


# In[565]:


def predictions(P,Q):
    """Return the dot product of the transpose of ``P`` with ``Q``."""
    user_factors_t = np.transpose(P)
    return user_factors_t.dot(Q)


# In[566]:


lmbda = 0.4  # Regularization parameter
k = 4  # Number of latent factors
m, n = train.shape  # Number of users and items

n_epochs = 100  # Number of epochs
alpha = 0.01  # Learning rate

# A dedicated, seeded generator makes the initial factors (and so the trained
# model) reproducible without touching NumPy's global random state.
sgd_rng = np.random.RandomState(42)
P = 3 * sgd_rng.rand(k, m)  # Latent user feature matrix
Q = 3 * sgd_rng.rand(k, n)  # Latent article feature matrix


# In[567]:


train.shape


# In[568]:




def rmse(predictions, ground_truth):
    """Root-mean-square error over the non-zero entries of ``ground_truth``.

    Parameters
    ----------
    predictions : array-like or scipy sparse matrix
        Predicted scores, indexable with the positions returned by
        ``ground_truth.nonzero()``.
    ground_truth : numpy.ndarray or scipy sparse matrix
        Observed scores; zero entries are treated as "not observed" and skipped.

    Returns
    -------
    float
        Square root of the mean squared difference at the observed positions.

    Raises
    ------
    ValueError
        If ``ground_truth`` has no non-zero entries.
    """
    observed_positions = ground_truth.nonzero()
    # Indexing a scipy sparse matrix yields a 2-D np.matrix, so both sides are
    # coerced to flat ndarrays before comparing; dense input is unaffected.
    predicted = np.asarray(predictions[observed_positions], dtype=float).ravel()
    observed = np.asarray(ground_truth[observed_positions], dtype=float).ravel()
    if observed.size == 0:
        raise ValueError("ground_truth has no non-zero entries to score")
    return float(np.sqrt(np.mean((predicted - observed) ** 2)))


# In[569]:


# Stochastic gradient descent over the observed (non-zero) training entries.
# The whole pass is repeated n_epochs times; a single pass leaves the factors
# close to their random initial values.
users, items = train.nonzero()
# Read the observed values once, outside the loops, so the matrix is not
# indexed element by element on every epoch.
observed_values = np.asarray(train[users, items]).ravel()
for epoch in range(n_epochs):
    for u, i, observed in zip(users, items, observed_values):
        error = observed - predictions(P[:, u], Q[:, i])
        # Gradient step with L2 regularization on both factor vectors. As in
        # the original, Q's update uses the freshly updated P[:, u].
        P[:, u] += alpha * (error * Q[:, i] - lmbda * P[:, u])
        Q[:, i] += alpha * (error * P[:, u] - lmbda * Q[:, i])
        


# In[570]:


# Full user x article score matrix from the learned factors. The helper
# defined earlier is `predictions`; `prediction` was an undefined name.
SGD_prediction = predictions(P, Q)
SGD_prediction


# In[571]:


# Min-max scale the SGD scores using the matrix-wide minimum and maximum
SGD_pred_norm = np.divide(
    SGD_prediction - SGD_prediction.min(),
    SGD_prediction.max() - SGD_prediction.min(),
)


# In[597]:


# Rows are labelled with pred_df's column labels and columns with pred_df's
# index labels; the frame is then flipped so it has pred_df's orientation.
sgd_pred_df = pd.DataFrame(SGD_pred_norm, index=pred_df.columns, columns=pred_df.index).T


# In[598]:


class SGDRecommender:
    """Serve top-N article recommendations from SGD-predicted scores.

    Parameters
    ----------
    sgd_predictions_df : pandas.DataFrame
        Predicted scores with one row per article (index named ``contentId``)
        and one column per user id.
    items_df : pandas.DataFrame, optional
        Article metadata with ``contentId``, ``title`` and ``url`` columns.
        When omitted, recommendations carry only the score and the article id.
    """

    def __init__(self, sgd_predictions_df, items_df=None):
        self.sgd_predictions_df = sgd_predictions_df
        self.items_df = items_df

    def recommend_items(self, user_id, items_to_ignore=None, topn=None):
        """Return the highest-scoring articles for ``user_id``.

        Parameters
        ----------
        user_id : hashable
            Column label of the user in the predictions table; an unknown id
            raises ``KeyError``.
        items_to_ignore : iterable, optional
            Article ids to leave out of the result (for example, articles the
            user has already interacted with). ``None`` means none are excluded.
        topn : int, optional
            Maximum number of rows to return; ``None`` returns every candidate.

        Returns
        -------
        pandas.DataFrame
            Rows ordered by descending ``weight``. Columns are
            ``weight, contentId, title, url`` when metadata is available and
            ``weight, contentId`` otherwise.
        """
        ignored = [] if items_to_ignore is None else list(items_to_ignore)

        # One row per article: its id and this user's predicted score.
        user_scores = self.sgd_predictions_df[user_id].rename('weight').reset_index()

        # Filter first, then sort once; the result is already in final order.
        candidates = user_scores[~user_scores['contentId'].isin(ignored)]
        recommendations_df = candidates.sort_values('weight', ascending=False)
        if topn is not None:
            recommendations_df = recommendations_df.head(topn)

        if self.items_df is None:
            # No metadata to join; return the bare ranking instead of failing.
            return recommendations_df[['weight', 'contentId']].reset_index(drop=True)

        # A left join keeps the ranking order and every recommended article.
        enriched = recommendations_df.merge(self.items_df, how='left', on='contentId')
        return enriched[['weight', 'contentId', 'title', 'url']]
    
# Recommender backed by the SGD score table and the article metadata
sgd_recommender_model = SGDRecommender(sgd_predictions_df=sgd_pred_df, items_df=articles_df)


# In[599]:


# Ten highest-scoring articles for the given user
sgd_recommender_model.recommend_items(user_id=-9016528795238256703, topn=10)


In [None]:
class PopRecommender:
    """Rank items by a per-category popularity column.

    Parameters
    ----------
    cf_predictions_df : pandas.DataFrame
        Table whose index is named ``video_id`` and whose columns are
        category ids holding the value to rank by.
    items_df : pandas.DataFrame
        Item metadata joined onto the ranking through the columns it shares
        with the ranking (``video_id`` at least).
    """

    def __init__(self, cf_predictions_df, items_df):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def recommend_items(self, category_id, items_to_ignore=None, topn=None):
        """Return the items with the most ``views`` in ``category_id``.

        Parameters
        ----------
        category_id : hashable
            Column label to rank by; an unknown label raises ``KeyError``.
        items_to_ignore : iterable, optional
            ``video_id`` values to leave out; ``None`` means none are excluded.
        topn : int, optional
            Maximum number of ranked rows to join; ``None`` keeps them all.

        Returns
        -------
        pandas.DataFrame
            Ranked rows (descending ``views``) inner-joined with ``items_df``.
        """
        ignored = [] if items_to_ignore is None else list(items_to_ignore)

        # One row per item: its id and the value recorded for this category.
        category_views = self.cf_predictions_df[category_id].rename('views').reset_index()

        candidates = category_views[~category_views['video_id'].isin(ignored)]
        recommendations_df = candidates.sort_values('views', ascending=False)
        if topn is not None:
            recommendations_df = recommendations_df.head(topn)

        # Join with the metadata handed to the constructor; the method used to
        # reach for a module-level `filtered_df` and ignore `self.items_df`.
        return recommendations_df.merge(self.items_df)
    
# NOTE(review): `filtered_df` is not defined anywhere in the visible notebook, and
# `df_pivot` is indexed by personId, not by the 'video_id' that PopRecommender
# filters on. This cell looks carried over from another project; confirm the
# intended inputs before running it.
pop_recommender_model = PopRecommender(df_pivot,filtered_df)
