#### IMPORT LIBRARIES 

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

## PROBLEM 5.1: POGRAMMING CF

In [3]:
class CF(object):
    """docstring for CF"""
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
        #self.n_users = int(np.max(self.Y_data[1:, 0])) + 1
        #self.n_items = int(np.max(self.Y_data[1:, 1])) + 1
      
    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data[ids, 1] 
            # and the corresponding ratings 
            ratings = self.Y_data[ids, 2]
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important 
        # for both memory and computing efficiency. For example, if #user = 1M, 
        # #item = 100k, then shape of the rating matrix would be (100k, 1M), 
        # you may not have enough memory to store this. Then, instead, we store 
        # nonzeros only, and, of course, their locations.
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
        
    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity() 
        
    def fit(self):
        self.refresh()
        
    
    def __pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # Step 2: 
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others 
        # who already rated i
        sim = self.S[u, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:] 
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
    def pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)
            
    
    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 

    def print_recommendation(self):
        """
        print all items which should be recommended for each user 
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('    Recommend item(s):', recommended_items, 'for user', u)
            else: 
                print('    Recommend item', u, 'for user(s) : ', recommended_items)


#### RUNNING CF

In [7]:
# data file
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('C:/Users/Win 10/Downloads/data.csv', sep = ' ', names = r_cols, encoding='latin-1')
Y_data = ratings[['user_id', 'item_id', 'rating']].to_numpy()

### USER-USER COLLABORATIVE FILTERING

In [54]:
#User-User CF
rs = CF(Y_data, k = 2,dist_func = cosine_similarity, uuCF = 1)#get pearson similarities for ratings matrix M
pearson_sim = 1-pairwise_distances(M, metric="correlation")

In [23]:
rating_2=ratings.copy()

In [25]:
col = ['item_id', 'user_id']

In [26]:
for c in col:
    rating_2[c].replace({val: i for i, val in enumerate(rating_2[c].unique())}, inplace=True)

In [30]:
n_users=int(rating_2.user_id.nunique())
n_items=int(rating_2.item_id.nunique())

In [31]:
n_users

7

In [32]:
n_items

5

In [55]:
rs.fit()

In [56]:
data=pd.DataFrame(columns=['u0','u1','u2','u3','u4','u5','u6'],index=['i0','i1','i2','i3','i4'])
data

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,,,,,,,
i1,,,,,,,
i2,,,,,,,
i3,,,,,,,
i4,,,,,,,


In [57]:
for i in range (5):
    for j in range(7):
        data.iloc[i,j]=rs.pred(j,i)
    

##### THE NORMALIZED Y

In [58]:
data

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,1.97723,2.02277,-0.740253,-1.41068,-1.42265,0.176938,-0.634336
i1,0.70762,0.479426,-0.171047,-1.14666,-1.33333,0.54238,0.0452015
i2,0.905594,1.10968,-1.90862,-1.83587,-1.78013,-1.15846,-1.92471
i3,-1.02277,-0.977234,0.581724,2.12521,2.04145,0.590268,0.584943
i4,-1.9317,-2.0683,1.58172,1.56717,1.55603,1.59027,1.58494


In [60]:
rs.mu

array([3.25      , 2.75      , 2.5       , 1.33333333, 2.5       ,
       1.5       , 3.33333333])

##### THE FULL Y

In [63]:
data+rs.mu

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,5.22723,4.77277,1.75975,-0.0773503,1.07735,1.67694,2.699
i1,3.95762,3.22943,2.32895,0.186674,1.16667,2.04238,3.37853
i2,4.15559,3.85968,0.591381,-0.502535,0.719873,0.341543,1.40862
i3,2.22723,1.77277,3.08172,3.45855,4.54145,2.09027,3.91828
i4,1.3183,0.681703,4.08172,2.90051,4.05603,3.09027,4.91828


### ITEM-ITEM COLLABORATVE FILTERING

In [68]:
rs = CF(Y_data, k = 2, uuCF = 0)
rs.fit()

In [69]:
data=pd.DataFrame(columns=['u0','u1','u2','u3','u4','u5','u6'],index=['i0','i1','i2','i3','i4'])
data

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,,,,,,,
i1,,,,,,,
i2,,,,,,,
i3,,,,,,,
i4,,,,,,,


In [70]:
for i in range (5):
    for j in range(7):
        data.iloc[i,j]=rs.pred(j,i)

In [74]:
data

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,2.22626,2.35074,-0.649263,-2.33939,-1.23917,-0.291808,-1.52117
i1,2.17374,2.4,-0.6,-2.26061,-1.25051,0.0,-2.25
i2,2.4,2.29926,-0.700737,-2.6,-1.19518,-0.75,-0.779469
i3,-0.997868,-1.8081,0.407249,1.29003,1.19416,0.344708,1.40725
i4,-0.918799,-2.10857,0.676085,1.03002,1.16435,0.648592,1.67608


In [75]:
mean=rs.mu
mean

array([2.6       , 2.        , 1.75      , 3.16666667, 2.75      ])

In [77]:
data=data.transpose() + mean
data.transpose()

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,4.82626,4.95074,1.95074,0.260608,1.36083,2.30819,1.07883
i1,4.17374,4.4,1.4,-0.260608,0.749486,2.0,-0.25
i2,4.15,4.04926,1.04926,-0.85,0.554821,1.0,0.970531
i3,2.1688,1.35857,3.57392,4.4567,4.36083,3.51137,4.57392
i4,1.8312,0.641435,3.42608,3.78002,3.91435,3.39859,4.42608


## PEARSON SIMILARITY

### USER-USER COLLABORATIVE FILTERING

In [262]:
ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,0,0,5.0
1,0,1,4.0
2,0,3,2.0
3,0,4,2.0
4,1,0,5.0


In [276]:
data=pd.DataFrame(columns=['u0','u1','u2','u3','u4','u5','u6'],index=['i0','i1','i2','i3','i4'])

In [277]:
for i in range(5):
    for u in range(7):
        for a in range(22):
            if (ratings['user_id'].loc[a]==u) & (ratings['item_id'].loc[a]==i):
                data.iloc[i,u]=ratings['rating'].loc[a]

In [278]:
data

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,5.0,5.0,2.0,0.0,1.0,,
i1,4.0,,,0.0,,2.0,
i2,,4.0,1.0,,,1.0,1.0
i3,2.0,2.0,3.0,4.0,4.0,,4.0
i4,2.0,0.0,4.0,,,,5.0


In [279]:
mean_data=np.mean(data).values
mean_data

array([3.25      , 2.75      , 2.5       , 1.33333333, 2.5       ,
       1.5       , 3.33333333])

In [280]:
data=data-mean_data
data

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,1.75,2.25,-0.5,-1.33333,-1.5,,
i1,0.75,,,-1.33333,,0.5,
i2,,1.25,-1.5,,,-0.5,-2.33333
i3,-1.25,-0.75,0.5,2.66667,1.5,,0.666667
i4,-1.25,-2.75,1.5,,,,1.66667


In [281]:
data.replace(np.nan, 0, inplace=True)
data

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,1.75,2.25,-0.5,-1.333333,-1.5,0.0,0.0
i1,0.75,0.0,0.0,-1.333333,0.0,0.5,0.0
i2,0.0,1.25,-1.5,0.0,0.0,-0.5,-2.333333
i3,-1.25,-0.75,0.5,2.666667,1.5,0.0,0.666667
i4,-1.25,-2.75,1.5,0.0,0.0,0.0,1.666667


In [282]:
from sklearn.metrics import pairwise_distances

In [290]:
pearson_sim = 1-pairwise_distances(data.transpose(), metric="correlation")
pearson_sim=pd.DataFrame(pearson_sim)
pearson_sim

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,0.833074,-0.580948,-0.785674,-0.816497,0.204124,-0.381337
1,0.833074,1.0,-0.873334,-0.398621,-0.552345,-0.230144,-0.707568
2,-0.580948,-0.873334,1.0,0.273861,0.316228,0.474342,0.962102
3,-0.785674,-0.398621,0.273861,1.0,0.866025,-0.288675,0.1849
4,-0.816497,-0.552345,0.316228,0.866025,1.0,0.0,0.160128
5,0.204124,-0.230144,0.474342,-0.288675,0.0,1.0,0.560449
6,-0.381337,-0.707568,0.962102,0.1849,0.160128,0.560449,1.0


In [284]:
def findksimilarusers(user_id, ratings,k=2):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = 'correlation', algorithm = 'brute') 
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
            
    return indices

In [285]:
index={}
for i in range(1,8):
    s=findksimilarusers(i,data.transpose())
    user=s[0,0]
    index[user]=s[0,1:]

In [286]:
index

{0: array([1, 5], dtype=int64),
 1: array([0, 5], dtype=int64),
 2: array([6, 5], dtype=int64),
 3: array([4, 2], dtype=int64),
 4: array([3, 2], dtype=int64),
 5: array([6, 2], dtype=int64),
 6: array([2, 5], dtype=int64)}

In [288]:
index=pd.DataFrame.from_dict(index)

In [291]:
for column in range(7):
    for row in range(5):
        if data.iloc[row,column]==0:
            data.iloc[row,column]=(pearson_sim.iloc[index[column][0],column]*data.iloc[row,index[column][0]]+pearson_sim.iloc[index[column][1],column]*data.iloc[row,index[column][1]])/(np.abs(pearson_sim.iloc[index[column][0],column])+np.abs(pearson_sim.iloc[index[column][1],column]))

In [292]:
data

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,1.75,2.25,-0.5,-1.333333,-1.5,-0.229197,-0.400318
i1,0.75,0.479426,0.16511,-1.333333,-0.932531,0.5,0.288382
i2,0.905594,1.25,-1.5,-0.36038,-0.665204,-0.5,-2.333333
i3,-1.25,-0.75,0.5,2.666667,1.5,0.590268,0.666667
i4,-1.25,-2.75,1.5,0.36038,0.665204,1.590268,1.666667


### ITEM-ITEM COLLABORATIVE FILTERING

In [307]:
data=pd.DataFrame(columns=['u0','u1','u2','u3','u4','u5','u6'],index=['i0','i1','i2','i3','i4'])

In [308]:
for i in range(5):
    for u in range(7):
        for a in range(22):
            if (ratings['user_id'].loc[a]==u) & (ratings['item_id'].loc[a]==i):
                data.iloc[i,u]=ratings['rating'].loc[a]
data=data.transpose()
data

Unnamed: 0,i0,i1,i2,i3,i4
u0,5.0,4.0,,2.0,2.0
u1,5.0,,4.0,2.0,0.0
u2,2.0,,1.0,3.0,4.0
u3,0.0,0.0,,4.0,
u4,1.0,,,4.0,
u5,,2.0,1.0,,
u6,,,1.0,4.0,5.0


In [309]:
mean_data=np.mean(data).values
mean_data

array([2.6       , 2.        , 1.75      , 3.16666667, 2.75      ])

In [310]:
data=data-mean_data
data

Unnamed: 0,i0,i1,i2,i3,i4
u0,2.4,2.0,,-1.16667,-0.75
u1,2.4,,2.25,-1.16667,-2.75
u2,-0.6,,-0.75,-0.166667,1.25
u3,-2.6,-2.0,,0.833333,
u4,-1.6,,,0.833333,
u5,,0.0,-0.75,,
u6,,,-0.75,0.833333,2.25


In [311]:
data.replace(np.nan, 0, inplace=True)
data

Unnamed: 0,i0,i1,i2,i3,i4
u0,2.4,2.0,0.0,-1.166667,-0.75
u1,2.4,0.0,2.25,-1.166667,-2.75
u2,-0.6,0.0,-0.75,-0.166667,1.25
u3,-2.6,-2.0,0.0,0.833333,0.0
u4,-1.6,0.0,0.0,0.833333,0.0
u5,0.0,0.0,-0.75,0.0,0.0
u6,0.0,0.0,-0.75,0.833333,2.25


In [312]:
pearson_sim = np.corrcoef(data.transpose())
pearson_sim=pd.DataFrame(pearson_sim,columns=['i0','i1','i2','i3','i4'],index=['i0','i1','i2','i3','i4'])
pearson_sim

Unnamed: 0,i0,i1,i2,i3,i4
i0,1.0,0.767869,0.489031,-0.889101,-0.517437
i1,0.767869,1.0,0.0,-0.643268,-0.138086
i2,0.489031,0.0,1.0,-0.54711,-0.883184
i3,-0.889101,-0.643268,-0.54711,1.0,0.681002
i4,-0.517437,-0.138086,-0.883184,0.681002,1.0


In [313]:
from sklearn.neighbors import NearestNeighbors
def findksimilaritems(item_id, ratings, metric='correlation', k=2):
    similarities=[]
    indices=[]    
    ratings=ratings.T
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[item_id-1,:].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
    return indices

In [314]:
index={}
for i in range(1,6):
    s=findksimilaritems(i,data)
    user=s[0,0]
    index[user]=s[0,(1,2)]

In [315]:
index

{0: array([1, 2], dtype=int64),
 1: array([0, 2], dtype=int64),
 2: array([0, 1], dtype=int64),
 3: array([4, 2], dtype=int64),
 4: array([3, 1], dtype=int64)}

In [316]:
for column in range(5):
    for row in range(7):
        if data.iloc[row,column]==0:
            data.iloc[row,column]=(pearson_sim.iloc[index[column][0],column]*data.iloc[row,index[column][0]]+pearson_sim.iloc[index[column][1],column]*data.iloc[row,index[column][1]])/(np.abs(pearson_sim.iloc[index[column][0],column])+np.abs(pearson_sim.iloc[index[column][1],column]))

In [317]:
#Normalized data
data=data.transpose()
data

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,2.4,2.4,-0.6,-2.6,-1.6,-0.291808,-0.291808
i1,2.0,2.4,-0.6,-2.0,-1.6,-0.291808,-0.291808
i2,2.4,2.25,-0.75,-2.6,-1.6,-0.75,-0.75
i3,-1.166667,-1.166667,-0.166667,0.833333,0.833333,0.334117,0.833333
i4,-0.75,-2.75,1.25,1.030016,0.962582,0.326984,2.25


In [318]:
# Item-Items Ratings
non_normalized_uu=data.transpose()+mean_data
non_normalized_uu.transpose()

Unnamed: 0,u0,u1,u2,u3,u4,u5,u6
i0,5.0,5.0,2.0,0.0,1.0,2.308192,2.308192
i1,4.0,4.4,1.4,0.0,0.4,1.708192,1.708192
i2,4.15,4.0,1.0,-0.85,0.15,1.0,1.0
i3,2.0,2.0,3.0,4.0,4.0,3.500783,4.0
i4,2.0,0.0,4.0,3.780016,3.712582,3.076984,5.0
