In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# import all csv files in the folder ../data/csvs

data_path = '../data/csvs'
files = os.listdir(data_path)
df = {}

for file in files:
    if file.endswith('.csv'):
        df[file[:-4]] = pd.read_csv(f'{data_path}/{file}')

In [3]:
# PostTypeID = 1 -> Question
# PostTypeID = 2 -> Answer
answers = df['Posts'][df['Posts']['PostTypeId'] == 2]
Answerer_table = answers[['OwnerUserId']].groupby('OwnerUserId').size().reset_index(name='AnswerCount').sort_values(by='AnswerCount', ascending=False)

In [4]:
questions = df['Posts'][df['Posts']['PostTypeId'] == 1]

# get all unique tags for each question
# tagIds are given in 'Tags' dataframe and tags are | separated
tags = df['Tags']

def get_tags(tag_string):
    taglist = tag_string.split('|')
    if(len(taglist) > 0):
        return taglist[1:-1]
    else:
        return []

# fill NaN values with empty string
questions['Tags'] = questions['Tags'].fillna('')
questions['Tags'] = questions['Tags'].apply(get_tags)

# each question has a field with the list of tags
# for each tag in 'Tags' dataframe, get the count of questions with that tag in 'questions' dataframe
def get_tag_count(tag):
    return questions['Tags'].apply(lambda y: tag in y).sum()

tags['TagCount'] = tags['TagName'].apply(get_tag_count)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions['Tags'] = questions['Tags'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions['Tags'] = questions['Tags'].apply(get_tags)


In [5]:
tags_table = tags[['Id','TagName', 'TagCount']].sort_values(by='TagCount', ascending=False)
tags_table = tags_table.set_index('TagName')

In [6]:
# get all users from the 'Users' dataframe and merge with the answers dataframe
users = df['Users']

# add users with 0 answers
Answerer_table = users.merge(Answerer_table, how='left', left_on='Id', right_on='OwnerUserId')
Answerer_table = Answerer_table.fillna(0).sort_values(by='AnswerCount', ascending=False)

In [7]:
# get the top 3 users with the most answers
print(Answerer_table.head(3)[['Id','AnswerCount','DisplayName']])

            Id  AnswerCount    DisplayName
53721     9113       2839.0      Doc Brown
147175  177980       2326.0           Ewan
49017     1204       2043.0  Robert Harvey


In [8]:
# get the top 3 tags with the most questions
print(tags_table.head(3))

          Id  TagCount
TagName               
design   609      5162
c#       249      4931
java      76      4928


## Question 2


In [9]:
Answerer_table = Answerer_table[['Id','AnswerCount']]

In [10]:
# all users with more than 20 answers
users_20 = Answerer_table[Answerer_table['AnswerCount'] > 20].sort_values(by='Id', ascending=True)

# all tags with more than 20 questions
tags_20 = tags[tags['TagCount'] > 20].sort_values(by='Id', ascending=True)

In [11]:
utility_df = pd.DataFrame(index=users_20['Id'], columns=tags_20['Id']).fillna(0)
users_20.set_index('Id', inplace=True)
tags_20.set_index('Id', inplace=True)

In [12]:
def fill_utility_matrix(ans):
    tags = questions[questions['Id'] == ans['ParentId']]['Tags'].iloc[0]
    AnswererUserId = ans['OwnerUserId']
    if(AnswererUserId not in utility_df.index):
        return
    tagIdList = []
    for tag in tags:
        if tag not in tags_table.index:
            continue
        tagId = tags_table.at[tag,'Id']
        if tagId in tags_20.index:
            tagIdList.append(tagId)

    for tagId in tagIdList:
        utility_df.at[AnswererUserId, tagId] += 1
        

In [13]:
for index, ans in answers.iterrows():
    fill_utility_matrix(ans)

In [14]:
# count 0s in the utility matrix
zero_count = utility_df.apply(lambda x: x == 0).sum().sum()
total_count = utility_df.shape[0] * utility_df.shape[1]
print(f'Percentage of unknown expert ratings in the utility matrix: {zero_count/total_count*100}%')

Percentage of unknown expert ratings in the utility matrix: 88.89014050660478%


In [15]:
# replace 0s with NaN as it is unknown
utility_df = utility_df.replace(0, float('nan'))

In [16]:
# convert the utility matrix to a numpy array
utility_matrix = utility_df.to_numpy()

In [17]:
# report the dimensions of the utility matrix
print(f'Dimensions of the utility matrix: (Users, Tags) {utility_matrix.shape}')

Dimensions of the utility matrix: (Users, Tags) (1119, 952)


## Question 3


In [18]:
# convert the index into a series where the index is the user id and the value is the row number
user_index = pd.Series(utility_df.index, index=range(utility_df.shape[0]))

# convert the columns into a series where the index is the tag id and the value is the column number
tag_index = pd.Series(utility_df.columns, index=range(utility_df.shape[1]))

In [19]:
# for each value that is not NaN convert it to floor(x/3) if x < 15 else 5
expert_matrix = np.where(np.isnan(utility_matrix), 
                          np.nan,  # Keep NaN as it is
                          np.where(utility_matrix < 15, utility_matrix // 3, 5))

print(expert_matrix)

[[ 4. nan  2. ...  0.  0. nan]
 [nan nan  2. ... nan nan nan]
 [ 0. nan  0. ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [ 0. nan nan ... nan  1.  0.]
 [nan nan nan ... nan nan nan]]


In [20]:
# report the summation of the expert matrix
print(f'Summation of the expert matrix: {np.nansum(expert_matrix)}')

# report the highest row sum of the expert matrix
print(f'Highest row sum of the expert matrix: {np.nanmax(np.nansum(expert_matrix, axis=1))}')

# report the highest column sum of the expert matrix
print(f'Highest column sum of the expert matrix: {np.nanmax(np.nansum(expert_matrix, axis=0))}')

Summation of the expert matrix: 41221.0
Highest row sum of the expert matrix: 1161.0
Highest column sum of the expert matrix: 1394.0


In [21]:
# the bottom right corner of width 15% of the expert matrix is the test set
test_set = np.copy(expert_matrix[int(expert_matrix.shape[0]*0.85):,int(expert_matrix.shape[1]*0.85):])


In [22]:
# report the summation of the train set
train_sum = np.nansum(expert_matrix) - np.nansum(test_set)
print(f'Summation of the train set: {train_sum}')

# report the dimensions of the test set
print(f'Dimensions of the test set: {test_set.shape}')

# report the summation of the test set
print(f'Summation of the test set: {np.nansum(test_set)}')

Summation of the train set: 40565.0
Dimensions of the test set: (168, 143)
Summation of the test set: 656.0


## Question 4: item-item


In [23]:
expert_matrix

array([[ 4., nan,  2., ...,  0.,  0., nan],
       [nan, nan,  2., ..., nan, nan, nan],
       [ 0., nan,  0., ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [ 0., nan, nan, ..., nan,  1.,  0.],
       [nan, nan, nan, ..., nan, nan, nan]])

In [24]:
# calculate mean of each column and center the values and fill NaN with 0

# uncomment this if centering is needed and NaNs should be filled with 0
# expert_matrix = expert_matrix - np.nanmean(expert_matrix, axis=0)
# expert_matrix = np.nan_to_num(expert_matrix, nan=0)

In [25]:
# Develop an item-item collaborative filtering model using the train set and use pearson correlation for similarity between tags
# first we need to calculate the pearson correlation between each pair of tags
expert_matrix_df = pd.DataFrame(utility_matrix)
tag_correlation = expert_matrix_df.corr()
print(f'The tag similarity matrix has dimensions: {tag_correlation.shape}')

The tag similarity matrix has dimensions: (952, 952)


In [26]:
def avg_prediction_function_item(data,user_index, tag_index, similar_tag_indices):
    predicition = 0
    non_nan = 0
    for i in similar_tag_indices:
        if not np.isnan(data[user_index, i]):
            predicition += data[user_index, i]
            non_nan += 1
    if non_nan == 0:
        return np.nan
    return predicition/non_nan
    

In [27]:
def weighted_avg_prediction_function_item(data,user_index,tag_index,similar_tag_indices):
    predicition = 0
    total_weight = 0
    for i in similar_tag_indices:
        if not np.isnan(data[user_index, i]):
            predicition += data[user_index, i] * tag_correlation.at[tag_index, i]
            total_weight += tag_correlation.at[tag_index, i]
    if total_weight == 0:
        return np.nan
    return predicition / total_weight

In [28]:
def get_similar_tags(data,user_index,tag_index,tag_correlation,K):
    similar_tags = []
    # sort the tags similar to the tag_index in descending order into a list of indexes of the tags
    similar_tag_indices = tag_correlation[tag_index].sort_values(ascending=False).index
    for i in similar_tag_indices:
        if i != tag_index and not np.isnan(data[user_index, i]):
            similar_tags.append(i)
        if len(similar_tags) == K:
            break
    return similar_tags

In [29]:
def test_item_cf(data, K, correlation_matrix,prediction_function):
    '''
    data: entire dataset of which the bottom right 15% will be used for testing
    K: number of neighbors to consider
    correlation_matrix: matrix of tag-tag correlation
    '''
    # deep copy the data
    data = np.copy(data)
    predictions = np.full((data.shape[0]-int(0.85*data.shape[0]), data.shape[1]-int(0.85*data.shape[1])),np.nan)

    # predict for the test set
    for i in range(int(0.85*data.shape[0]), data.shape[0]):
        for j in range(int(0.85*data.shape[1]), data.shape[1]):
            # get the K most similar tags which the user has rated except the tag itself
            similar_tag_indices = get_similar_tags(data, i, j, correlation_matrix, K)
            # if (K+1) then remove the last element
            if len(similar_tag_indices) > K:
                similar_tag_indices = similar_tag_indices[:-1]
            # get the prediction
            predictions[i-int(0.85*data.shape[0]), j-int(0.85*data.shape[1])] = prediction_function(data, i, j, similar_tag_indices)
    return predictions

In [30]:
predictions_cf_item_simple_avg_2_neighbours = test_item_cf(expert_matrix, 2, tag_correlation, avg_prediction_function_item)
predictions_cf_item_simple_avg_3_neighbours = test_item_cf(expert_matrix, 3, tag_correlation, avg_prediction_function_item)
predictions_cf_item_simple_avg_5_neighbours = test_item_cf(expert_matrix, 5, tag_correlation, avg_prediction_function_item)

In [31]:
predictions_cf_item_weighted_avg_2_neighbours = test_item_cf(expert_matrix, 2, tag_correlation, weighted_avg_prediction_function_item)
predictions_cf_item_weighted_avg_3_neighbours = test_item_cf(expert_matrix, 3, tag_correlation, weighted_avg_prediction_function_item)
predictions_cf_item_weighted_avg_5_neighbours = test_item_cf(expert_matrix, 5, tag_correlation, weighted_avg_prediction_function_item)

In [32]:
def RMSE(predictions,test_set):
    sum = 0
    count = 0
    for i in range(predictions.shape[0]):
        for j in range(predictions.shape[1]):
            if not np.isnan(test_set[i,j]) and not np.isnan(predictions[i,j]):
                sum += (predictions[i,j] - test_set[i,j])**2
                count += 1
    
    return np.sqrt(sum/count)

In [33]:
rmse_simple_avg_2_neighbours = RMSE(predictions_cf_item_simple_avg_2_neighbours, test_set)
rmse_simple_avg_3_neighbours = RMSE(predictions_cf_item_simple_avg_3_neighbours, test_set)
rmse_simple_avg_5_neighbours = RMSE(predictions_cf_item_simple_avg_5_neighbours, test_set)

rmse_weighted_avg_2_neighbours = RMSE(predictions_cf_item_weighted_avg_2_neighbours, test_set)
rmse_weighted_avg_3_neighbours = RMSE(predictions_cf_item_weighted_avg_3_neighbours, test_set)
rmse_weighted_avg_5_neighbours = RMSE(predictions_cf_item_weighted_avg_5_neighbours, test_set)

In [34]:
# report the RMSE for the simple average method with 2, 3, and 5 neighbors
print(f'RMSE for simple average method with 2 neighbors: {rmse_simple_avg_2_neighbours}')
print(f'RMSE for simple average method with 3 neighbors: {rmse_simple_avg_3_neighbours}')
print(f'RMSE for simple average method with 5 neighbors: {rmse_simple_avg_5_neighbours}')

# report the RMSE for the weighted average method with 2, 3, and 5 neighbors
print(f'RMSE for weighted average method with 2 neighbors: {rmse_weighted_avg_2_neighbours}')
print(f'RMSE for weighted average method with 3 neighbors: {rmse_weighted_avg_3_neighbours}')
print(f'RMSE for weighted average method with 5 neighbors: {rmse_weighted_avg_5_neighbours}')

RMSE for simple average method with 2 neighbors: 0.8065539869850512
RMSE for simple average method with 3 neighbors: 0.7725755683887762
RMSE for simple average method with 5 neighbors: 0.7684676748370861
RMSE for weighted average method with 2 neighbors: 0.8002631095407067
RMSE for weighted average method with 3 neighbors: 0.7618844545199125
RMSE for weighted average method with 5 neighbors: 0.7806460817404667


## Question 4: User-User Collaborative System


In [35]:
expert_matrix = np.where(np.isnan(utility_matrix),
                            np.nan,
                            np.where(utility_matrix < 15, utility_matrix // 3, 5))

# # calculate the mean of each row and center the values and fill NaN with 0
# uncomment this if centering is needed and NaNs should be filled with 0
# expert_matrix = expert_matrix - np.nanmean(expert_matrix, axis=1)[:, np.newaxis]

In [36]:
user_correlation = expert_matrix_df.T.corr()

In [37]:
def avg_prediction_function_user(data,user_index, tag_index, similar_user_indices):
    predicition = 0
    non_nan = 0
    for i in similar_user_indices:
        if not np.isnan(data[i, tag_index]):
            predicition += data[i, tag_index]
            non_nan += 1
    if non_nan == 0:
        return np.nan
    return predicition/non_nan

In [38]:
def weighted_avg_prediction_function_user(data,user_index,tag_index,similar_user_indices):
    predicition = 0
    total_weight = 0
    for i in similar_user_indices:
        if not np.isnan(data[i, tag_index]):
            predicition += data[i, tag_index] * user_correlation.at[user_index, i]
            total_weight += user_correlation.at[user_index, i]
    if total_weight == 0:
        return np.nan
    return predicition / total_weight

In [39]:
def get_similar_users(data,user_index,tag_index,user_correlation,K):
    similar_users = []
    # sort the tags similar to the tag_index in descending order into a list of indexes of the tags
    similar_user_indices = user_correlation[user_index].sort_values(ascending=False).index
    for i in similar_user_indices:
        if i != user_index and not np.isnan(data[i, tag_index]):
            similar_users.append(i)
        if len(similar_users) == K:
            break
    return similar_users

In [40]:
def test_user_cf(data, K, correlation_matrix,prediction_function):
    '''
    data: entire dataset of which the bottom right 15% will be used for testing
    K: number of neighbors to consider
    correlation_matrix: matrix of tag-tag correlation
    '''
    # deep copy the data
    data = np.copy(data)
    predictions = np.full((data.shape[0]-int(0.85*data.shape[0]), data.shape[1]-int(0.85*data.shape[1])),np.nan)

    # predict for the test set
    for i in range(int(0.85*data.shape[0]), data.shape[0]):
        for j in range(int(0.85*data.shape[1]), data.shape[1]):
            # get the K most similar tags which the user has rated except the tag itself
            similar_user_indices = get_similar_users(data, i, j, user_correlation, K)
            # if (K+1) then remove the last element
            if len(similar_user_indices) > K:
                similar_user_indices = similar_user_indices[:-1]
            # get the prediction
            predictions[i-int(0.85*data.shape[0]), j-int(0.85*data.shape[1])] = prediction_function(data, i, j, similar_user_indices)
    return predictions

In [41]:
predictions_cf_user_simple_avg_2_neighbours = test_user_cf(expert_matrix, 2, user_correlation, avg_prediction_function_user)
predictions_cf_user_simple_avg_3_neighbours = test_user_cf(expert_matrix, 3, user_correlation, avg_prediction_function_user)
predictions_cf_user_simple_avg_5_neighbours = test_user_cf(expert_matrix, 5, user_correlation, avg_prediction_function_user)

In [42]:
predictions_cf_user_weighted_avg_2_neighbours = test_user_cf(expert_matrix, 2, user_correlation, weighted_avg_prediction_function_user)
predictions_cf_user_weighted_avg_3_neighbours = test_user_cf(expert_matrix, 3, user_correlation, weighted_avg_prediction_function_user)
predictions_cf_user_weighted_avg_5_neighbours = test_user_cf(expert_matrix, 5, user_correlation, weighted_avg_prediction_function_user)

In [43]:
# report the RMSE for the simple average method with 2, 3, and 5 neighbors
rmse_simple_avg_2_neighbours_user = RMSE(predictions_cf_user_simple_avg_2_neighbours, test_set)
rmse_simple_avg_3_neighbours_user = RMSE(predictions_cf_user_simple_avg_3_neighbours, test_set)
rmse_simple_avg_5_neighbours_user = RMSE(predictions_cf_user_simple_avg_5_neighbours, test_set)

# report the RMSE for the weighted average method with 2, 3, and 5 neighbors
rmse_weighted_avg_2_neighbours_user = RMSE(predictions_cf_user_weighted_avg_2_neighbours, test_set)
rmse_weighted_avg_3_neighbours_user = RMSE(predictions_cf_user_weighted_avg_3_neighbours, test_set)
rmse_weighted_avg_5_neighbours_user = RMSE(predictions_cf_user_weighted_avg_5_neighbours, test_set)

In [44]:
print(f'RMSE for simple average method with 2 neighbors: {rmse_simple_avg_2_neighbours_user}')
print(f'RMSE for simple average method with 3 neighbors: {rmse_simple_avg_3_neighbours_user}')
print(f'RMSE for simple average method with 5 neighbors: {rmse_simple_avg_5_neighbours_user}')

print(f'RMSE for weighted average method with 2 neighbors: {rmse_weighted_avg_2_neighbours_user}')
print(f'RMSE for weighted average method with 3 neighbors: {rmse_weighted_avg_3_neighbours_user}')
print(f'RMSE for weighted average method with 5 neighbors: {rmse_weighted_avg_5_neighbours_user}')

RMSE for simple average method with 2 neighbors: 0.9010360159689648
RMSE for simple average method with 3 neighbors: 0.8601977707898477
RMSE for simple average method with 5 neighbors: 0.8043990213940855
RMSE for weighted average method with 2 neighbors: 0.9005418859140297
RMSE for weighted average method with 3 neighbors: 0.8586811722400283
RMSE for weighted average method with 5 neighbors: 0.8031690767385918


## Question 5: Latent Factor Models


In [45]:
# set all values to NaN in the test part of the expert matrix
expert_matrix[int(expert_matrix.shape[0]*0.85):,int(expert_matrix.shape[1]*0.85):] = np.nan

In [46]:
# convert the expert matrix from (user x tag) to (tag x user)
expert_matrix = expert_matrix.T

In [47]:
def init_SVD(k,R):
    U, S, VT = np.linalg.svd(R, full_matrices=False)
    U_k = U[:, :k] 
    S_k = np.diag(S[:k])
    VT_k = VT[:k, :] 

    Q = U_k
    P = np.dot(VT_k.T, S_k)
    return Q, P


In [48]:
# calculate gradient WRT Q
def unregularized_gradient_Q_SGD(R, Q, P,user,tag):
    # missing values are set to NaN
    R_hat = np.dot(Q[tag],P[user])

    # clipping if R_hat is too large
    R_hat = np.clip(R_hat, -1e10, 1e10)

    # Using SSE as the loss function
    loss = (R[tag,user] - R_hat)**2

    # calculate the gradient (SGD)
    gradient = -2 * (R[tag,user] - R_hat) * P[user]
    gradient = np.clip(gradient, -1e10, 1e10)
    return gradient , loss

# calculate gradient WRT P
def unregularized_gradient_P_SGD(R, Q, P,user,tag):
    # missing values are set to NaN
    R_hat = np.dot(Q[tag],P[user])
    R_hat = np.clip(R_hat, -1e10, 1e10)

    # Using SSE as the loss function
    loss = (R[tag,user] - R_hat)**2

    # calculate the gradient (SGD)
    gradient = -2 * (R[tag,user] - R_hat) * Q[tag]
    gradient = np.clip(gradient, -1e10, 1e10)
    return gradient , loss

In [49]:
def train(R, Q, P, learning_rate, iterations):
    for i in range(iterations):
        if i % 10 == 0:
            # get average loss
            loss = 0
            count = 0
            for tag in range(R.shape[0]):
                for user in range(R.shape[1]):
                    if not np.isnan(R[tag,user]):
                        loss += (R[tag,user] - np.dot(Q[tag],P[user]))**2
                        count += 1
            loss /= count
            print(f'Iteration: {i}, Loss: {loss}')
        for tag in range(R.shape[0]):
            for user in range(R.shape[1]):
                if not np.isnan(R[tag,user]):
                    gradient_Q, loss = unregularized_gradient_Q_SGD(R, Q, P, user, tag)
                    gradient_P, loss = unregularized_gradient_P_SGD(R, Q, P, user, tag)

                    Q[tag] = Q[tag] - learning_rate * gradient_Q
                    P[user] = P[user] - learning_rate * gradient_P
    return Q, P

In [50]:
# get NaN in test set
nan_in_test = np.isnan(test_set).sum()

print(f'Number of NaN (unknown) in test set: {nan_in_test}')
print(f'Total number of values in test set: {test_set.shape[0] * test_set.shape[1]}')

Number of NaN (unknown) in test set: 21757
Total number of values in test set: 24024


In [51]:
# calculate loss on test set
def test_loss(R,Q,P):
    loss = 0
    count = 0
    predictions = np.dot(Q,P.T).T

    # get predictions on test set
    predictions_test = predictions[int(predictions.shape[0]*0.85):,int(predictions.shape[1]*0.85):]

    # calculate loss
    for i in range(test_set.shape[0]):
        for j in range(test_set.shape[1]):
            if not np.isnan(test_set[i,j]):
                loss += (test_set[i,j] - predictions_test[i,j])**2
                count += 1
    return np.sqrt(loss/count)

In [52]:
def train_unregulated(k, R, learning_rate, iterations):
    # initialize Q and P
    Q, P = np.random.rand(R.shape[0], k), np.random.rand(R.shape[1], k)

    # train the model
    Q, P = train(R, Q, P, learning_rate, iterations)

    # training loss
    print(f'k= {k}, learning_rate= {learning_rate}, loss= {test_loss(R,Q,P)}')

In [53]:
train_unregulated(2,expert_matrix, 0.0005, 50)
print()
train_unregulated(5,expert_matrix, 0.0005, 50)
print()
train_unregulated(10,expert_matrix, 0.0005, 50)
print()

Iteration: 0, Loss: 0.9426492947150403
Iteration: 10, Loss: 0.5866564095733755
Iteration: 20, Loss: 0.46011725202741255
Iteration: 30, Loss: 0.42126671159936824


## Question 5: Regularization


In [108]:
def regularized_gradient_Q(R, Q, P, user, tag, lambda_):
    R_hat = np.dot(Q[tag],P[user])
    R_hat = np.clip(R_hat, -1e10, 1e10)

    loss = (R[tag,user] - R_hat)**2

    gradient = -2 * (R[tag,user] - R_hat) * P[user] + 2 * lambda_ * Q[tag]
    gradient = np.clip(gradient, -1e10, 1e10)
    return gradient , loss

def regularized_gradient_P(R, Q, P, user, tag, lambda_):
    R_hat = np.dot(Q[tag],P[user])
    R_hat = np.clip(R_hat, -1e10, 1e10)

    loss = (R[tag,user] - R_hat)**2

    gradient = -2 * (R[tag,user] - R_hat) * Q[tag] + 2 * lambda_ * P[user]
    gradient = np.clip(gradient, -1e10, 1e10)
    return gradient , loss

In [109]:
def train(R,Q,P,learning_rate,iterations,lambda_P,lambda_Q):
    for i in range(iterations):
        if i % 10 == 0:
            loss = 0
            count = 0
            for tag in range(R.shape[0]):
                for user in range(R.shape[1]):
                    if not np.isnan(R[tag,user]):
                        loss += (R[tag,user] - np.dot(Q[tag],P[user]))**2
                        count += 1
            loss /= count
            print(f'Iteration: {i}, Loss: {loss}')
        for tag in range(R.shape[0]):
            for user in range(R.shape[1]):
                if not np.isnan(R[tag,user]):
                    gradient_Q, loss = regularized_gradient_Q(R, Q, P, user, tag, lambda_Q)
                    gradient_P, loss = regularized_gradient_P(R, Q, P, user, tag, lambda_P)

                    Q[tag] = Q[tag] - learning_rate * gradient_Q
                    P[user] = P[user] - learning_rate * gradient_P
    return Q, P

In [110]:
def train_regularised(k,lambda_p, lambda_q, R, learning_rate, iterations):
    Q, P = np.random.rand(R.shape[0], k), np.random.rand(R.shape[1], k)

    Q, P = train(R, Q, P, learning_rate, iterations,lambda_p,lambda_q)

    print(f'k= {k}, lambda_p= {lambda_p}, lambda_q= {lambda_q}, learning_rate= {learning_rate}, loss= {test_loss(R,Q,P)}')
    

In [None]:
K = [2,5,10]
lambdas = [(0.001,0.003),(0.05,0.05),(0.5,0.75)]

for k in K:
    for lambda_p, lambda_q in lambdas:
        train_regularised(k, lambda_p, lambda_q, expert_matrix, 0.0005, 50)
        print()

Iteration: 0, Loss: 0.9224288757818738
Iteration: 10, Loss: 0.5903919535634636
Iteration: 20, Loss: 0.4625333542580428
Iteration: 30, Loss: 0.42187946611514165
Iteration: 40, Loss: 0.40987166659824925
k= 2, lambda_p= 0.001, lambda_q= 0.003, learning_rate= 0.0005, loss= 0.6915050322458568

Iteration: 0, Loss: 0.9353437772724426
Iteration: 10, Loss: 0.6100040155844068
Iteration: 20, Loss: 0.4818225258842947
Iteration: 30, Loss: 0.43544300878923997
Iteration: 40, Loss: 0.42056819435171205
k= 2, lambda_p= 0.05, lambda_q= 0.05, learning_rate= 0.0005, loss= 0.6922477782971894

Iteration: 0, Loss: 0.9405706856229342
Iteration: 10, Loss: 0.8572181412790476
Iteration: 20, Loss: 0.8800492587711841
Iteration: 30, Loss: 0.8937229287543261
Iteration: 40, Loss: 0.9030358991185512
k= 2, lambda_p= 0.5, lambda_q= 0.75, learning_rate= 0.0005, loss= 0.8325892024456546

Iteration: 0, Loss: 1.8592671379468941
Iteration: 10, Loss: 0.7632153643060868
Iteration: 20, Loss: 0.5688930219448506
Iteration: 30, Los

## Question 6: Surprise Library


In [None]:
from surprise import KNNBaseline, Reader, Dataset
from surprise import accuracy
from surprise.model_selection import cross_validate

In [None]:
expert_matrix_df = pd.DataFrame(expert_matrix.T,index=user_index,columns=tag_index)


In [None]:
expert_matrix_df.head(10)

In [None]:
df_melted = expert_matrix_df.reset_index().melt(id_vars='Id',var_name='tag',value_name='rating')

In [None]:
glabal_mean = df_melted['rating'].mean()
glabal_mean

In [None]:
# using the global mean to fill NaN values
df_melted['rating'].fillna(glabal_mean, inplace=True)

In [None]:
df_melted.head(10)

In [None]:
sim_options_item = {
    'name': 'pearson',
    'user_based': False,
}

sim_options_user = {
    'name': 'pearson',
    'user_based': True,
}

reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_melted[['Id','tag','rating']], reader)

In [None]:
algo_2_item = KNNBaseline(k=2, min_k=1, sim_options=sim_options_item)
algo_3_item = KNNBaseline(k=3, min_k=1, sim_options=sim_options_item)
algo_5_item = KNNBaseline(k=5, min_k=1, sim_options=sim_options_item)
algo_2_user = KNNBaseline(k=2, min_k=1, sim_options=sim_options_user)
algo_3_user = KNNBaseline(k=3, min_k=1, sim_options=sim_options_user)
algo_5_user = KNNBaseline(k=5, min_k=1, sim_options=sim_options_user)

algo_2_item.fit(data.build_full_trainset())
algo_3_item.fit(data.build_full_trainset())
algo_5_item.fit(data.build_full_trainset())
algo_2_user.fit(data.build_full_trainset())
algo_3_user.fit(data.build_full_trainset())
algo_5_user.fit(data.build_full_trainset())

In [None]:
expert_matrix = expert_matrix.T

In [None]:
test_set

In [None]:
# convert the test set to a list of tuples
test_set_2 = expert_matrix[int(expert_matrix.shape[0]*0.85):,int(expert_matrix.shape[1]*0.85):]
test_set_list = []
for i in range(test_set_2.shape[0]):
    for j in range(test_set_2.shape[1]):
        if not np.isnan(test_set[i,j]):
            test_set_list.append((user_index[i],tag_index[j],test_set[i,j]))

In [None]:
predictions_2_item = algo_2_item.test(test_set_list)
predictions_3_item = algo_3_item.test(test_set_list)
predictions_5_item = algo_5_item.test(test_set_list)
predictions_2_user = algo_2_user.test(test_set_list)
predictions_3_user = algo_3_user.test(test_set_list)
predictions_5_user = algo_5_user.test(test_set_list)

In [None]:
print(f'RMSE for 2 neighbours item based: {accuracy.rmse(predictions_2_item)}')
print(f'RMSE for 3 neighbours item based: {accuracy.rmse(predictions_3_item)}')
print(f'RMSE for 5 neighbours item based: {accuracy.rmse(predictions_5_item)}')
print(f'RMSE for 2 neighbours user based: {accuracy.rmse(predictions_2_user)}')
print(f'RMSE for 3 neighbours user based: {accuracy.rmse(predictions_3_user)}')
print(f'RMSE for 5 neighbours user based: {accuracy.rmse(predictions_5_user)}')

## Question 6: SVD based


In [None]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

In [None]:
param_grid = {'n_factors': [2, 5, 10], 'n_epochs': [10], 'lr_all': [0.002, 0.005], 'reg_all': [0.02, 0.05]}

In [None]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
data_gs = Dataset.load_from_df(df_melted[['Id','tag','rating']], reader)
gs.fit(data)

In [None]:
best_params = gs.best_params['rmse']
print("Best parameters: ", best_params)

In [None]:
best_svd = SVD(n_factors=best_params['n_factors'], n_epochs=best_params['n_epochs'], lr_all=best_params['lr_all'], reg_all=best_params['reg_all'])

ModuleNotFoundError: No module named 'surprise'

- Best SVD: n_factors=10, n_epochs=10, lr_all=0.005, reg_all=0.02


In [None]:
best_svd.fit(data.build_full_trainset())

In [None]:
predictions_svd = best_svd.test(test_set_list)

In [None]:

print(f'RMSE for best SVD: {accuracy.rmse(predictions_svd)}')

In [None]:
svd_2 = SVD(n_factors=2, n_epochs=10, lr_all=0.005, reg_all=0.02)
svd_5 = SVD(n_factors=5, n_epochs=10, lr_all=0.005, reg_all=0.02)

svd_2.fit(data.build_full_trainset())
svd_5.fit(data.build_full_trainset())

In [None]:
predictions_svd_2 = svd_2.test(test_set_list)
predictions_svd_5 = svd_5.test(test_set_list)

In [None]:
print(f'RMSE for SVD with 2 factors: {accuracy.rmse(predictions_svd_2)}')
print(f'RMSE for SVD with 5 factors: {accuracy.rmse(predictions_svd_5)}')
print(f'RMSE for SVD with 10 factors: {accuracy.rmse(predictions_svd)}')