In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd

In [None]:
# Load the pipe-delimited user table; column names are supplied explicitly via `names`
header = ['user_id', 'age', 'sex', 'occupation','zipcode']
users_data = pd.read_csv(
    'C:\\Users\\Sirivamsi\\Documents\\Big Data\\Project\\Project Files\\ml-100k\\u.user',
    sep='|',
    names=header,
)

In [None]:
# Load the tab-delimited ratings table (one row per user/item rating); column names come from `names`
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'C:\\Users\\Sirivamsi\\Documents\\Big Data\\Project\\Project Files\\ml-100k\\u.data',
    sep='\t',
    names=header,
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 0.3% of the rating rows as the test set. random_state pins the
# shuffle so a fresh kernel run reproduces the same split (and therefore the
# same error figures) instead of drawing a new random split every time.
train_data, test_data = train_test_split(df, test_size=0.003, random_state=42)

In [None]:
# Attach each user's demographic columns to their rating rows (join on user_id).
# NOTE: both names are re-bound to the merged frames, so re-running this cell
# without re-running the split merges the already-merged frames again.
train_data = users_data.merge(train_data, on='user_id')
test_data = users_data.merge(test_data, on='user_id')

In [None]:
# Collect every distinct occupation value found in the users table
occupation_list = np.asarray(users_data['occupation'].unique())
print (occupation_list)

In [None]:
# Count the distinct user ids and item ids present in the ratings table.
# NOTE(review): later cells size the rating matrix with these counts and index
# it by `id - 1`, which presumably relies on ids running 1..N without gaps - verify.
n_users = len(df['user_id'].unique())
n_items = len(df['item_id'].unique())
print ("Number of users = {} | Number of movies = {}".format(n_users, n_items))

In [None]:
# Defining the Pearson Correlation Similarity function
from sklearn.metrics.pairwise import cosine_similarity
def pearson_correlation_similarity(matrix):
    """Row-wise similarity after mean-centring each row.

    Each row has its own NaN-ignoring mean subtracted, remaining NaNs are
    replaced by 0, and the cosine similarity between the centred rows is
    returned with the diagonal (self-similarity) forced to 0.

    Parameters
    ----------
    matrix : 2-D array, one row per entity being compared.

    Returns
    -------
    Square array of shape (rows, rows).
    """
    row_means = np.nanmean(matrix, axis=1, keepdims=True)
    centred_rows = np.nan_to_num(matrix - row_means)
    row_similarity = cosine_similarity(centred_rows)
    # An entity must never be picked as its own neighbour
    np.fill_diagonal(row_similarity, 0)
    return row_similarity

In [None]:
# Defining the Adjusted Cosine Similarity function
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
def adjusted_cosine_similarity(matrix):
    """Column-wise similarity after mean-centring each row.

    Every row has its own NaN-ignoring mean subtracted, remaining NaNs are
    replaced by 0, and the cosine similarity is then taken between the
    *columns* of the centred matrix. The diagonal is forced to 0.

    Parameters
    ----------
    matrix : 2-D array; rows are centred, columns are compared.

    Returns
    -------
    Square array of shape (columns, columns).
    """
    row_means = np.nanmean(matrix, axis=1, keepdims=True)
    centred = np.nan_to_num(matrix - row_means)
    # Transpose so that the columns become the vectors being compared
    column_similarity = cosine_similarity(centred.T)
    np.fill_diagonal(column_similarity, 0)
    return column_similarity

In [None]:
# Function definition for making groups and calculating the user similarity matrices
from sklearn.metrics.pairwise import cosine_similarity
# Takes arguments occupation type, the training/testing matrix and user similarity metric type for calculating the similarity between users
def makegroups(occupation,data_matrix,usersimilaritymetric):
    """Group users by occupation and, for training data, compute their similarities.

    Reads the notebook globals ``train_data`` (to recognise the training
    frame) and ``train_data_matrix`` (the user x item rating matrix, indexed
    by ``user_id - 1``).

    Parameters
    ----------
    occupation : value matched against the ``occupation`` column.
    data_matrix : DataFrame with ``occupation`` and ``user_id`` columns.
        It is treated as the training frame when its row count equals that
        of the global ``train_data``; any other frame is treated as test data.
    usersimilaritymetric : ``'cosine'`` or ``'pearsoncorrelation'``.
        Only consulted for the training frame.

    Returns
    -------
    Training frame: array of shape (g, g + 1) for the g users of the group.
        Column 0 holds the sorted user ids, the remaining columns the
        user-user similarities with a zero diagonal. An occupation with no
        users yields an empty (0, 1) array.
    Any other frame: sorted 1-D array of the user ids in the group.

    Raises
    ------
    ValueError
        If the training frame is passed with an unknown similarity metric.
    """
    in_group = data_matrix.loc[data_matrix['occupation'] == occupation]
    listofusers = np.unique(in_group['user_id'].values)

    # NOTE: train vs. test is decided purely by row count, as callers pass
    # either the global train_data or the (differently sized) test frame.
    if data_matrix.shape[0] != train_data.shape[0]:
        return listofusers

    if usersimilaritymetric not in ('cosine', 'pearsoncorrelation'):
        # Previously an unknown metric silently returned an all-zero block.
        raise ValueError("Unknown user similarity metric: " + repr(usersimilaritymetric))

    if listofusers.size == 0:
        # Nothing to compare; keep the "ids in column 0" layout.
        return np.zeros((0, 1))

    # One fancy-indexing step replaces the row-by-row np.append loop.
    group_matrix = train_data_matrix[listofusers - 1]

    if usersimilaritymetric == 'cosine':
        user_similarity = cosine_similarity(group_matrix)
        np.fill_diagonal(user_similarity, 0)
    else:
        user_similarity = pearson_correlation_similarity(group_matrix)

    # Prefix every similarity row with the id of the user it belongs to
    return np.concatenate((listofusers[:, np.newaxis], user_similarity), axis=1)


In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
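# Function to compute rating predictions for a test group and collect them next to the original ratings
# (the Mean Absolute Error itself is computed by the caller from the two returned arrays).
# Takes the arguments test group, user similarity matrix computed on training data, predictions and original values so far,
# k - the fraction of most similar users to keep (e.g. 0.25, 0.5, 0.75 or 1.0), the type of item similarity metric to be used
# and the number of most similar items to use for each prediction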
def calcpredandmaetest(testgroup,user_similarity_matrix, prediction, originalvalues, k, itemsimilaritymetric, numofitems):
    """Predict the held-out ratings of every user in ``testgroup``.

    Reads the notebook globals ``n_items``, ``train_data_matrix`` (user x item
    ratings indexed by ``id - 1``, 0 meaning "not rated") and ``test_data``
    (frame with ``user_id``, ``item_id`` and ``rating`` columns).

    For each test user found in column 0 of ``user_similarity_matrix``:
      1. keep the top fraction ``k`` of that user's most similar group members,
      2. build an item-item similarity matrix from those members' ratings,
      3. for every test row of the user, predict the rating as the
         similarity-weighted average of the user's own training ratings over
         the ``numofitems`` items most similar to the target item.

    Parameters
    ----------
    testgroup : array-like of user ids (may be empty).
    user_similarity_matrix : array whose column 0 holds user ids and whose
        remaining columns hold the user-user similarities.
    prediction, originalvalues : values accumulated so far; new results are
        appended to them.
    k : fraction of most similar users to keep (1 keeps all of them).
    itemsimilaritymetric : ``'cosine'`` or ``'adjustedcosine'``.
    numofitems : number of most similar items used per prediction. Values
        larger than ``n_items`` use every item.

    Returns
    -------
    (originalvalues, prediction) as NumPy arrays, in the order rows were visited.

    Raises
    ------
    ValueError
        If ``itemsimilaritymetric`` is not one of the supported names.
    """
    if itemsimilaritymetric not in ('cosine', 'adjustedcosine'):
        # Previously this surfaced as a NameError or silently reused a stale matrix.
        raise ValueError("Unknown item similarity metric: " + repr(itemsimilaritymetric))

    group_user_ids = user_similarity_matrix[:, 0]
    noofrows = user_similarity_matrix.shape[0]
    # Clamp at 0: a negative start would wrap around and silently keep
    # fewer items than requested when numofitems > n_items.
    first_ranked_item = max(n_items - numofitems, 0)

    new_predictions = []
    new_originals = []

    # ravel() copes with an empty group, which np.nditer rejects.
    for user_id in np.asarray(testgroup).ravel():
        matches = np.flatnonzero(group_user_ids == user_id)
        if matches.size == 0:
            # Test user is not part of the group's similarity matrix
            continue
        user_row = matches[0]

        # Ascending argsort: the most similar users sit at the end
        ranked_users = np.argsort(user_similarity_matrix[user_row, 1:noofrows + 1])
        totalcount = len(ranked_users)
        getnumofusers = np.rint(totalcount * (1 - k)).astype(int)
        neighbour_ids = group_user_ids[ranked_users[getnumofusers:]].astype(int)
        user_item_matrix = train_data_matrix[neighbour_ids - 1]

        if itemsimilaritymetric == 'cosine':
            item_similarity = cosine_similarity(user_item_matrix.T)
            np.fill_diagonal(item_similarity, 0)
        else:
            item_similarity = adjusted_cosine_similarity(user_item_matrix)

        # Visit only this user's test rows instead of scanning the whole frame
        user_test_rows = test_data.loc[test_data['user_id'] == user_id]
        for rating_row in user_test_rows.itertuples(index=False):
            print ("User id :" + str(rating_row.user_id))
            print ("Item id :" + str(rating_row.item_id))
            print ("User specified Rating :" + str(rating_row.rating))

            target_sims = item_similarity[rating_row.item_id - 1]
            ranked_items = np.argsort(target_sims)[first_ranked_item:n_items]
            own_ratings = train_data_matrix[rating_row.user_id - 1, ranked_items].astype(int)
            rated = own_ratings != 0
            used_sims = target_sims[ranked_items][rated]

            numerator = np.sum(own_ratings[rated] * used_sims)
            # Normalise by |similarity| so negative similarities cannot
            # cancel the denominator and blow the prediction out of range.
            denominator = np.sum(np.abs(used_sims))
            if denominator == 0:
                predictedvalue = 0
            else:
                predictedvalue = numerator / denominator

            print ("Prediction :" + str(int(round(predictedvalue))))
            new_predictions.append(predictedvalue)
            new_originals.append(rating_row.rating)

    prediction = np.append(prediction, new_predictions)
    originalvalues = np.append(originalvalues, new_originals)
    # Returning the original values and prediction for every group
    return originalvalues, prediction

In [None]:
# Can change the values here and test the code
import numpy as np
results = []
# Accumulators for the predicted and the original ratings. The predictions
# are deliberately not stored under the name `pd`: that would rebind the
# pandas alias and break every earlier cell that is re-run afterwards.
predictions = []
ov = []
# Number of most similar items used for each prediction
itemsizevalue = 400
# Fraction of most similar users kept per group; written as a float literal
# so the value cannot collapse to 0 under integer division
k = 0.5
# Dense user x item training matrix, indexed by (user_id - 1, item_id - 1); 0 means "not rated"
train_data_matrix = np.zeros((n_users, n_items))
train_data_matrix[train_data['user_id'].values - 1,
                  train_data['item_id'].values - 1] = train_data['rating'].values
# Similarity metrics: cosine between users, adjusted cosine between items
usersimilaritymetric = 'cosine'
itemsimilaritymetric = 'adjustedcosine'
# Predict the held-out ratings for every listed occupation group
for occupation in ['student']:
    similaritymatrix = makegroups(occupation,train_data,usersimilaritymetric)
    testdata = makegroups(occupation,test_data,usersimilaritymetric)
    ov, predictions = calcpredandmaetest(testdata,similaritymatrix,predictions,ov,k,itemsimilaritymetric,itemsizevalue)
# Mean Absolute Error between the original and the predicted ratings
mae = mean_absolute_error(ov,predictions)
print ("Mean Absolute Error " + str(mae))

