In [2]:
from movielens import *
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
import sys
import time
import random
import math
import operator
import pickle
# Empty containers handed to the Dataset.load_* calls further down, which are
# expected to populate them (their lengths are read right after loading).
user, item, rating, rating_test = [], [], [], []
import skfuzzy as fuzz
import matplotlib.pyplot as plt

# Load users, items and the train/test rating files into the module-level lists.
d = Dataset()
for _load, _path, _dest in (
    (d.load_users, "data/u.user", user),
    (d.load_items, "data/u.item", item),
    (d.load_ratings, "data/u.base", rating),
    (d.load_ratings, "data/u.test", rating_test),
):
    _load(_path, _dest)

n_users, n_items = len(user), len(item)

# Dense user x item matrix of training ratings. Ids are shifted down by one to
# obtain 0-based indices; cells that are never written stay at 0.
utility = np.zeros((n_users, n_items))
for r in rating:
    utility[r.user_id - 1, r.item_id - 1] = r.rating

# Store on every user object the mean of that user's non-zero entries in the
# utility matrix; a user whose row is entirely zero gets an average of 0.
for i in range(n_users):
    row = utility[i]
    seen = row[row != 0]  # boolean mask keeps only the non-zero ratings
    user[i].avg_r = np.mean(seen) if seen.size != 0 else 0.

# print (utility)
# Held-out ratings, laid out exactly like the training utility matrix.
test = np.zeros((n_users, n_items))
for r in rating_test:
    test[r.user_id - 1, r.item_id - 1] = r.rating

# One row per movie made of its 19 genre attributes (presumably 0/1 flags --
# verify against movielens.Item), always in this column order.
_genre_fields = (
    "unknown", "action", "adventure", "animation", "childrens", "comedy",
    "crime", "documentary", "drama", "fantasy", "film_noir", "horror",
    "musical", "mystery", "romance", "sci_fi", "thriller", "war", "western",
)
movie_genre = np.array([[getattr(movie, field) for field in _genre_fields] for movie in item])
# print (movie_genre)
# cntr, u_orig, _, _, _, _, _ = fuzz.cluster.cmeans( movie_genre, 7, 2, error=0.005, maxiter=1000)

# # fig2, ax2 = plt.subplots()
# # ax2.set_title('Trained model')
# # for j in range(7):
# #     ax2.plot(movie_genre[0, u_orig.argmax(axis=0) == j],
# #              movie_genre[1, u_orig.argmax(axis=0) == j], 'o',
# #              label='series ' + str(j))
# # ax2.legend()
# # plt.show()
# print (u_orig.argmax(axis=0) == 3)
# print (movie_genre[0,u_orig.argmax(axis=0) == 1])

# Settings for the hand-written fuzzy c-means clustering defined below.
# num_attr: matches the 19 genre attributes collected per movie; it is not
# referenced anywhere else in the visible code.
num_attr = 19
# k: number of clusters the movies are partitioned into.
k = 19
# MAX_ITER: bound of the clustering loop, which runs MAX_ITER + 1 passes.
MAX_ITER = 1
# n: number of data points to cluster (one per movie).
n = n_items
# m: exponent applied to membership values (also used as 2 / (m - 1) when
# memberships are updated), i.e. the fuzzifier of fuzzy c-means.
m = 2.00
def initializeMembershipMatrix(n_points=None, n_clusters=None):
    """Build a random membership matrix whose rows each sum to 1.

    Parameters
    ----------
    n_points : int, optional
        Number of rows (data points). Defaults to the module-level ``n``.
    n_clusters : int, optional
        Number of columns (clusters). Defaults to the module-level ``k``.

    Returns
    -------
    list[list[float]]
        ``n_points`` rows of ``n_clusters`` values drawn with
        ``random.random()`` and divided by their row total.
    """
    # Fall back to the notebook globals only when the caller gave no sizes, so
    # existing zero-argument calls behave exactly as before.
    if n_points is None:
        n_points = n
    if n_clusters is None:
        n_clusters = k

    membership_mat = []
    for _ in range(n_points):
        weights = [random.random() for _ in range(n_clusters)]
        total = sum(weights)
        membership_mat.append([w / total for w in weights])
    return membership_mat

def calculateClusterCenter(membership_mat, data=None, fuzziness=None):
    """Compute the fuzzy c-means centre of every cluster.

    For cluster ``j`` the centre is the average of the data points weighted by
    ``membership ** fuzziness``:
    ``C_j = sum_i(u_ij ** f * x_i) / sum_i(u_ij ** f)``.

    Parameters
    ----------
    membership_mat : sequence of sequences
        One row per data point, one column per cluster.
    data : sequence of sequences, optional
        Attribute vectors, one per data point. Defaults to the module-level
        ``movie_genre``.
    fuzziness : float, optional
        Exponent applied to the memberships. Defaults to the module-level ``m``.

    Returns
    -------
    list[list]
        One centre (a list with one value per attribute) per cluster.

    Raises
    ------
    ValueError
        If ``data`` and ``membership_mat`` have a different number of rows.
    """
    points = movie_genre if data is None else data
    exponent = m if fuzziness is None else fuzziness
    if len(points) != len(membership_mat):
        raise ValueError("data and membership_mat must have the same number of rows")

    cluster_centers = []
    # zip(*matrix) transposes: each item is the membership column of one cluster.
    for column in zip(*membership_mat):
        weights = [u ** exponent for u in column]
        total_weight = sum(weights)

        # Every data point scaled by its weight for this cluster.
        weighted_points = [[w * val for val in point] for w, point in zip(weights, points)]

        # Attribute-wise sum over all points, divided by the total weight.
        center = [col_sum / total_weight for col_sum in map(sum, zip(*weighted_points))]
        cluster_centers.append(center)
    return cluster_centers


def updateMembershipValue(membership_mat, cluster_centers, data=None, fuzziness=None):
    """Recompute, in place, the membership of every data point to every cluster.

    The standard fuzzy c-means update is used:
    ``u_ij = 1 / sum_c (d_ij / d_ic) ** (2 / (fuzziness - 1))`` where ``d_ij``
    is the Euclidean distance between point ``i`` and centre ``j``.

    A point lying exactly on one or more centres would make that formula
    evaluate 0/0 (NaN, which then contaminates every later iteration). Such a
    point instead gets its whole membership split evenly between the
    coinciding centres and 0 for all others.

    Parameters
    ----------
    membership_mat : sequence of mutable sequences
        One row per data point, one column per cluster; overwritten in place.
    cluster_centers : sequence of sequences
        One attribute vector per cluster.
    data : sequence of sequences, optional
        Attribute vectors, one per data point. Defaults to the module-level
        ``movie_genre``.
    fuzziness : float, optional
        Fuzzifier; defaults to the module-level ``m``.

    Returns
    -------
    The same ``membership_mat`` object, updated.

    Raises
    ------
    ValueError
        If ``data`` and ``membership_mat`` have a different number of rows.
    """
    points = movie_genre if data is None else data
    exponent = m if fuzziness is None else fuzziness
    if len(points) != len(membership_mat):
        raise ValueError("data and membership_mat must have the same number of rows")

    p = float(2 / (exponent - 1))
    n_clusters = len(cluster_centers)
    for i, point in enumerate(points):
        x = list(point)

        # Euclidean distance from this point to every cluster centre.
        distances = [np.linalg.norm(list(map(operator.sub, x, center)))
                     for center in cluster_centers]

        # Centres that coincide with the point (distance exactly 0).
        coincident = [j for j, dist in enumerate(distances) if dist == 0]

        for j in range(n_clusters):
            if coincident:
                membership_mat[i][j] = 1.0 / len(coincident) if j in coincident else 0.0
            else:
                den = sum(math.pow(float(distances[j] / distances[c]), p)
                          for c in range(n_clusters))
                membership_mat[i][j] = float(1 / den)
    return membership_mat


def getClusters(membership_mat):
    """Assign each data point to the cluster it belongs to most strongly.

    Parameters
    ----------
    membership_mat : sequence of sequences
        One row per data point, one membership value per cluster.

    Returns
    -------
    list[int]
        For every row, the 0-based index of its largest membership value. When
        several clusters tie for the maximum, the highest index wins.
    """
    # Iterate over the rows actually passed in rather than a global row count,
    # so the function works for a matrix of any size.
    # Keying on (value, index) reproduces the tie-break of comparing
    # (value, index) tuples directly.
    return [
        max(range(len(row)), key=lambda idx: (row[idx], idx))
        for row in membership_mat
    ]


def fuzzyCMeansClustering():
    """Run fuzzy c-means starting from a random membership matrix.

    Each pass recomputes the cluster centres from the current memberships and
    then the memberships from those centres. The loop performs
    ``MAX_ITER + 1`` passes (the bound is inclusive).

    Returns
    -------
    (cluster_labels, cluster_centers)
        ``cluster_labels`` is the hard assignment derived from the final
        membership matrix. ``cluster_centers`` are the centres computed in the
        last pass, i.e. before the final membership update.

    Raises
    ------
    ValueError
        If ``MAX_ITER`` is negative, in which case no pass would run and there
        would be no centres to return.
    """
    if MAX_ITER < 0:
        raise ValueError("MAX_ITER must be >= 0")

    membership_mat = initializeMembershipMatrix()
    for _ in range(MAX_ITER + 1):
        cluster_centers = calculateClusterCenter(membership_mat)
        membership_mat = updateMembershipValue(membership_mat, cluster_centers)

    # Labels depend only on the final memberships, so derive them once here
    # instead of recomputing and discarding them on every pass.
    cluster_labels = getClusters(membership_mat)
    return cluster_labels, cluster_centers


labels, centers = fuzzyCMeansClustering()

# Collapse the user x movie matrix into a user x cluster matrix: each cell is
# the mean of the user's ratings for the movies of that cluster, or 0 when the
# user rated none of them.
# The loop variables are deliberately not called `m`: that name is the global
# fuzzifier read by the clustering functions and must not be rebound here.
utility_clustered = np.zeros((n_users, k))
for i in range(n_users):
    ratings_by_cluster = [[] for _ in range(k)]
    for j in np.nonzero(utility[i])[0]:
        # NOTE: labels are 0-based, so "- 1" sends cluster 0 to the last column
        # (negative index) and shifts every other cluster down by one. The
        # evaluation code indexes predictions with the same `labels[j] - 1`,
        # so the mapping is consistent; change both places or neither.
        ratings_by_cluster[labels[j] - 1].append(utility[i][j])
    for col, cluster_ratings in enumerate(ratings_by_cluster):
        if cluster_ratings:
            utility_clustered[i][col] = np.mean(cluster_ratings)

print (utility_clustered)


[[ 3.75        0.          3.9673913  ...,  0.          0.          0.        ]
 [ 0.          0.          3.83333333 ...,  0.          0.          0.        ]
 [ 0.          0.          3.11764706 ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          4.4        ...,  0.          0.          0.        ]
 [ 4.          0.          3.79591837 ...,  0.          0.          0.        ]]


In [3]:
# Replace each user's average with the mean of their positive cluster-level
# ratings. A user with no positive entry gets 0 -- the same fallback used when
# the average is first computed from the raw ratings -- instead of a 0/0 NaN
# that would propagate into every similarity and prediction for that user.
for i in range(n_users):
    rated_means = [a for a in utility_clustered[i] if a > 0]
    user[i].avg_r = sum(rated_means) / len(rated_means) if rated_means else 0.

def pcs(x, y):
    """Pearson-style similarity between two users over the clustered ratings.

    ``x`` and ``y`` are 1-based user ids. The numerator sums the products of
    mean-centred ratings over the clusters both users have a positive value
    for; each denominator term sums squared mean-centred ratings over all
    clusters that user has a positive value for. Returns 0 when the
    denominator is 0.
    """
    row_x = utility_clustered[x - 1]
    row_y = utility_clustered[y - 1]
    mean_x = user[x - 1].avg_r
    mean_y = user[y - 1].avg_r

    covariance = sum(
        (a - mean_x) * (b - mean_y)
        for a, b in zip(row_x, row_y)
        if a > 0 and b > 0
    )
    spread_x = sum((a - mean_x) ** 2 for a in row_x if a > 0)
    spread_y = sum((b - mean_y) ** 2 for b in row_y if b > 0)

    den = (spread_x ** 0.5) * (spread_y ** 0.5)
    return 0 if den == 0 else covariance / den

# Pairwise user similarity; the diagonal stays 0 because a user is never
# compared with themselves.
pcs_matrix = np.zeros((n_users, n_users))
for i in range(0, n_users):
    for j in range(0, n_users):
        if i != j:
            pcs_matrix[i][j] = pcs(i + 1, j + 1)
    # Report progress once per row. Writing to stdout and sleeping for every
    # single pair only slowed down a loop of roughly n_users ** 2 iterations.
    sys.stdout.write("\rGenerating Similarity Matrix [%d:%d] = %f" % (i+1, j+1, pcs_matrix[i][j]))
    sys.stdout.flush()
print ("\rGenerating Similarity Matrix [%d:%d] = %f" % (i+1, j+1, pcs_matrix[i][j]))

Generating Similarity Matrix [943:863] = 0.6362597

In [4]:
def norm():
    """Return the mean-centred copy of the clustered utility matrix.

    The result has shape (n_users, 19). A non-zero cell becomes the value
    minus that user's ``avg_r``; a zero cell is marked with infinity.
    """
    # Start from "everything missing" and overwrite only the rated cells.
    normalize = np.full((n_users, 19), float('Inf'))
    for i in range(n_users):
        row = utility_clustered[i]
        for j in range(19):
            if row[j] != 0:
                normalize[i][j] = row[j] - user[i].avg_r
    return normalize

def guess(user_id, i_id, top_n):
    """Predict a user's rating for one cluster from their most similar users.

    Parameters
    ----------
    user_id : int
        1-based id of the user to predict for.
    i_id : int
        1-based column of ``utility_clustered`` to predict.
    top_n : int
        Number of most-similar other users to consult. If fewer other users
        exist, all of them are used.

    Returns
    -------
    float
        The user's ``avg_r`` plus the mean of the neighbours' mean-centred
        ratings for that column (neighbours without a rating there are
        skipped), clamped to the range [1.0, 5.0]. When no consulted neighbour
        has a rating the user's own ``avg_r`` is used.
    """
    target = user_id - 1
    column = i_id - 1

    # Rank every other user by similarity to the target, most similar first.
    # sorted() is stable, so users with equal similarity keep ascending order.
    others = [u for u in range(n_users) if u != target]
    ranked = sorted(others, key=lambda u: pcs_matrix[target][u], reverse=True)

    # Mean-centre only the single cell needed from each neighbour. Rebuilding
    # the whole normalised matrix on every call was the dominant cost here.
    s = 0
    c = 0
    for u in ranked[:top_n]:
        value = utility_clustered[u][column]
        if value != 0:
            s += value - user[u].avg_r
            c += 1

    g = user[target].avg_r if c == 0 else s/float(c) + user[target].avg_r
    if g < 1.0:
        return 1.0
    elif g > 5.0:
        return 5.0
    else:
        return g

# Fill every empty (zero) cell of the clustered matrix with a prediction.
# Predictions are written to a copy; guess() works from utility_clustered, so
# an earlier guess never feeds a later one.
utility_copy = np.copy(utility_clustered)
for i in range(0, n_users):
    for j in range(0, 19):
        if utility_copy[i][j] == 0:
            sys.stdout.write("\rGuessing [User:Rating] = [%d:%d]" % (i, j))
            sys.stdout.flush()
            utility_copy[i][j] = guess(i+1, j+1, 150)
print ("\rGuessing [User:Rating] = [%d:%d]" % (i, j))

# The context manager guarantees the pickle file is flushed and closed.
with open("utility_matrix.pkl", "wb") as pkl_file:
    pickle.dump(utility_copy, pkl_file)


Guessing [User:Rating] = [942:18]


In [7]:
# Collect (actual, predicted) pairs for every held-out rating and log the
# predictions to test.txt as "user_id, item_id, prediction".
y_true = []
y_pred = []
# `with` closes the file even if a lookup or write fails part-way through.
with open('test.txt', 'w') as f:
    for i in range(0, n_users):
        for j in range(0, n_items):
            if test[i][j] > 0:
                # The prediction for a movie is the user's value for the
                # movie's cluster column (same `labels[j] - 1` indexing that
                # was used when the clustered matrix was built).
                predicted = utility_copy[i][labels[j]-1]
                f.write("%d, %d, %.4f\n" % (i+1, j+1, predicted))
                y_true.append(test[i][j])
                y_pred.append(predicted)


In [8]:
# Mean squared error between the held-out ratings and their predictions.
mse = mean_squared_error(y_true, y_pred)
print ("Mean Squared Error: %f" % mse)

Mean Squared Error: 1.121552
