# Assignment 2: Content-Based Recommenders

## Importing Libraries

In [35]:
import numpy as np
import pandas as pd

## Loading the Data

In [36]:
# Read the assignment spreadsheet (relative path) into a DataFrame.
doc_data = pd.read_excel(io='Assign_2_pandas.xls')

Displaying the first few rows of the doc_data data frame.

In [37]:
# Preview the first five rows of the raw data.
doc_data.head(5)

Unnamed: 0,baseball,economics,politics,Europe,Asia,soccer,war,security,shopping,family,User 1,User 2
doc1,1,0,1,0,1,1,0,0,0,1,1.0,-1.0
doc2,0,1,1,1,0,0,0,1,0,0,-1.0,1.0
doc3,0,0,0,1,1,1,0,0,0,0,,
doc4,0,0,1,1,0,0,1,1,0,0,,1.0
doc5,0,1,0,0,0,0,0,0,1,1,,


## Part 1: Build and use a very basic profile

Putting the 10 attributes of the documents in a list, and then converting the values in the list from unicode to strings.

In [38]:
# Take the labels of the first ten columns (the document attributes) and
# encode each one to a UTF-8 byte string.
attr_vals = [label.encode('UTF8') for label in doc_data.columns[0:10]]
print(attr_vals)

['baseball', 'economics', 'politics', 'Europe', 'Asia', 'soccer', 'war', 'security', 'shopping', 'family']


Declaring placeholder arrays of size 10 (one entry per attribute, initialised to -999) that will hold the profile of each user.

In [39]:
# One float slot per attribute for each user; -999 is only a placeholder
# that gets overwritten when the profiles are filled in.
n_profile_slots = len(attr_vals)
user_1_prof = np.full(n_profile_slots, -999, dtype=np.double)
user_2_prof = np.full(n_profile_slots, -999, dtype=np.double)

Filling up the profile of each user for each attribute.

In [40]:
# Profile entry for an attribute = sum over all docs of
# (attribute value of the doc) * (the user's rating of the doc).
for pos in range(len(attr_vals)):
    attr_column = doc_data[attr_vals[pos]]
    user_1_prof[pos] = np.sum(attr_column * doc_data['User 1'])
    user_2_prof[pos] = np.sum(attr_column * doc_data['User 2'])

In [41]:
# Display the profile computed for User 1 (entries are in attr_vals order).
user_1_prof

array([ 3., -2., -1.,  0.,  0.,  2., -1., -1.,  1.,  0.])

Checking user_1 and user_2 profiles for consistency with values given in the assignment.

In [42]:
# Spot-check two profile entries against the reference values:
# User 1 / 'family' should be 0 and User 2 / 'Europe' should be 3.
family_pos = attr_vals.index('family')
europe_pos = attr_vals.index('Europe')
print(user_1_prof[family_pos] == 0)
print(user_2_prof[europe_pos] == 3)

True
True


Putting the 20 documents in a list, and then converting the values in the list from unicode to strings.

In [43]:
# Document names are the row labels of doc_data; encode each to a UTF-8 byte string.
doc_vals = [name.encode('UTF8') for name in doc_data.index]

Declaring placeholder lists (one entry per doc, initialised to -999) to store the scores for each user for each doc.

In [44]:
# One slot per document for each user; the -999 placeholders are replaced
# by (score, doc) tuples once the predictions are computed.
n_docs = len(doc_vals)
user_1_doc_scores = [-999] * n_docs
user_2_doc_scores = [-999] * n_docs

Performing a dot product between each doc and the profile for each user. This will give the predicted score for each document.

In [45]:
# Predicted score of a doc for a user = dot product of the doc's attribute
# vector with the user's profile.
# The attribute count comes from attr_vals instead of a hard-coded 10.
n_attrs = len(attr_vals)
for i, doc in enumerate(doc_vals):

    # Attribute values of this doc; .loc is the label-based replacement for
    # the deprecated .ix indexer.
    doc_attrs = doc_data.loc[doc].values[:n_attrs]

    # Calculating the dot product for doc and each user
    doc_score_u1 = np.sum(doc_attrs * user_1_prof)
    doc_score_u2 = np.sum(doc_attrs * user_2_prof)

    # Putting the score for each doc and doc name in tuples (score first, so
    # that sorting the list ranks the docs by score)
    user_1_doc_scores[i] = (doc_score_u1, doc)
    user_2_doc_scores[i] = (doc_score_u2, doc)

Printing the score and the doc that user_1 likes the most

In [46]:
# Tuples sort by score first, so the last element of the ascending order is
# the doc with the highest predicted score for User 1.
ranked_u1 = sorted(user_1_doc_scores)
print(ranked_u1[-1])

(6.0, 'doc16')


How many documents does the model predict user_2 will dislike (prediction score that is negative)?

In [47]:
# Count the docs whose predicted score for User 2 is strictly negative.
print(sum(1 for score, doc_name in user_2_doc_scores if score < 0.0))

4


## Part 2

Building a normalized dataframe

In [48]:
# All column labels of doc_data (attributes plus user columns), each encoded
# to a UTF-8 byte string.
col_names = [label.encode('UTF8') for label in doc_data.columns]

Defining empty dataframe with the same column names as doc_data dataframe

In [49]:
# Frame with no data yet: one row per doc and the same column labels as
# doc_data. The rows are filled in afterwards.
norm_doc_data = pd.DataFrame(columns=col_names, index=doc_vals)

Filling the values of the norm_doc_data dataframe. The values of the attributes are normalized so that each doc's attribute vector has unit length, and then the columns User 1 and User 2 are added unchanged.

In [50]:
# Fill norm_doc_data row by row: unit-length attribute vector followed by the
# two user ratings, which are copied over unchanged.
for doc in doc_vals:

    # First, adding values of attributes to a np array
    # (.loc is the label-based replacement for the deprecated .ix indexer)
    doc_attrs = np.array(list(doc_data[attr_vals].loc[doc]), dtype=np.double)

    # Normalizing the attributes to unit (Euclidean) length. For 0/1
    # attributes this equals dividing by sqrt(number of attributes present),
    # but it stays correct if the attribute values are not binary.
    doc_norm = np.sqrt(np.sum(doc_attrs ** 2))
    # A doc with no attributes has norm 0; leave its all-zero vector as is
    # instead of dividing by zero and filling the row with NaN.
    if doc_norm > 0:
        doc_attrs = doc_attrs / doc_norm

    # Adding the User 1 and User 2 scores to the doc list
    doc_ratings = np.array([doc_data['User 1'].loc[doc], doc_data['User 2'].loc[doc]], dtype=np.double)
    doc_list = np.append(doc_attrs, doc_ratings)

    # Adding doc list array to the norm_doc_data dataframe
    norm_doc_data.loc[doc] = doc_list

Looking at the head of the norm_doc_data dataframe

In [51]:
# Preview the first five rows of the normalized data.
norm_doc_data.head(5)

Unnamed: 0,baseball,economics,politics,Europe,Asia,soccer,war,security,shopping,family,User 1,User 2
doc1,0.447214,0.0,0.447214,0.0,0.447214,0.447214,0.0,0.0,0.0,0.447214,1.0,-1.0
doc2,0.0,0.5,0.5,0.5,0.0,0.0,0.0,0.5,0.0,0.0,-1.0,1.0
doc3,0.0,0.0,0.0,0.57735,0.57735,0.57735,0.0,0.0,0.0,0.0,,
doc4,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.0,0.0,,1.0
doc5,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,,


Filling up the profile of each user for each attribute, but now with norm_doc_data instead of doc_data

In [52]:
# Rebuild both profiles from the normalized attributes. This overwrites the
# entries of user_1_prof / user_2_prof in place.
for pos in range(len(attr_vals)):
    norm_attr_column = norm_doc_data[attr_vals[pos]]
    user_1_prof[pos] = np.sum(norm_attr_column * norm_doc_data['User 1'])
    user_2_prof[pos] = np.sum(norm_attr_column * norm_doc_data['User 2'])

Performing a dot product between each doc and the profile for each user. This will give the predicted score for each document. Now doing this with norm_doc_data.

In [53]:
# Predicted score of a doc for a user = dot product of the doc's normalized
# attribute vector with the user's profile.
# The attribute count comes from attr_vals instead of a hard-coded 10.
n_attrs = len(attr_vals)
for i, doc in enumerate(doc_vals):

    # Normalized attribute values of this doc; .loc is the label-based
    # replacement for the deprecated .ix indexer.
    doc_attrs = norm_doc_data.loc[doc].values[:n_attrs]

    # Calculating the dot product for doc and each user
    doc_score_u1 = np.sum(doc_attrs * user_1_prof)
    doc_score_u2 = np.sum(doc_attrs * user_2_prof)

    # Putting the score for each doc and doc name in tuples; this replaces
    # the entries already stored in the score lists
    user_1_doc_scores[i] = (doc_score_u1, doc)
    user_2_doc_scores[i] = (doc_score_u2, doc)

Checking user_1, doc1 and user_2, doc7, doc19 prediction for consistency with values given in the assignment.

In [54]:
# Compare three predictions with the reference values, which are quoted to
# four decimals. The tolerance has to be 1.0e-3: a tolerance of 1.0e3 would
# make every comparison True regardless of the computed score.
tol = 1.0e-3
print(abs(user_1_doc_scores[0][0] - 1.0090) < tol)
print(abs(user_2_doc_scores[6][0] - 0.7444) < tol)
print(abs(user_2_doc_scores[18][0] - 0.4834) < tol)

True
True
True


For User 1, what is the second most liked doc now? What is the prediction score for this doc?

In [55]:
# Tuples sort by score first, so the second-to-last element of the ascending
# order is User 1's second most liked doc.
ranked_u1 = sorted(user_1_doc_scores)
print(ranked_u1[-2])

(1.370922665887427, 'doc6')


## Part 3

Now, we want to add an inverse document frequency (IDF) weighting to the predictions made with the norm_doc_data dataframe. First, we compute the document frequency of each attribute (the number of docs that contain it) below.

In [56]:
# Document frequency: column sums of the raw attribute values, as floats.
attr_totals = doc_data[attr_vals].sum()
doc_freq = attr_totals.values.astype(np.double)

Computing the inverse doc frequency.

In [57]:
# Element-wise reciprocal of the document frequencies.
iv_doc_freq = np.ones_like(doc_freq) / doc_freq

Recomputing the user_1 and user_2 doc scores, now weighting each attribute's term in the dot product by its inverse document frequency.

In [58]:
# IDF-weighted prediction: each attribute's term in the dot product of the
# normalized doc vector and the user profile is scaled by iv_doc_freq.
# The attribute count comes from attr_vals instead of a hard-coded 10.
n_attrs = len(attr_vals)
for i, doc in enumerate(doc_vals):

    # Normalized attribute values of this doc; .loc is the label-based
    # replacement for the deprecated .ix indexer.
    doc_attrs = norm_doc_data.loc[doc].values[:n_attrs]

    # Calculating the weighted dot product for doc and each user
    doc_score_u1 = np.sum(doc_attrs * user_1_prof * iv_doc_freq)
    doc_score_u2 = np.sum(doc_attrs * user_2_prof * iv_doc_freq)

    # Putting the score for each doc and doc name in tuples; this replaces
    # the entries already stored in the score lists
    user_1_doc_scores[i] = (doc_score_u1, doc)
    user_2_doc_scores[i] = (doc_score_u2, doc)

Checking user_1, doc1 prediction for consistency with value given in the assignment.

In [59]:
# Compare the doc1 prediction for User 1 with the reference value, which is
# quoted to four decimals. The tolerance has to be 1.0e-3: a tolerance of
# 1.0e3 would make the comparison True regardless of the computed score.
print(abs(user_1_doc_scores[0][0] - 0.2476) < 1.0e-3)

True


Compare doc1 and doc9 for user1. What’s user1’s prediction for doc9 in the new IDF weighted model?

In [60]:
# Show User 1's (score, doc) tuples stored at positions 0 and 8 of the score list.
for pos in (0, 8):
    print(user_1_doc_scores[pos])

(0.2476124657905287, 'doc1')
(0.1790671937654306, 'doc9')


Look at doc6 for user_2. It was moderately positive before and now is slightly negative.

In [61]:
# Show User 2's (score, doc) tuple stored at position 5 of the score list.
doc6_entry_u2 = user_2_doc_scores[5]
print(doc6_entry_u2)

(-0.08469536214019151, 'doc6')
