# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [10]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [27]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")


user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]

merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]],
                          merged_df[["user_id", "item_id"]].itertuples())

train = merged_df.dropna()

test = merged_df[merged_df.isnull().any(axis=1)]

print(test.to_latex())





\begin{tabular}{lrrrrrrrl}
\toprule
Empty DataFrame
Columns: Index(['Sex', ' Over60', 'key', 'user\_id', 'Critic0', ' Critic1', 'item\_id',
       'rating'],
      dtype='object')
Index: Int64Index([], dtype='int64') \\
\bottomrule
\end{tabular}



In [30]:
n_latent_features = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

user_features = user_features_df.values
item_features = item_features_df.values

print(item_features_df.to_latex())


user_features = np.concatenate([np.ones(shape = (user_features.shape[0],1)), user_features], axis = 1)
item_features = np.concatenate([np.ones(shape = (item_features.shape[0],1)), item_features], axis = 1)



user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1],item_features.shape[1] ))



# print user_features

\begin{tabular}{lrrrr}
\toprule
{} &  Critic0 &   Critic1 &  key &  item\_id \\
\midrule
0 &      0.3 &       0.9 &    0 &        0 \\
1 &      0.9 &       0.3 &    0 &        1 \\
2 &      0.6 &       0.4 &    0 &        2 \\
3 &      0.2 &       0.1 &    0 &        3 \\
4 &      0.7 &       0.8 &    0 &        4 \\
\bottomrule
\end{tabular}



In [None]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return err
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print mse
                    
                    
    
                    
                    
    


In [35]:
for _ in range(0,10): 
    sgd()

0.2774882151356727
0.27734165846084846
0.2772213285908355
0.27712223909739203
0.2770407668242187
0.2769742338328237
0.2769206377683887
0.2768784727564912
0.2768466074891672
0.2768242005759273


In [37]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print (user_features_weights)
print (item_features_weights)
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

  

[[-1.591 -1.641  0.956  0.852  0.967]
 [ 3.142  0.955  3.023  0.031  3.662]
 [ 0.951  0.048  0.38   0.24   0.829]
 [ 0.734  0.353  0.998  0.395  0.831]
 [ 0.395  0.551 -0.027  0.9   -0.062]
 [ 0.92   0.628  0.191  0.68   0.989]
 [ 0.555  0.855  0.945  0.438 -0.32 ]
 [ 0.464  0.899  0.966  0.787  0.02 ]
 [ 0.472  0.655  0.666  0.412  1.148]
 [ 0.622  0.434  0.89   0.055 -0.264]]
[[1.551e+00 1.444e+00 1.482e+00 1.326e+00 3.316e-01]
 [4.625e-03 1.048e-04 1.960e-03 3.145e-03 3.518e-03]
 [1.870e+00 4.825e+00 3.805e+00 4.909e+00 3.428e+00]
 [9.966e-01 5.648e-01 5.605e-01 1.409e-01 6.204e-01]
 [6.265e-01 3.785e-01 6.169e-01 6.466e-01 1.014e+00]]


In [42]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)

comparison_data.applymap(lambda xy: "(%2.3f|%2.3f)"%(xy[0],xy[1]))

Unnamed: 0,0,1,2,3,4
0,(8.000|7.984),(2.000|2.064),(nan|-30.740),(5.000|4.961),(4.000|3.992)
1,(3.000|2.896),(2.000|2.536),(nan|87.307),(7.000|6.636),(7.000|6.934)
2,(9.000|9.030),(nan|4.421),(7.000|7.001),(8.000|7.969),(5.000|5.001)
3,(nan|8.969),(nan|4.939),(7.000|7.000),(8.000|8.000),(9.000|9.000)
4,(nan|3.950),(1.000|0.572),(8.000|7.993),(3.000|3.371),(7.000|7.050)
5,(2.000|2.000),(3.000|2.996),(5.000|5.000),(nan|2.050),(nan|36.176)
6,(4.000|4.331),(2.000|0.422),(nan|-5.436),(2.000|3.008),(7.000|7.190)
7,(7.000|6.740),(1.000|2.899),(2.000|2.069),(7.000|5.672),(9.000|8.779)
8,(3.000|3.104),(3.000|2.524),(nan|97.549),(7.000|7.290),(3.000|3.056)
9,(4.000|3.913),(nan|-0.142),(5.000|4.989),(3.000|3.086),(3.000|2.998)


In [43]:
comparison_data

Unnamed: 0,0,1,2,3,4
0,"(8.0, 7.984024638884739)","(2.0, 2.063951276573669)","(nan, -30.74014631201564)","(5.0, 4.961249899788864)","(4.0, 3.9922243502674255)"
1,"(3.0, 2.896479074931704)","(2.0, 2.535884349225416)","(nan, 87.30702190090577)","(7.0, 6.635985137847951)","(7.0, 6.93357806034153)"
2,"(9.0, 9.029647572258058)","(nan, 4.4214547671021185)","(7.0, 7.001242458557483)","(8.0, 7.968597851811825)","(5.0, 5.000592251545798)"
3,"(nan, 8.968764993980841)","(nan, 4.939029068399684)","(7.0, 6.999987396272914)","(8.0, 8.000336095994896)","(9.0, 8.999911114894216)"
4,"(nan, 3.949845353131745)","(1.0, 0.5715069665884411)","(8.0, 7.993372744189076)","(3.0, 3.3705578575672903)","(7.0, 7.049762246275962)"
5,"(2.0, 2.0003219121516387)","(3.0, 2.9955461709298534)","(5.0, 4.999917450433092)","(nan, 2.0497002540178633)","(nan, 36.176362970265565)"
6,"(4.0, 4.33073970409918)","(2.0, 0.4224903630990512)","(nan, -5.436418390985395)","(2.0, 3.007530703119207)","(7.0, 7.189812240972175)"
7,"(7.0, 6.740437717556624)","(1.0, 2.8986410227225683)","(2.0, 2.0692589133211374)","(7.0, 5.671696793425857)","(9.0, 8.779011497220388)"
8,"(3.0, 3.1038187771381938)","(3.0, 2.5235245593192603)","(nan, 97.54870633926299)","(7.0, 7.289992851346297)","(3.0, 3.055645458034933)"
9,"(4.0, 3.91272822083881)","(nan, -0.14192957404870488)","(5.0, 4.9894687787382335)","(3.0, 3.086347363796628)","(3.0, 2.9981515396592413)"


In [45]:
d = comparison_data.to_latex()
text_file = open("comparison.txt", "w")
text_file.write(d)
text_file.close()