In [1]:
import pandas as pd
import numpy as np
from numpy import linalg

---
### About this notebook:
In this chapter we begin looking at Singular Value Decomposition to detect latent features and provide recommendations.

---
### Create Toy dataset:

In [2]:
# Toy user-by-movie ratings matrix: one row per user, one column per movie.
# np.nan marks a movie that the user has not rated.
user_names = ['Sara', 'Jesper', 'Therese', 'Helle', 'Pietro', 'Ekaterina']
movie_ratings = {
    'MIB': [5, 4, 5, 3, 3, 2],
    'ST': [3, 3, 2, 5, 3, 3],
    'AV': [np.nan, 4, 5, 3, 3, 2],
    'BH': [2, np.nan, 2, np.nan, 2, 3],
    'SS': [2, 3, 1, 1, 4, 5],
    'LM': [2, 3, 1, 1, 5, 5],
}
toy_df = pd.DataFrame(movie_ratings, index=pd.Index(user_names, name='users'))
toy_df

Unnamed: 0_level_0,MIB,ST,AV,BH,SS,LM
users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Sara,5,3,,2.0,2,2
Jesper,4,3,4.0,,3,3
Therese,5,2,5.0,2.0,1,1
Helle,3,5,3.0,,1,1
Pietro,3,3,3.0,2.0,4,5
Ekaterina,2,3,2.0,3.0,5,5


In [3]:
# Keep the row (user) and column (movie) labels so they can be re-attached
# to plain arrays later on.
users, columns = toy_df.index, toy_df.columns

#### Handle missing Values by imputing:
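Missing ratings can be imputed either with each item's mean rating or with zero. The cells below compute the item means but fill with zero, which replicates the output shown in figure 11.10 on page 297; uncomment the `fillna(items_mean)` line to use the item means instead.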

In [4]:
# Per-movie (column-wise) mean rating; missing ratings are left out of each mean.
items_mean = toy_df.mean(axis=0)
items_mean

MIB    3.666667
ST     3.166667
AV     3.400000
BH     2.250000
SS     2.666667
LM     2.833333
dtype: float64

In [5]:
# Alternative imputation: replace each missing rating with that movie's mean.
# toy_df = toy_df.fillna(items_mean)

# Imputation used here: treat every missing rating as 0.
toy_df = toy_df.fillna(value=0)
toy_df

Unnamed: 0_level_0,MIB,ST,AV,BH,SS,LM
users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Sara,5,3,0.0,2.0,2,2
Jesper,4,3,4.0,0.0,3,3
Therese,5,2,5.0,2.0,1,1
Helle,3,5,3.0,0.0,1,1
Pietro,3,3,3.0,2.0,4,5
Ekaterina,2,3,2.0,3.0,5,5


---
### Decompose ratings matrix:

In [6]:
# Factorise the imputed ratings matrix with a full SVD:
#   U     -> one row per user
#   sigma -> the singular values
#   Vt    -> one column per movie
U, sigma, Vt = linalg.svd(toy_df, full_matrices=True)

In [7]:
# User-feature matrix U: each row belongs to a user (in toy_df row order).
pd.DataFrame(data=U)

Unnamed: 0,0,1,2,3,4,5
0,-0.34319,0.052713,0.912294,0.108027,0.188378,-0.002536
1,-0.430255,0.15729,-0.309303,-0.122381,0.744991,0.352822
2,-0.387373,0.556812,-0.185837,0.632466,-0.323971,0.019917
3,-0.332639,0.418957,0.016887,-0.756437,-0.371977,-0.054553
4,-0.47548,-0.33867,-0.184494,0.026681,0.096152,-0.784363
5,-0.458599,-0.610097,-0.056514,0.020749,-0.396147,0.506865


In [8]:
# Singular values (the feature weights). They come back as a 1-D vector,
# not a matrix, so this renders as a single column.
pd.DataFrame(data=sigma)

Unnamed: 0,0
0,17.269661
1,5.844682
2,3.562881
3,3.132384
4,1.668337
5,0.556047


In [9]:
# Item-feature matrix, transposed (Vt): each column belongs to a movie
# (in toy_df column order).
pd.DataFrame(data=Vt)

Unnamed: 0,0,1,2,3,4,5
0,-0.504665,-0.437792,-0.405303,-0.219337,-0.399086,-0.426618
1,0.461522,0.169747,0.416428,-0.220471,-0.487983,-0.545928
2,0.499378,0.224176,-0.780897,0.256642,-0.082186,-0.133968
3,0.34005,-0.771945,0.167615,0.509706,-0.020621,-0.012103
4,0.408929,-0.364258,-0.155638,-0.759632,0.191602,0.249235
5,-0.008679,-0.026201,0.014124,-0.024042,0.747471,-0.663135


---
### Reduce the matrix:
See page 296 of the text for the rationale involving the desired dimensions.

In [10]:
def rank_k(k, u=None, s=None, vt=None):
    """Truncate an SVD to its first ``k`` components.

    Parameters
    ----------
    k : int
        Number of leading singular values/vectors to keep
        (``0 <= k <= len(s)``).
    u, s, vt : array-like, optional
        The SVD factors to truncate. Any factor left as ``None`` falls back
        to the notebook-level ``U``, ``sigma`` and ``Vt`` respectively, so
        ``rank_k(k)`` keeps working exactly as before.

    Returns
    -------
    U_reduced : numpy.matrix
        First ``k`` columns of ``u``.
    Vt_reduced : numpy.matrix
        First ``k`` rows of ``vt``.
    sigma_reduced : numpy.ndarray
        ``k x k`` diagonal array holding the first ``k`` singular values.

    Raises
    ------
    ValueError
        If ``k`` is negative or larger than the number of singular values.
    """
    # Fall back to the notebook-level decomposition when no factors are given.
    u = U if u is None else u
    s = sigma if s is None else s
    vt = Vt if vt is None else vt

    if not 0 <= k <= len(s):
        raise ValueError(
            "k must be between 0 and %d, got %r" % (len(s), k)
        )

    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function under
    # its surviving name. Matrices are kept so that callers can still multiply
    # the returned factors with `*`.
    U_reduced = np.asmatrix(u[:, :k])
    Vt_reduced = np.asmatrix(vt[:k, :])
    sigma_reduced = np.diag(s[:k])

    return U_reduced, Vt_reduced, sigma_reduced

In [11]:
# Keep only the 4 leading components of the decomposition.
U_reduced, Vt_reduced, sigma_reduced = rank_k(k=4)

#### Reconstruct the matrix:

In [12]:
# Multiply the truncated factors back together, round to two decimals for
# readability, and re-attach the user/movie labels.
reconstructed = U_reduced @ sigma_reduced @ Vt_reduced
toy_df_reduced = pd.DataFrame(np.round(reconstructed, 2), index=users, columns=columns)
toy_df_reduced

Unnamed: 0_level_0,MIB,ST,AV,BH,SS,LM
users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Sara,4.87,3.11,0.05,2.24,1.94,1.92
Jesper,3.49,3.46,4.19,0.95,2.62,2.82
Therese,5.22,1.8,4.92,1.59,1.1,1.14
Helle,3.25,4.77,2.9,-0.47,1.14,1.13
Pietro,2.93,3.05,3.03,2.11,4.3,4.67
Ekaterina,2.27,2.77,1.89,2.5,4.92,5.35


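**Note:** compare the matrix above with the original ratings (i.e. _toy_df_reduced_ vs _toy_df_ as first created, before the missing values were imputed). In the original, user 'Sara' does not have a rating for "AV", while in _toy_df_reduced_ there's a value of 0.05. This value represents the rating 'prediction' for the movie 'AV' for user Sara. Because the missing rating was filled with 0 before decomposing, the reconstructed value stays close to 0.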