# SVD - exemplo didático

In [2]:
pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.3 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633987 sha256=81cd47f1f1e0c886f2d579b8f955fa81a4835704964ad384dce91db3c5497cdf
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [4]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error

from surprise import Dataset
from surprise.reader import Reader
from surprise import KNNBasic
from surprise import SVD
from surprise.prediction_algorithms.knns import KNNBaseline
from surprise.prediction_algorithms.slope_one import SlopeOne
from surprise.prediction_algorithms.co_clustering import CoClustering
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [5]:
utl_matrix = pd.read_csv('https://raw.githubusercontent.com/abnr/ml-data/main/exemplo_rec.csv', index_col=0)

In [6]:
utl_matrix

Unnamed: 0,Matrix,Alien,Serenity,Casablanca,Amelie
user_0,1,1,1,0,0
user_1,3,3,3,0,0
user_2,4,4,4,0,0
user_3,5,5,5,0,0
user_4,0,2,0,4,4
user_5,0,0,0,5,5
user_6,0,1,0,2,2


In [7]:
def perform_SVD(input_matrix, n_factors):
    m = input_matrix.shape[0]
    n = input_matrix.shape[1]

    print(input_matrix)
    U, S, V = np.linalg.svd(input_matrix)

    S = np.diag(S)

    S = np.vstack([S, np.zeros(shape=(m-n, n))])

    U_ = pd.DataFrame(U, index=input_matrix.index, columns=[f'LF_{i}' for i in range(m)])
    S_ = pd.DataFrame(S, columns=[f'S_{i}' for i in range(n)])
    V_ = pd.DataFrame(V, index=[f'LF_{i}' for i in range(n)], columns=input_matrix.columns)

    U_cut = U_.iloc[:, :n_factors]
    S_cut = S_.iloc[:n_factors, :n_factors]
    V_cut = V_.iloc[:n_factors, :]
    
    return U_cut, S_cut, V_cut

In [8]:
U, S, V = perform_SVD(utl_matrix, n_factors=2)

        Matrix  Alien  Serenity  Casablanca  Amelie
user_0       1      1         1           0       0
user_1       3      3         3           0       0
user_2       4      4         4           0       0
user_3       5      5         5           0       0
user_4       0      2         0           4       4
user_5       0      0         0           5       5
user_6       0      1         0           2       2


In [9]:
def recreate_utility_matrix(U, S, V, rows, columns):
    utl = np.dot(np.dot(U, S), V)
    utl = pd.DataFrame(
        utl,
        index=rows,
        columns=columns
    )
    
    return utl

In [10]:
utl_matrix_pred = recreate_utility_matrix(U, S, V, utl_matrix.index, utl_matrix.columns)

In [11]:
utl_matrix_pred

Unnamed: 0,Matrix,Alien,Serenity,Casablanca,Amelie
user_0,0.994042,1.011704,0.994042,-0.001327,-0.001327
user_1,2.982126,3.035113,2.982126,-0.003982,-0.003982
user_2,3.976168,4.046818,3.976168,-0.005309,-0.005309
user_3,4.97021,5.058522,4.97021,-0.006636,-0.006636
user_4,0.360313,1.292165,0.360313,4.080263,4.080263
user_5,-0.373851,0.734429,-0.373851,4.916721,4.916721
user_6,0.180157,0.646082,0.180157,2.040132,2.040132


In [12]:
mean_squared_error(utl_matrix, utl_matrix_pred)

0.051729455444565274

## Novo usuário

In [13]:
utl_matrix2 = utl_matrix.copy()
utl_matrix2.loc['user_7', ['Matrix', 'Serenity']] = [4, 4]

In [14]:
utl_matrix2.fillna(utl_matrix2.mean(axis=0), inplace=True)

In [15]:
utl_matrix2

Unnamed: 0,Matrix,Alien,Serenity,Casablanca,Amelie
user_0,1.0,1.0,1.0,0.0,0.0
user_1,3.0,3.0,3.0,0.0,0.0
user_2,4.0,4.0,4.0,0.0,0.0
user_3,5.0,5.0,5.0,0.0,0.0
user_4,0.0,2.0,0.0,4.0,4.0
user_5,0.0,0.0,0.0,5.0,5.0
user_6,0.0,1.0,0.0,2.0,2.0
user_7,4.0,2.285714,4.0,1.571429,1.571429


In [16]:
U2, S2, V2 = perform_SVD(utl_matrix2, 2)

        Matrix     Alien  Serenity  Casablanca    Amelie
user_0     1.0  1.000000       1.0    0.000000  0.000000
user_1     3.0  3.000000       3.0    0.000000  0.000000
user_2     4.0  4.000000       4.0    0.000000  0.000000
user_3     5.0  5.000000       5.0    0.000000  0.000000
user_4     0.0  2.000000       0.0    4.000000  4.000000
user_5     0.0  0.000000       0.0    5.000000  5.000000
user_6     0.0  1.000000       0.0    2.000000  2.000000
user_7     4.0  2.285714       4.0    1.571429  1.571429


In [17]:
utl_matrix_pred2 = recreate_utility_matrix(U2, S2, V2, utl_matrix2.index, utl_matrix2.columns)

In [18]:
X = pd.DataFrame(np.dot(np.dot(U2, S2), V2))

In [19]:
mean_squared_error(utl_matrix2, utl_matrix_pred2)

0.10399173351431881

In [20]:
utl_matrix2

Unnamed: 0,Matrix,Alien,Serenity,Casablanca,Amelie
user_0,1.0,1.0,1.0,0.0,0.0
user_1,3.0,3.0,3.0,0.0,0.0
user_2,4.0,4.0,4.0,0.0,0.0
user_3,5.0,5.0,5.0,0.0,0.0
user_4,0.0,2.0,0.0,4.0,4.0
user_5,0.0,0.0,0.0,5.0,5.0
user_6,0.0,1.0,0.0,2.0,2.0
user_7,4.0,2.285714,4.0,1.571429,1.571429


In [21]:
utl_matrix_pred2

Unnamed: 0,Matrix,Alien,Serenity,Casablanca,Amelie
user_0,1.028772,0.936739,1.028772,0.005487,0.005487
user_1,3.086315,2.810216,3.086315,0.016462,0.016462
user_2,4.115087,3.746955,4.115087,0.021949,0.021949
user_3,5.143858,4.683694,5.143858,0.027437,0.027437
user_4,0.415757,1.085862,0.415757,4.079293,4.079293
user_5,-0.276119,0.60711,-0.276119,4.947339,4.947339
user_6,0.207879,0.542931,0.207879,2.039647,2.039647
user_7,3.482605,3.423327,3.482605,1.472751,1.472751


# Exemplo Real - Hora de botar a mão na massa!

In [22]:
# Carrega dados
int_matrix = pd.read_csv('https://raw.githubusercontent.com/abnr/ml-data/main/u.data', sep ='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
item_info = pd.read_csv('https://raw.githubusercontent.com/abnr/ml-data/main/Movie_Id_Titles')

In [24]:
int_matrix

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742
...,...,...,...,...
99998,880,476,3,880175444
99999,716,204,5,879795543
100000,276,1090,1,874795795
100001,13,225,2,882399156


In [25]:
item_info

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [23]:
# Checa número de usuários e filmes
int_matrix['user_id'].nunique(), int_matrix['item_id'].nunique()

(944, 1682)

In [26]:
# Checa rating máximo e mínimo
int_matrix['rating'].min(), int_matrix['rating'].max()

(1, 5)

In [27]:
# Olhadinha na matriz de interações
int_matrix.merge(item_info).pivot_table(
    index=['user_id'],
    columns=['title'],
    values=['rating']
)

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,3.0,,3.0,...,,,,,,,,,,


In [28]:
int_matrix.drop('timestamp',axis=1,inplace=True)

In [29]:
reader = Reader()
# Transforma no formato compativel com surprise
data = Dataset.load_from_df(int_matrix, reader)

In [30]:
# Inicia algoritmo de SVD com n variáveis latentes
rec = SVD(n_factors=100)

In [31]:
# Define dataset completo para fit do algoritmo
train_full = data.build_full_trainset()
# Separa em treino e teste
train, test = train_test_split(data, train_size=0.2)

In [32]:
# Ajusta o modelo no treino
rec.fit(train)
# Realiza predições no teste
predictions = rec.test(test)
predictions = pd.DataFrame(predictions)

In [33]:
predictions

Unnamed: 0,uid,iid,r_ui,est,details
0,871,302,5.0,3.889632,{'was_impossible': False}
1,254,418,3.0,3.670004,{'was_impossible': False}
2,690,747,3.0,3.179520,{'was_impossible': False}
3,232,181,4.0,3.920274,{'was_impossible': False}
4,393,934,3.0,3.248092,{'was_impossible': False}
...,...,...,...,...,...
19996,807,227,4.0,3.756268,{'was_impossible': False}
19997,880,179,4.0,3.769159,{'was_impossible': False}
19998,19,313,2.0,4.333047,{'was_impossible': False}
19999,62,473,4.0,2.890072,{'was_impossible': False}


In [34]:
# Avalia o modelo com MSE
mean_squared_error(predictions['r_ui'], predictions['est'])

0.9725905102710213

**Grid Search**

In [35]:
# Define espaço de parâmetros
param_grid = {'n_factors' : [50, 100, 150]}

In [36]:
# Inicializa GridSearch com métrica MSE
rec_gs = GridSearchCV(SVD, param_grid, ['mse'], cv=5)

In [37]:
# Procura os melhores parâmetros
rec_gs.fit(data)

In [38]:
# Melhor score
print(rec_gs.best_score)
# Melhor n_factors
print(rec_gs.best_params)

{'mse': 0.8709581619076483}
{'mse': {'n_factors': 50}}


In [39]:
# Seleciona melhor modelo do grid search
best_rec = rec_gs.best_estimator['mse']

In [40]:
# Ajusta o modelo no dataset de treino completo
best_rec.fit(train_full)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f01c8dff190>

In [41]:
# Gera DataFrame de "produção" com todos os itens que cada usuário não viu
prod = train_full.build_anti_testset()

In [43]:
# Estima o rating de cada usuário nos itens não vistos
pred_prod = best_rec.test(prod)
pred_prod = pd.DataFrame(pred_prod)

In [44]:
def generate_predictions(recsys, prod_data, user_id, item_info, n_recommendations):
    prod_predictions = pd.DataFrame(recsys.test(prod_data))
    user_predictions = prod_predictions.loc[
        prod_predictions['uid'] == user_id
    ]
    
    best_predictions = user_predictions.sort_values(by='est', ascending=False)
    best_predictions = best_predictions.rename(columns={'iid':'item_id'})
    
    return best_predictions.iloc[:n_recommendations].merge(item_info)['title'].values.tolist()

**Você pode gostar de...**

In [48]:
generate_predictions(best_rec, prod, 0, item_info, 5)

['Shawshank Redemption, The (1994)',
 'Wrong Trousers, The (1993)',
 'Wallace & Gromit: The Best of Aardman Animation (1996)',
 'Silence of the Lambs, The (1991)',
 "Schindler's List (1993)"]

# KNN

###  KNN com similaridade por usuários e correlação de Pearson

In [49]:
rec_knn = KNNBasic(
    k=10,
    min_k=1,
    sim_options = {
        'name' : 'pearson',
        'user_based':True
    }
)

In [50]:
rec_knn.fit(train)
pred_knn = pd.DataFrame(rec_knn.test(test))

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [51]:
mean_squared_error(pred_knn['r_ui'], pred_knn['est'])

1.305666230778851

In [52]:
rec_knn.fit(train_full)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f01bde14d10>

**Usuários como você também viram...**

In [53]:
generate_predictions(rec_knn, prod, 0, item_info, 5)

['Bitter Moon (1992)',
 'Double Happiness (1994)',
 'Anne Frank Remembered (1995)',
 'Grosse Fatigue (1994)',
 'Aparajito (1956)']

### KNN com similaridade por item e similaridade de cosenos

In [54]:
rec_knn = KNNBasic(
    k=10,
    min_k=1,
    sim_options = {
        'name' : 'cosine',
        'user_based':False
    }
)

In [55]:
rec_knn.fit(train)
pred_knn = pd.DataFrame(rec_knn.test(test))

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [56]:
mean_squared_error(pred_knn['r_ui'], pred_knn['est'])

1.2578150890443238

In [57]:
rec_knn.fit(train_full)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f01c8e62890>

**Com base nos filmes que você viu...**

In [58]:
generate_predictions(rec_knn, prod, 0, item_info, 5)

['Head Above Water (1996)',
 'Amityville: Dollhouse (1996)',
 'Woman in Question, The (1950)',
 'Underworld (1997)',
 'Sliding Doors (1998)']