In [33]:
# Data from the imbalanced-learning chapter ("Not even 1% win the lottery"),
# fetched from OpenML by dataset id.
import pandas as pd
from sklearn.datasets import fetch_openml

data = fetch_openml(data_id=1220)

# Keep only the two identifier columns; together with the rating column added
# below they form the (user, item, rating) triples used by the rest of the
# notebook.
df = pd.DataFrame(
    data['data'],
    columns=data['feature_names'],
)[['user_id', 'ad_id']].astype(int)

# The dataset target becomes the integer rating column.
df['user_rating'] = pd.Series(data['target']).astype(int)

In [34]:
# Mean of the integer rating column (the ratings shown in this notebook are 0/1).
df['user_rating'].mean()

0.16841894462801643

In [35]:
# Peek at the first rows: one (user_id, ad_id, user_rating) triple per row.
df.head(4)

Unnamed: 0,user_id,ad_id,user_rating
0,0,8343295,0
1,562934,20017077,1
2,11621116,21348354,0
3,8778348,20366086,0


In [46]:
# Toy user-item matrix from the first 10 interactions.
# groupby(...).max() collapses repeated (user_id, ad_id) pairs so pivot() does
# not fail on duplicate entries; pairs that never occur are filled with 0.
# pivot() arguments are passed by keyword: pandas 2.x no longer accepts them
# positionally.
df.head(10).groupby(
    ['user_id', 'ad_id']
).max().reset_index().pivot(
    index='user_id', columns='ad_id', values='user_rating'
).fillna(0).astype(int)

ad_id,6803526,8343295,9027213,20017077,20366086,20886690,21186478,21348354,21367376,21811752
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,0,0,0,0,0,0,0
562934,0,0,0,1,0,0,0,0,0,0
579253,0,0,0,0,0,0,0,0,0,0
2886008,0,0,0,0,0,0,1,0,0,0
5277279,0,0,0,0,0,0,0,0,0,0
7589739,0,0,0,0,0,0,0,0,0,0
8778348,0,0,0,0,0,0,0,0,0,0
11621116,0,0,0,0,0,0,0,0,0,0
11808635,0,0,1,0,0,0,0,0,0,0
12118311,0,0,0,0,0,0,0,0,0,0


In [93]:
# Full user-item matrix: one row per user_id, one column per ad_id.
# Repeated (user_id, ad_id) pairs are collapsed with max(); unseen pairs are
# filled with 0. pivot() arguments are passed by keyword because pandas 2.x no
# longer accepts them positionally.
dfdf = df.groupby(
    ['user_id', 'ad_id']
).max().reset_index().pivot(
    index='user_id', columns='ad_id', values='user_rating'
).fillna(0).astype(int)

# Share of zero cells in the matrix. Zeros cover both pairs that never occur
# and pairs whose recorded rating is 0.
(
    dfdf == 0
).sum().sum() / (dfdf.shape[0] * dfdf.shape[1])

0.9999885602175846

# Loading Data

In [67]:
from surprise import Dataset, Reader

# The rating column is declared to live on a 0..1 scale; Surprise reads the
# frame's three columns as (user, item, rating).
reader = Reader(rating_scale=(0, 1))
dataset = Dataset.load_from_df(df=df, reader=reader)

In [71]:
# 20% random sample of the rows (fixed seed) wrapped as a second, smaller Surprise dataset.
dataset_subset = Dataset.load_from_df(df.sample(frac=0.2, random_state=0), reader)

In [72]:
# Row counts of the full dataset and of the sampled subset.
dataset.df.shape[0], dataset_subset.df.shape[0]

(39948, 7990)

# Train/Test Split

In [38]:
from surprise.model_selection import train_test_split

# Hold out 25% of the ratings for testing. The split is seeded so that every
# cell that later uses `trainset` / `testset` (including the model that is
# pickled and reloaded) sees the same partition after a kernel restart.
trainset, testset = train_test_split(dataset, test_size=0.25, random_state=0)

# K-Fold Cross Validation 

In [53]:
from surprise.model_selection import cross_validate

def predict_evaluate(recsys, dataset, name='Algorithm'):
    """Cross-validate a recommender and print its average test errors.

    Runs 4-fold cross-validation on all available cores and prints one
    summary line with the mean test MAE and mean test RMSE, tagged with
    ``name``. Nothing is returned.

    Args:
        recsys: Algorithm instance handed to ``cross_validate``.
        dataset: Dataset handed to ``cross_validate``.
        name: Label shown in square brackets at the end of the printed line.
    """
    cv_results = cross_validate(
        recsys,
        dataset,
        measures=['RMSE', 'MAE'],
        cv=4,
        n_jobs=-1,
        verbose=False,
    )
    avg_mae = cv_results['test_mae'].mean()
    avg_rmse = cv_results['test_rmse'].mean()
    summary = 'Testset Avg. MAE: {:.2f} & Avg. RMSE: {:.2f} [{}]'.format(
        avg_mae, avg_rmse, name
    )
    print(summary)

# Random Recommender

In [48]:
import numpy as np
from surprise import AlgoBase

class RandomRating(AlgoBase):
    """Recommender that ignores its inputs and predicts a random 0/1 rating.

    Each estimate is a single Bernoulli draw that returns 1 with
    probability ``p`` and 0 otherwise.
    """

    def __init__(self, p=0.5):
        # Probability of predicting a rating of 1.
        self.p = p
        AlgoBase.__init__(self)

    def estimate(self, u, i):
        """Return a random 0/1 rating; the user ``u`` and item ``i`` are not used."""
        return np.random.binomial(n=1, p=self.p, size=1)[0]

In [49]:
# p=0.168 is close to the mean of df['user_rating'] displayed earlier (0.1684...).
recsys = RandomRating(p=0.168)
predict_evaluate(recsys, dataset, 'RandomRating')

Testset Avg. MAE: 0.28 & Avg. RMSE: 0.53 [RandomRating]


# KNN

In [54]:
from surprise.prediction_algorithms.random_pred import NormalPredictor

# Surprise's built-in random predictor (from its random_pred module), default settings.
recsys = NormalPredictor()
predict_evaluate(recsys, dataset, 'NormalPredictor')

Testset Avg. MAE: 0.33 & Avg. RMSE: 0.47 [NormalPredictor]


In [55]:
from surprise.prediction_algorithms.knns import KNNBasic

# KNNBasic with library defaults, cross-validated on the full dataset.
recsys = KNNBasic(verbose=False)
predict_evaluate(recsys, dataset, 'KNNBasic')

Testset Avg. MAE: 0.28 & Avg. RMSE: 0.38 [KNNBasic]


In [73]:
from surprise.prediction_algorithms.knns import KNNBasic

# Same default KNNBasic, this time cross-validated on the sampled subset.
recsys = KNNBasic(verbose=False)
predict_evaluate(recsys, dataset_subset, 'KNNBasic')

Testset Avg. MAE: 0.28 & Avg. RMSE: 0.38 [KNNBasic]


In [30]:
# from surprise.prediction_algorithms.knns import KNNWithMeans

# recsys = KNNWithMeans(verbose=False)
# predict_evaluate(recsys, dataset, 'KNNWithMeans')

# Gives same results
# Testset Avg. MAE: 0.28 & Avg. RMSE: 0.38 [KNNBasic]

Testset Avg. MAE: 0.28 & Avg. RMSE: 0.38 [KNNBasic]


In [75]:
from surprise.prediction_algorithms.knns import KNNBasic

# Switch to cosine similarity with user_based=False (similarities between
# items rather than users) and use k=20 neighbours.
sim_options = {
    'name': 'cosine', 'user_based': False
}
recsys = KNNBasic(k=20, sim_options=sim_options, verbose=False)
predict_evaluate(recsys, dataset, 'KNNBasic')

Testset Avg. MAE: 0.29 & Avg. RMSE: 0.39 [KNNBasic]


In [83]:
# Rebinds dataset_subset to a 25% sample (an earlier cell used frac=0.2); the grid search below uses this one.
dataset_subset = Dataset.load_from_df(df.sample(frac=0.25, random_state=0), reader)

In [84]:
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms.knns import KNNBasic

# Search over the similarity measure and the neighbourhood size k.
param_grid = {
    'sim_options': {'name': ['cosine', 'pearson']},
    'k': [5, 10, 20, 40],
    'verbose': [True],
}

gscv = GridSearchCV(
    KNNBasic,
    param_grid,
    measures=['rmse', 'mae'],
    cv=4,
    n_jobs=-1,
)
gscv.fit(dataset_subset)

# Report the best score per metric, then the parameters that minimise RMSE.
for label, metric in (('Best MAE:', 'mae'), ('Best RMSE:', 'rmse')):
    print(label, gscv.best_score[metric].round(2))
print('Best Params', gscv.best_params['rmse'])

Best MAE: 0.28
Best RMSE: 0.38
Best Params {'sim_options': {'name': 'pearson', 'user_based': True}, 'k': 20, 'verbose': True}


# Baseline

In [31]:
from surprise.prediction_algorithms.baseline_only import BaselineOnly

# BaselineOnly with library defaults, cross-validated on the full dataset.
recsys = BaselineOnly(verbose=False)
predict_evaluate(recsys, dataset, 'BaselineOnly')

Testset Avg. MAE: 0.27 & Avg. RMSE: 0.37 [BaselineOnly]


# SVD

In [106]:
from surprise.prediction_algorithms.matrix_factorization import SVD

# SVD with library defaults, cross-validated on the full dataset.
recsys = SVD()
predict_evaluate(recsys, dataset, 'SVD')

Testset Avg. MAE: 0.26 & Avg. RMSE: 0.37 [SVD]


In [107]:
from surprise.prediction_algorithms.matrix_factorization import SVD

# Same as the default run, but with biased=False.
recsys = SVD(biased=False)
predict_evaluate(recsys, dataset, 'SVD')

Testset Avg. MAE: 0.27 & Avg. RMSE: 0.38 [SVD]


In [119]:
from surprise.prediction_algorithms.matrix_factorization import SVD

# biased=False with n_factors=10.
recsys = SVD(n_factors=10, biased=False)
predict_evaluate(recsys, dataset, 'SVD')

Testset Avg. MAE: 0.27 & Avg. RMSE: 0.38 [SVD]


In [125]:
# Fit a 4-factor model on the hold-out training split and inspect its learned
# `pu` and `qi` arrays; the displayed rows each have 4 entries, one per factor.
recsys = SVD(n_factors=4, biased=True)
recsys.fit(trainset)

recsys.pu, recsys.qi


(array([[-0.11901254,  0.00442723, -0.15424857,  0.19753723],
        [-0.04175569, -0.00895403,  0.0276043 , -0.0403843 ],
        [ 0.00616301,  0.02387053,  0.01444298, -0.00507596],
        ...,
        [ 0.06476914, -0.02040036, -0.05734052,  0.07204454],
        [ 0.02228705, -0.01147516,  0.05431841, -0.03149536],
        [ 0.08799797,  0.04526675, -0.05421905, -0.0497379 ]]),
 array([[-0.15374548,  0.02884507, -0.13529331,  0.14877113],
        [ 0.0809587 , -0.10117817, -0.02603168, -0.01538782],
        [ 0.18730871, -0.1118241 , -0.112454  ,  0.09194745],
        ...,
        [ 0.02685527,  0.1384208 , -0.04758385, -0.01417576],
        [-0.07149433,  0.09685209,  0.02435593,  0.13502017],
        [-0.06114762, -0.04904397, -0.01307072, -0.13672229]]))

In [109]:
from surprise.prediction_algorithms.matrix_factorization import SVD

# biased=False with only n_factors=2.
recsys = SVD(n_factors=2, biased=False)
predict_evaluate(recsys, dataset, 'SVD')

Testset Avg. MAE: 0.27 & Avg. RMSE: 0.38 [SVD]


In [116]:
from surprise.prediction_algorithms.matrix_factorization import SVD

# 5 factors, biased, trained for n_epochs=10.
recsys = SVD(n_factors=5, n_epochs=10, biased=True)
predict_evaluate(recsys, dataset, 'SVD')

Testset Avg. MAE: 0.26 & Avg. RMSE: 0.37 [SVD]


In [117]:
from surprise.prediction_algorithms.matrix_factorization import SVD

# Same model trained for n_epochs=200; the recorded errors are slightly worse
# than the 10-epoch run (0.27/0.38 vs 0.26/0.37).
recsys = SVD(n_factors=5, n_epochs=200, biased=True)
predict_evaluate(recsys, dataset, 'SVD')

Testset Avg. MAE: 0.27 & Avg. RMSE: 0.38 [SVD]


# Grid Search Cross Validation

In [163]:
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms.baseline_only import BaselineOnly

# Grid-search the custom RandomRating recommender over its probability p.
# (The BaselineOnly import above is not used in this cell.)
param_grid = {
    'p': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25]
}

gscv = GridSearchCV(RandomRating, param_grid, measures=['rmse', 'mae'], cv=4, n_jobs=-1)
gscv.fit(dataset)

# Best score per metric; the printed parameters are the ones that minimise RMSE.
print('Best MAE:', gscv.best_score['mae'].round(2))
print('Best RMSE:', gscv.best_score['rmse'].round(2))
print('Best Params', gscv.best_params['rmse'])

Best MAE: 0.18
Best RMSE: 0.42
Best Params {'p': 0.01}


In [164]:
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms.baseline_only import BaselineOnly

# Grid-search BaselineOnly's 'als' baseline options: number of epochs and the
# reg_u / reg_i values.
param_grid = {
    'bsl_options': {
        'method':['als'],
        'n_epochs': [10, 50, 100],
        'reg_u': [5, 15, 25],
        'reg_i': [5, 10, 20],
        
    },
    'verbose': [False]
}

gscv = GridSearchCV(BaselineOnly, param_grid, measures=['rmse', 'mae'], cv=4, n_jobs=-1)
gscv.fit(dataset)

# Best score per metric; the printed parameters are the ones that minimise RMSE.
print('Best MAE:', gscv.best_score['mae'].round(2))
print('Best RMSE:', gscv.best_score['rmse'].round(2))
print('Best Params', gscv.best_params['rmse'])

Best MAE: 0.27
Best RMSE: 0.37
Best Params {'bsl_options': {'method': 'als', 'n_epochs': 50, 'reg_u': 25, 'reg_i': 10}, 'verbose': False}


In [85]:
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from surprise import accuracy

# Hold-out evaluation: fit on `trainset`, predict the ratings in `testset`.
recsys = BaselineOnly()
recsys.fit(trainset)
predictions = recsys.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.3688


0.3688410769535897

In [89]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

# Hold-out evaluation of an SVD model with 20 factors trained for 100 epochs.
recsys = SVD(
    n_factors=20,
    n_epochs=100,
    verbose=False,
)
recsys.fit(trainset)
predictions = recsys.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.3749


0.3748614956923788

In [51]:
from surprise.prediction_algorithms.knns import KNNBasic
from surprise import accuracy

# Hold-out evaluation of KNNBasic with library defaults (the log output shows
# the msd similarity being computed).
recsys = KNNBasic()
recsys.fit(trainset)
predictions = recsys.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.3792


0.37920859429757436

In [52]:
from surprise.prediction_algorithms.knns import KNNWithMeans
from surprise import accuracy

# Hold-out evaluation of KNNWithMeans with library defaults.
recsys = KNNWithMeans()
recsys.fit(trainset)
predictions = recsys.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.3785


0.3785244871388106

In [53]:
from surprise.prediction_algorithms.knns import KNNBaseline
from surprise import accuracy

# Hold-out evaluation of KNNBaseline with library defaults (the log output
# shows both a bias-estimation step and a similarity-matrix step).
recsys = KNNBaseline()
recsys.fit(trainset)
predictions = recsys.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.3727


0.3726847981061818

In [86]:
import joblib
from surprise.prediction_algorithms.baseline_only import BaselineOnly

# Fit a BaselineOnly model and persist it to 'recsys.pkl' in the working directory.
recsys = BaselineOnly()
recsys.fit(trainset)
joblib.dump(recsys, 'recsys.pkl') 

Estimating biases using als...


['recsys.pkl']

In [87]:
from surprise import accuracy
# Reload the persisted model and score it on `testset`. joblib.load unpickles
# the file, so only load files you created or trust.
# NOTE(review): the recorded RMSE (0.3629) differs from the earlier BaselineOnly
# hold-out run (0.3688); presumably trainset/testset were re-split in between — confirm.
recsys = joblib.load('recsys.pkl') 
predictions = recsys.test(testset)
accuracy.rmse(predictions)

RMSE: 0.3629


0.3628701778614601

In [1]:
# Toy implicit-feedback data: one (user, artist) pair per listening relation.
music_ratings = [
    ('U1', 'Metallica'),
    ('U1', 'Rammstein'),
    ('U2', 'Rammstein'),
    ('U3', 'Tiesto'),
    ('U3', 'Paul van Dyk'),
    ('U2', 'Metallica'),
    ('U4', 'Tiesto'),
    ('U4', 'Paul van Dyk'),
    ('U5', 'Metallica'),
    ('U5', 'Slipknot'),
    ('U6', 'Tiesto'),
    ('U6', 'Aly & Fila'),
    ('U3', 'Aly & Fila'),
]

In [94]:
# Every listed (user, artist) pair counts as a rating of 1.
df_music_ratings = pd.DataFrame(music_ratings, columns=['User', 'Artist'])
df_music_ratings['Rating'] = 1

# User x artist matrix; pairs that do not occur become 0. pivot() arguments are
# passed by keyword because pandas 2.x no longer accepts them positionally.
df_music_ratings_pivoted = df_music_ratings.pivot(
    index='User', columns='Artist', values='Rating'
).fillna(0)

# Render zero cells white-on-black and non-zero cells black-on-white.
df_music_ratings_pivoted.style.applymap(
    lambda cell: 'background-color: black; color: white' if cell == 0 else 'background-color: white; color: black'
)

Artist,Aly & Fila,Metallica,Paul van Dyk,Rammstein,Slipknot,Tiesto
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U1,0,1,0,1,0,0
U2,0,1,0,1,0,0
U3,1,0,1,0,0,1
U4,0,0,1,0,0,1
U5,0,1,0,0,1,0
U6,1,0,0,0,0,1


In [95]:
from sklearn.decomposition import TruncatedSVD

# Project each user's row of the rating matrix onto 2 components.
# `svd` is reused by later cells (they read svd.components_).
svd = TruncatedSVD(n_components=2)
svd.fit_transform(df_music_ratings_pivoted).round(2)

array([[ 0.  ,  1.37],
       [ 0.  ,  1.37],
       [ 1.71, -0.  ],
       [ 1.21,  0.  ],
       [ 0.  ,  1.  ],
       [ 1.21,  0.  ]])

In [97]:
# Same projection, labelled by user and rendered with in-cell bars.
# Note: this refits the shared `svd` object.
pd.DataFrame(
    svd.fit_transform(df_music_ratings_pivoted),
    index=df_music_ratings_pivoted.index,
    columns=['SV1', 'SV2'], 
).round(2).style.bar(
    subset=['SV1', 'SV2'],  align='mid', color='#AAA'
)

Unnamed: 0_level_0,SV1,SV2
User,Unnamed: 1_level_1,Unnamed: 2_level_1
U1,0.0,1.37
U2,0.0,1.37
U3,1.71,-0.0
U4,1.21,-0.0
U5,0.0,1.0
U6,1.21,-0.0


In [101]:
from sklearn.metrics.pairwise import cosine_similarity

user_ids = ['U1', 'U2', 'U3', 'U5']

# Pairwise cosine similarity between the selected users' raw rating rows.
pd.DataFrame(
    cosine_similarity(
        df_music_ratings_pivoted.loc[user_ids, :].values
    ),
    index=user_ids,
    columns=user_ids
).round(2).style.bar(
    subset=user_ids,  align='mid', color='#AAA'
)

Unnamed: 0,U1,U2,U3,U5
U1,1.0,1.0,0,0.5
U2,1.0,1.0,0,0.5
U3,0.0,0.0,1,0.0
U5,0.5,0.5,0,1.0


In [103]:
from sklearn.metrics.pairwise import cosine_similarity

user_ids = ['U1', 'U2', 'U3', 'U5']

# User coordinates in the 2-component SVD space (refits the shared `svd`).
df_user_svd = pd.DataFrame(
    svd.fit_transform(df_music_ratings_pivoted),
    index=df_music_ratings_pivoted.index,
    columns=['SV1', 'SV2'], 
)

# Cosine similarity in the reduced space. In the recorded output U5 becomes
# fully similar (1) to U1/U2, versus 0.5 on the raw rating rows.
pd.DataFrame(
    cosine_similarity(
        df_user_svd.loc[user_ids, :].values
    ),
    index=user_ids,
    columns=user_ids
).round(2).style.bar(
    subset=user_ids,  align='mid', color='#AAA'
)

Unnamed: 0,U1,U2,U3,U5
U1,1,1,0,1
U2,1,1,0,1
U3,0,0,1,0
U5,1,1,0,1


In [98]:
from sklearn.metrics.pairwise import cosine_similarity

user_ids = ['U3', 'U4', 'U5', 'U6']

# Pairwise cosine similarity between the selected users' raw rating rows.
pd.DataFrame(
    cosine_similarity(
        df_music_ratings_pivoted.loc[user_ids, :].values
    ),
    index=user_ids,
    columns=user_ids
).round(2).style.bar(
    subset=user_ids,  align='mid', color='#AAA'
)

Unnamed: 0,U3,U4,U5,U6
U3,1.0,0.82,0,0.82
U4,0.82,1.0,0,0.5
U5,0.0,0.0,1,0.0
U6,0.82,0.5,0,1.0


In [99]:
from sklearn.metrics.pairwise import cosine_similarity

user_ids = ['U3', 'U4', 'U5', 'U6']

# User coordinates in the 2-component SVD space (refits the shared `svd`).
df_user_svd = pd.DataFrame(
    svd.fit_transform(df_music_ratings_pivoted),
    index=df_music_ratings_pivoted.index,
    columns=['Component1', 'Component2'], 
)

# Cosine similarity in the reduced space. In the recorded output U3, U4 and U6
# become fully similar (1) to each other, versus 0.5-0.82 on the raw rows.
pd.DataFrame(
    cosine_similarity(
        df_user_svd.loc[user_ids, :].values
    ),
    index=user_ids,
    columns=user_ids
).round(2).style.bar(
    subset=user_ids,  align='mid', color='#AAA'
)

Unnamed: 0,U3,U4,U5,U6
U3,1,1,0,1
U4,1,1,0,1
U5,0,0,1,0
U6,1,1,0,1


In [9]:
# svd.components_ holds one row per component and one column per artist;
# transposing gives one row per artist with its weight on each component.
pd.DataFrame(
    svd.components_,
    index=['Component1', 'Component2'],
    columns=df_music_ratings_pivoted.columns,
).T.round(2).style.bar(
    subset=['Component1', 'Component2'],  align='mid', color='#AAA'
)

Unnamed: 0_level_0,Component1,Component2
Artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Aly & Fila,0.5,-0.0
Metallica,0.0,0.79
Paul van Dyk,0.5,-0.0
Rammstein,0.0,0.58
Slipknot,0.0,0.21
Tiesto,0.71,-0.0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

artist_ids = ['Aly & Fila', 'Metallica', 'Paul van Dyk', 'Rammstein', 'Slipknot', 'Tiesto']

# Artist-to-artist cosine similarity on the raw ratings: the matrix is
# transposed so that each artist is a row of per-user ratings.
pd.DataFrame(
    cosine_similarity(
        df_music_ratings_pivoted.T.loc[artist_ids, :].values
    ),
    index=artist_ids,
    columns=artist_ids
).round(2).style.bar(
    subset=artist_ids,  align='mid', color='#AAA'
)

Unnamed: 0,Aly & Fila,Metallica,Paul van Dyk,Rammstein,Slipknot,Tiesto
Aly & Fila,1.0,0.0,0.5,0.0,0.0,0.82
Metallica,0.0,1.0,0.0,0.82,0.58,0.0
Paul van Dyk,0.5,0.0,1.0,0.0,0.0,0.82
Rammstein,0.0,0.82,0.0,1.0,0.0,0.0
Slipknot,0.0,0.58,0.0,0.0,1.0,0.0
Tiesto,0.82,0.0,0.82,0.0,0.0,1.0


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

artist_ids = ['Aly & Fila', 'Metallica', 'Paul van Dyk', 'Rammstein', 'Slipknot', 'Tiesto']

# One row per artist, holding its weight on each of the two SVD components.
df_artist_svd = pd.DataFrame(
    svd.components_,
    index=['Component1', 'Component2'],
    columns=df_music_ratings_pivoted.columns,
).T

# Artist-to-artist cosine similarity in the reduced space; in the recorded
# output the artists fall into two groups with similarity 1 inside each group.
pd.DataFrame(
    cosine_similarity(
        df_artist_svd.loc[artist_ids, :].values
    ),
    index=artist_ids,
    columns=artist_ids
).round(2).style.bar(
    subset=artist_ids,  align='mid', color='#AAA'
)

Unnamed: 0,Aly & Fila,Metallica,Paul van Dyk,Rammstein,Slipknot,Tiesto
Aly & Fila,1,0,1,0,0,1
Metallica,0,1,0,1,1,0
Paul van Dyk,1,0,1,0,0,1
Rammstein,0,1,0,1,1,0
Slipknot,0,1,0,1,1,0
Tiesto,1,0,1,0,0,1


In [12]:
from sklearn.neighbors import NearestNeighbors

# Index the artists by their 2-D component weights: components_.T has one row
# per artist, in the column order of df_music_ratings_pivoted.
nn = NearestNeighbors(n_neighbors=2)
nn.fit(svd.components_.T)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                 radius=1.0)

In [13]:
# Query the point (1, 0) in component space; returns (distances, row indices) of the 3 nearest artists.
nn.kneighbors([[1, 0]], n_neighbors=3, return_distance=True)

(array([[0.29289322, 0.5       , 0.5       ]]), array([[5, 0, 2]]))

In [14]:
# Indices only; [0] selects the result for the single query point.
nn.kneighbors([[1, 0]], n_neighbors=2, return_distance=False)[0]

array([5, 0])

In [15]:
# Map the neighbour row indices back to artist names via the pivot's column order.
df_music_ratings_pivoted.columns.values[
    nn.kneighbors([[1, 0]], n_neighbors=2, return_distance=False)[0]
].tolist()

['Tiesto', 'Aly & Fila']

In [16]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

class ArtistRecommender:
    """Recommend artists that lie close to a point in a 2-component SVD space.

    ``train`` turns (user, artist) pairs into a user x artist matrix, factorises
    it with ``TruncatedSVD`` and indexes the per-artist component weights with
    ``NearestNeighbors``. ``predict`` then returns the artist names nearest to a
    query point given in that component space.
    """

    def __init__(self):
        pass

    def train(self, ratings):
        """Fit the recommender on an iterable of (user, artist) pairs.

        Sets ``self.artists`` (the artist labels, in matrix column order) and
        ``self.nn`` (the fitted nearest-neighbour index).
        """
        rating_matrix = self.ratings2matrix(ratings)
        self.artists = rating_matrix.columns
        components = self.calcuate_svd(rating_matrix)
        self.nn = self.build_knn(components)

    def ratings2matrix(self, ratings):
        """Return a user x artist DataFrame with 1 for listed pairs and 0 otherwise."""
        # Repeated (user, artist) pairs carry no extra information and would
        # make pivot() raise, so they are dropped first.
        df_ratings = pd.DataFrame(
            ratings, columns=['User', 'Artist']
        ).drop_duplicates().assign(Rating=1)
        # Keyword arguments: pandas 2.x no longer accepts positional ones here.
        return df_ratings.pivot(
            index='User', columns='Artist', values='Rating'
        ).fillna(0)

    # The misspelled method name is kept so existing callers keep working.
    def calcuate_svd(self, rating_matrix):
        """Fit a 2-component TruncatedSVD and return its ``components_`` array
        (one row per component, one column per artist)."""
        svd = TruncatedSVD(n_components=2)
        svd.fit(rating_matrix)
        return svd.components_

    def build_knn(self, components):
        """Fit a nearest-neighbour index with one row per artist.

        ``components`` is transposed so that each artist becomes one sample.
        """
        nn = NearestNeighbors(n_neighbors=2)
        # Use the argument, not a notebook-level `svd` object: the model must
        # be built from the data passed to train().
        nn.fit(components.T)
        return nn

    def predict(self, x, n_neighbors=2):
        """Return the names of the ``n_neighbors`` artists nearest to point ``x``.

        ``x`` is a single point in the 2-component space, e.g. ``[1, 0]``.
        """
        return self.artists.values[
            self.nn.kneighbors([x], n_neighbors=n_neighbors, return_distance=False)[0]
        ].tolist()

# Train on the toy data and ask for the 3 artists nearest to the point (0, 1)
# in component space.
ac = ArtistRecommender()
ac.train(music_ratings)
ac.predict([0, 1], n_neighbors=3)
        

['Metallica', 'Rammstein', 'Slipknot']

In [17]:
import joblib

# Train a fresh recommender and persist it to 'artist_recommender.pkl'.
ac = ArtistRecommender()
ac.train(music_ratings)
joblib.dump(ac, 'artist_recommender.pkl') 

['artist_recommender.pkl']

In [18]:
# Reload the persisted recommender (joblib.load unpickles the file — only load
# files you trust) and query the point (1, 0) for its 5 nearest artists.
ac = joblib.load('artist_recommender.pkl')
ac.predict([1, 0], n_neighbors=5)

['Tiesto', 'Aly & Fila', 'Paul van Dyk', 'Slipknot', 'Rammstein']