In [2]:
all_txns = []

with open('groceries.csv') as f:
    content = f.readlines()
    txns = [ x.strip() for x in content]
    for each_txn in txns:
        all_txns.append(each_txn.split(','))

In [3]:
all_txns[0:5]

[['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'],
 ['tropical fruit', 'yogurt', 'coffee'],
 ['whole milk'],
 ['pip fruit', 'yogurt', 'cream cheese', 'meat spreads'],
 ['other vegetables',
  'whole milk',
  'condensed milk',
  'long life bakery product']]

In [4]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [5]:
# Initialize OnehotTransactions
one_hot_encoding = TransactionEncoder()
# Transform the data into one-hot-encoding format
one_hot_txns = one_hot_encoding.fit(all_txns).transform(all_txns)
# Conver the matrix into the dataframe.
one_hot_txns_df = pd.DataFrame(one_hot_txns,columns=one_hot_encoding.columns_)

In [6]:
one_hot_txns_df.iloc[5:10, 10:20]

Unnamed: 0,berries,beverages,bottled beer,bottled water,brandy,brown bread,butter,butter milk,cake bar,candles
5,False,False,False,False,False,False,True,False,False,False
6,False,False,False,False,False,False,False,False,False,False
7,False,False,True,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False


In [7]:
one_hot_txns_df.shape

(9835, 169)

In [8]:
len(one_hot_txns_df.columns)

169

In [9]:
frequent_itemsets = apriori(one_hot_txns_df, min_support = 0.02, use_colnames = True)

In [10]:
frequent_itemsets.sample(10, random_state = 90)

Unnamed: 0,support,itemsets
60,0.020437,"(bottled beer, whole milk)"
52,0.033859,(sugar)
89,0.035892,"(tropical fruit, other vegetables)"
105,0.021047,"(root vegetables, tropical fruit)"
88,0.03274,"(soda, other vegetables)"
16,0.058058,(coffee)
111,0.024504,"(shopping bags, whole milk)"
36,0.079817,(newspapers)
119,0.056024,"(yogurt, whole milk)"
55,0.071683,(whipped/sour cream)


In [11]:
rules = association_rules(frequent_itemsets, metric = 'lift', min_threshold =1)

In [12]:
rules.sample(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
61,(whole milk),(other vegetables),0.255516,0.193493,0.074835,0.292877,1.513634,1.0,0.025394,1.140548,0.455803,0.2,0.123228,0.339817
115,"(root vegetables, whole milk)",(other vegetables),0.048907,0.193493,0.023183,0.474012,2.44977,1.0,0.013719,1.53332,0.62223,0.105751,0.347821,0.296912
1,(whole milk),(beef),0.255516,0.052466,0.021251,0.083168,1.58518,1.0,0.007845,1.033487,0.495856,0.074113,0.032402,0.244103
51,(other vegetables),(root vegetables),0.193493,0.108998,0.047382,0.244877,2.246605,1.0,0.026291,1.179941,0.688008,0.185731,0.1525,0.339789
105,(whole milk),(tropical fruit),0.255516,0.104931,0.042298,0.165539,1.577595,1.0,0.015486,1.072631,0.491782,0.13295,0.067713,0.28432


In [13]:
rules.sort_values('confidence', ascending = False)[0:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
120,"(yogurt, other vegetables)",(whole milk),0.043416,0.255516,0.022267,0.512881,2.007235,1.0,0.011174,1.52834,0.524577,0.080485,0.345695,0.300014
16,(butter),(whole milk),0.055414,0.255516,0.027555,0.497248,1.946053,1.0,0.013395,1.480817,0.514659,0.097237,0.324697,0.302543
24,(curd),(whole milk),0.053279,0.255516,0.026131,0.490458,1.919481,1.0,0.012517,1.461085,0.505984,0.092446,0.315577,0.296363
114,"(root vegetables, other vegetables)",(whole milk),0.047382,0.255516,0.023183,0.48927,1.914833,1.0,0.011076,1.457687,0.501524,0.082879,0.313982,0.289999
115,"(root vegetables, whole milk)",(other vegetables),0.048907,0.193493,0.023183,0.474012,2.44977,1.0,0.013719,1.53332,0.62223,0.105751,0.347821,0.296912
28,(domestic eggs),(whole milk),0.063447,0.255516,0.029995,0.472756,1.850203,1.0,0.013783,1.41203,0.490649,0.1038,0.2918,0.295073
109,(whipped/sour cream),(whole milk),0.071683,0.255516,0.032232,0.449645,1.759754,1.0,0.013916,1.352735,0.465077,0.109273,0.260757,0.287895
90,(root vegetables),(whole milk),0.108998,0.255516,0.048907,0.448694,1.756031,1.0,0.021056,1.350401,0.483202,0.154961,0.259479,0.320049
50,(root vegetables),(other vegetables),0.108998,0.193493,0.047382,0.434701,2.246605,1.0,0.026291,1.426693,0.622764,0.185731,0.299078,0.339789
32,(frozen vegetables),(whole milk),0.048094,0.255516,0.020437,0.424947,1.663094,1.0,0.008149,1.294636,0.418855,0.072172,0.227582,0.252466


### COLLOBORATIVE FILTERING

In [15]:
rating_df = pd.read_csv("ml-latest-small/ratings.csv")

In [16]:
rating_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [17]:
rating_df.drop('timestamp', axis = 1, inplace = True)

In [18]:
len(rating_df.userId.unique())

610

In [19]:
len(rating_df.movieId.unique())

9724

In [20]:
user_movies_df = rating_df.pivot( index='userId',
                                  columns='movieId',
                                  values = "rating" ).reset_index(drop=True)
user_movies_df.index = rating_df.userId.unique()

In [21]:
user_movies_df.iloc[0:5, 0:15]

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,4.0,,4.0,,,4.0,,,,,,,,,
2,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,
5,4.0,,,,,,,,,,,,,,


In [22]:
user_movies_df.fillna( 0, inplace = True )
user_movies_df.iloc[0:5, 0:10]

movieId,1,2,3,4,5,6,7,8,9,10
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


####  Calculating Cosine Similarity between users

In [24]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
user_sim = 1 - pairwise_distances( user_movies_df.values, metric="cosine" )
#Store the results in a dataframe
user_sim_df = pd.DataFrame( user_sim )
# set the index and column names to user ids (0 to 671)
user_sim_df.index = rating_df.userId.unique()
user_sim_df.columns = rating_df.userId.unique()

In [25]:
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,1,2,3,4,5
1,1.0,0.027283,0.05972,0.194395,0.12908
2,0.027283,1.0,0.0,0.003726,0.016614
3,0.05972,0.0,1.0,0.002251,0.00502
4,0.194395,0.003726,0.002251,1.0,0.128659
5,0.12908,0.016614,0.00502,0.128659,1.0


In [26]:
user_sim_df.shape

(610, 610)

In [27]:
np.fill_diagonal( user_sim, 0 )
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,1,2,3,4,5
1,0.0,0.027283,0.05972,0.194395,0.12908
2,0.027283,0.0,0.0,0.003726,0.016614
3,0.05972,0.0,0.0,0.002251,0.00502
4,0.194395,0.003726,0.002251,0.0,0.128659
5,0.12908,0.016614,0.00502,0.128659,0.0


#### Filtering Similar User

In [29]:
user_sim_df.idxmax(axis=1)[0:5]

1    266
2    366
3    313
4    391
5    470
dtype: int64

In [30]:
user_sim_df.iloc[1:2, 360:370]

Unnamed: 0,361,362,363,364,365,366,367,368,369,370
2,0.012776,0.115081,0.084261,0.0,0.149578,0.300074,0.031699,0.008637,0.016431,0.034816


#### Loading the Movies Dataset

In [32]:
movies_df = pd.read_csv("ml-latest-small/movies.csv")

In [33]:
movies_df[0:5]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [34]:
movies_df.drop('genres', axis =1, inplace = True)

In [35]:
def get_user_similar_movies( user1, user2 ):
    common_movies = rating_df[rating_df.userId == user1].merge(
                    rating_df[rating_df.userId == user2],
                    on = "movieId",
                    how = "inner" )
# join the above result set with movies details
    return common_movies.merge( movies_df, on = 'movieId' )

In [36]:
common_movies = get_user_similar_movies( 2, 366)

In [37]:
common_movies[(common_movies.rating_x >= 4.0) & ((common_movies.rating_y >= 4.0))]

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,2,3578,4.0,366,4.5,Gladiator (2000)
1,2,6874,4.0,366,4.0,Kill Bill: Vol. 1 (2003)
2,2,48516,4.0,366,4.5,"Departed, The (2006)"
3,2,58559,4.5,366,4.0,"Dark Knight, The (2008)"
4,2,68157,4.5,366,4.5,Inglourious Basterds (2009)
5,2,79132,4.0,366,4.0,Inception (2010)


In [38]:
common_movies = get_user_similar_movies( 2, 368 )
common_movies

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,2,3578,4.0,368,3.0,Gladiator (2000)


#### Item Based Similarity

In [40]:
## Calculating Cosine Similarity between movies
rating_mat = rating_df.pivot( index='movieId',columns='userId',values = "rating" ).reset_index(drop=True)
# fill all NaNs with 0
rating_mat.fillna( 0, inplace = True )
# Find the correlation between movies
movie_sim = 1 - pairwise_distances( rating_mat.values,metric="correlation" )
# Fill the diagonal with 0, as it repreresent the auto-correlation of movies
movie_sim_df = pd.DataFrame( movie_sim )

In [41]:
movie_sim_df.iloc[0:5, 0:5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.231327,0.173213,-0.028917,0.192474
1,0.231327,1.0,0.191945,0.071269,0.200526
2,0.173213,0.191945,1.0,0.067143,0.370171
3,-0.028917,0.071269,0.067143,1.0,0.16791
4,0.192474,0.200526,0.370171,0.16791,1.0


In [42]:
movie_sim_df.shape

(9724, 9724)

In [43]:
def get_similar_movies( movieid, topN = 5 ):
    movieidx = movies_df[movies_df.movieId == movieid].index[0]
    movies_df['similarity'] = movie_sim_df.iloc[movieidx]
    top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN]
    return top_n

In [44]:
movies_df[movies_df.movieId == 858]

Unnamed: 0,movieId,title
659,858,"Godfather, The (1972)"


In [45]:
get_similar_movies(858)

Unnamed: 0,movieId,title,similarity
659,858,"Godfather, The (1972)",1.0
921,1220,"Blues Brothers, The (1980)",0.76939
913,1212,"Third Man, The (1949)",0.560246
895,1192,Paris Is Burning (1990),0.496048
827,1088,Dirty Dancing (1987),0.442128


In [46]:
movies_df[movies_df.movieId == 231]

Unnamed: 0,movieId,title,similarity
197,231,Dumb & Dumber (Dumb and Dumber) (1994),0.095286


In [47]:
get_similar_movies(231)

Unnamed: 0,movieId,title,similarity
197,231,Dumb & Dumber (Dumb and Dumber) (1994),1.0
302,344,Ace Ventura: Pet Detective (1994),0.582137
138,165,Die Hard: With a Vengeance (1995),0.465081
291,333,Tommy Boy (1995),0.464892
126,153,Batman Forever (1995),0.432095


In [48]:
rating_df = pd.read_csv("ml-latest-small/ratings.csv")

In [49]:
from surprise import Dataset, Reader, KNNBasic, accuracy

In [50]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(rating_df[['userId','movieId','rating']], reader=reader)

In [96]:
item_based_cosine_sim = {'name': 'pearson','user_based': True}
knn = KNNBasic(k= 20, min_k = 5, sim_options = item_based_cosine_sim)

In [98]:
from surprise.model_selection import cross_validate
cv_results = cross_validate(knn, data, measures=['RMSE'], cv=5, verbose=False)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [102]:
np.mean(cv_results.get('test_rmse'))

0.971378726504134

In [104]:
from surprise.model_selection.search import GridSearchCV

In [106]:
param_grid = {'k': [10, 20],'sim_options': {'name': ['cosine', 'pearson'],'user_based': [True, False]}}
grid_cv = GridSearchCV(KNNBasic,
                       param_grid,
                       measures=['rmse'],
                       cv=5,
                       refit=True)
grid_cv.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing

In [108]:
# best RMSE score
print(grid_cv.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(grid_cv.best_params['rmse'])

0.9741910677571116
{'k': 20, 'sim_options': {'name': 'cosine', 'user_based': True}}


In [110]:
results_df = pd.DataFrame.from_dict(grid_cv.cv_results)
results_df[['param_k', 'param_sim_options', 'mean_test_rmse', 'rank_test_rmse']]

Unnamed: 0,param_k,param_sim_options,mean_test_rmse,rank_test_rmse
0,10,"{'name': 'cosine', 'user_based': True}",0.986416,4
1,10,"{'name': 'cosine', 'user_based': False}",1.027475,8
2,10,"{'name': 'pearson', 'user_based': True}",0.985638,3
3,10,"{'name': 'pearson', 'user_based': False}",1.013649,7
4,20,"{'name': 'cosine', 'user_based': True}",0.974191,1
5,20,"{'name': 'cosine', 'user_based': False}",0.996724,6
6,20,"{'name': 'pearson', 'user_based': True}",0.974951,2
7,20,"{'name': 'pearson', 'user_based': False}",0.986845,5


In [112]:
grid_cv.predict(1,2)

Prediction(uid=1, iid=2, r_ui=None, est=3.7233653693643416, details={'actual_k': 20, 'was_impossible': False})

## Matrix Factorization

In [116]:
from surprise import SVD

svd = SVD(n_factors = 5)

In [118]:
cv_results = cross_validate(svd,
                            data,
                            measures = ['RMSE'],
                            cv = 5,
                            verbose = True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8769  0.8684  0.8686  0.8614  0.8728  0.8696  0.0052  
Fit time          0.61    0.63    0.64    0.63    0.60    0.62    0.01    
Test time         0.20    0.36    0.17    0.17    0.16    0.21    0.07    
