# Content Based Filtering based on reviews

In [30]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

## Preprocessing 
+ import a subset of the dataset,
+ create the `liked` series with beer_id as index and reviews as content,
+ create the `menu` series, a hypothetical menu with beer_id as index and reviews as content.

In [3]:
# Load the first 100,000 rows and discard every row that has a missing value.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
kag = pd.read_csv('/home/bxnxne/code/TomsHL/beerly/raw_data/kaggle_v2.csv', nrows=100_000).dropna()

### Create `liked`, the series containing the reviews of the beers a user liked.

Here we will simulate an input from user with `user_id = 246` 

In [199]:
# Every review written by the simulated user (user_id 246).
rated = kag.loc[kag['user_id'] == 246]

If the user reviewed the same beer more than once, we concatenate the reviews. This makes each `beer_id` unique, so it can be used as the series index.

In [211]:
# Keep only the reviews where the user gave an overall rating of 3 or more.
liked_df = rated.loc[rated['overall'] >= 3]

if liked_df['beer_id'].duplicated().any():
    # Same beer reviewed several times: merge its reviews into one space-separated text.
    liked = liked_df.groupby('beer_id')['review_text'].agg(' '.join)
else:
    # Each beer appears once, so beer_id can index the reviews directly.
    liked = liked_df.set_index('beer_id')['review_text']

liked.head()

beer_id
276     I had one of these the other night and I can't...
798     I saw that one of the Alstroms reviewed this, ...
1027    Apperance: A copper colored body crowned with ...
1525    Appearance: Into a pint glass, this beer pours...
1620    Bought a 765 mL bottle of this recently. Quite...
Name: review_text, dtype: object

### Create `menu`, a dummy series containing beers from a hypothetical menu.

In [202]:
# One row per beer is enough here: keep the first row found for each beer_id.
beers = kag.drop_duplicates(subset='beer_id')

In [207]:
# Draw 20 beers with a fixed seed so the simulated menu is reproducible.
menu_df = beers.sample(n=20, random_state=4)
# Reviews of the menu beers, indexed by beer_id.
menu = menu_df.set_index('beer_id')['review_text']
menu.head()

beer_id
5510     Pours a light, yet full, copper color and is t...
70232    Reviewed from notes. Thanks to my dad for brin...
72176    Bottle number 213 and pours a dark kola hue wi...
26201    Pale yellow pour with minimal head and no laci...
69523    The beer is clouded dark amber with a tan head...
Name: review_text, dtype: object

## Vectorizing
+ first we'll concatenate the menu beers on top of the liked beers,
+ then we'll feed the resulting series to a `CountVectorizer` tuned to ignore case, English stop words, numbers and single-letter words.

Concatenation into `payload`

In [215]:
# Stack the menu reviews on top of the liked reviews. The first len(menu) rows
# are the menu beers; the similarity step slices the matrix at that position.
payload = pd.concat([menu, liked])

Vectorization

In [216]:
# Bag-of-words over all reviews: English stop words removed, tokens limited to
# runs of two or more letters a-z, terms must appear in at least 5 reviews,
# and at most 300 terms are kept.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_features=300,
    token_pattern=r'(?u)\b[a-z]{2,}\b',
)
count_matrix = vectorizer.fit_transform(payload)

# Dense view of the counts, one row per beer, for inspection only.
count_df = pd.DataFrame(count_matrix.toarray(), index=payload.index.tolist())
count_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,88,89,90,91,92,93,94,95,96,97
5510,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
70232,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
72176,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
26201,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
69523,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


## Compute similarities
+ compute a similarity matrix between beers from the menu and user's liked beers,
+ compute the mean for each menu beer to create a 'similarity' score.

### Similarity matrix

In [222]:
# Rows [0, m) of count_matrix are the menu beers, rows [m, end) the liked beers.
m = len(menu)
cosine_sim = cosine_similarity(X=count_matrix[:m], Y=count_matrix[m:])

In [221]:
# Rows are menu beers, columns are the user's liked beers.
# The axes are named through rename_axis, which returns a frame with new axis
# objects. Assigning menu.index / liked.index and then setting `.name` on them
# can write the new name through to `menu` and `liked` themselves.
similarity = (
    pd.DataFrame(cosine_sim, index=menu.index, columns=liked.index)
    .rename_axis(index='menu_beers_id', columns='liked_beers_id')
)
similarity

liked_beers_id,276,798,1027,1525,1620,1621,1904,2671,3517,6549,7063,9906,13161,19765,20470,20539,20746
menu_beers_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5510,0.386658,0.160872,0.161515,0.475683,0.257775,0.29576,0.351431,0.242882,0.321176,0.317982,0.439587,0.427724,0.439672,0.177003,0.354144,0.266733,0.405054
70232,0.339457,0.232621,0.136237,0.281387,0.186371,0.246732,0.47187,0.340877,0.29554,0.41382,0.333712,0.171802,0.299183,0.073127,0.495022,0.321412,0.292854
72176,0.280976,0.30005,0.159752,0.238302,0.169975,0.212167,0.212814,0.193801,0.148522,0.161749,0.260875,0.18131,0.30697,0.064312,0.380304,0.376889,0.240381
26201,0.231793,0.077152,0.161374,0.414781,0.240381,0.327327,0.210675,0.222686,0.315063,0.343122,0.237171,0.427352,0.217061,0.151585,0.226455,0.239851,0.194257
69523,0.309058,0.282889,0.387298,0.328368,0.343401,0.363696,0.300965,0.274075,0.373408,0.152499,0.29866,0.303895,0.372104,0.30317,0.320812,0.408635,0.275198
13112,0.525657,0.262445,0.170783,0.274352,0.20767,0.268055,0.386763,0.349619,0.344016,0.374654,0.298807,0.258438,0.234404,0.137505,0.449359,0.302184,0.238623
41843,0.216295,0.062994,0.158114,0.4445,0.364502,0.267261,0.245737,0.167836,0.342997,0.342415,0.279715,0.325669,0.303822,0.247537,0.323575,0.174078,0.297394
58540,0.389643,0.332875,0.177229,0.427062,0.350202,0.342368,0.448583,0.362817,0.521773,0.328982,0.434122,0.46934,0.267577,0.0,0.466321,0.209061,0.323824
53362,0.482226,0.324102,0.21693,0.464647,0.365452,0.366679,0.488864,0.268648,0.313725,0.298957,0.531369,0.351069,0.382103,0.424522,0.396376,0.40303,0.421622
49088,0.398049,0.436436,0.228218,0.384949,0.412796,0.289319,0.510754,0.302813,0.420813,0.242624,0.577651,0.36262,0.285044,0.214373,0.380304,0.376889,0.377742


### Create a similarity score by averaging along columns.

In [223]:
# Average each menu beer's similarities across all liked beers, best match first.
out = similarity.mean(axis='columns').sort_values(ascending=False)

In [224]:
# Label the scores: values are called 'score', the index holds the menu beer ids.
out = out.rename('score').rename_axis('beer_id')
out

beer_id
53362    0.382372
49088    0.364788
27589    0.361066
45733    0.350249
58540    0.344222
66769    0.327698
5510     0.322450
16896    0.320287
69523    0.317537
13112    0.299020
70232    0.290119
70732    0.271669
59802    0.268883
41843    0.268497
26201    0.249299
72176    0.228773
61745    0.206612
65937    0.189254
65973    0.161765
65884    0.155408
Name: score, dtype: float64

# Appendix

## 1. TF-IDF vs Bag of words

In [228]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same arguments as the CountVectorizer above, so only the term weighting differs.
tfid = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    max_features=300,
    token_pattern=r'(?u)\b[a-z]{2,}\b',
)
tfid_count_matrix = tfid.fit_transform(payload)

# Dense view of the TF-IDF weights, one row per beer.
pd.DataFrame(tfid_count_matrix.toarray(), index=payload.index.tolist()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,88,89,90,91,92,93,94,95,96,97
5510,0.0,0.0,0.0,0.226147,0.0,0.0,0.0,0.0,0.217361,0.0,...,0.0,0.147372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.17304,0.0,0.142993,0.0,0.0
72176,0.0,0.0,0.0,0.0,0.0,0.213954,0.0,0.0,0.0,0.0,...,0.0,0.0,0.202365,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.162348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299404
69523,0.0,0.0,0.0,0.0,0.168975,0.0,0.0,0.0,0.0,0.0,...,0.0,0.114566,0.0,0.0,0.0,0.0,0.0,0.0,0.200802,0.0


In [229]:
# Terms kept by the TF-IDF vectorizer, in column order.
tfid.get_feature_names_out()

array(['abv', 'aftertaste', 'alcohol', 'ale', 'amber', 'american',
       'appearance', 'apperance', 'aroma', 'banana', 'beer', 'beers',
       'belgian', 'better', 'bit', 'bitter', 'bitterness', 'bodied',
       'body', 'bottle', 'brew', 'brown', 'carbonation', 'carmel',
       'character', 'chocolate', 'citrus', 'clean', 'clear', 'cloying',
       'color', 'creamy', 'dark', 'decent', 'didn', 'does', 'doesn',
       'drinkability', 'drinkable', 'dry', 'finish', 'flavor', 'flavors',
       'floral', 'fresh', 'fruit', 'fruity', 'glass', 'golden', 'good',
       'great', 'head', 'high', 'hint', 'hop', 'hoppy', 'hops', 'just',
       'lace', 'lacing', 'lemon', 'light', 'like', 'little', 'malt',
       'malts', 'medium', 'moderate', 'mouthfeel', 'nice', 'nose', 'note',
       'notes', 'orange', 'pale', 'pours', 'quite', 'real', 'really',
       'red', 'rich', 'smell', 'smooth', 'style', 'surface', 'sweet',
       'sweetness', 'tan', 'tap', 'taste', 'tastes', 'thing', 'think',
       'try',

In [230]:
# Terms kept by the CountVectorizer, in column order, to compare with the TF-IDF vocabulary.
vectorizer.get_feature_names_out()

array(['abv', 'aftertaste', 'alcohol', 'ale', 'amber', 'american',
       'appearance', 'apperance', 'aroma', 'banana', 'beer', 'beers',
       'belgian', 'better', 'bit', 'bitter', 'bitterness', 'bodied',
       'body', 'bottle', 'brew', 'brown', 'carbonation', 'carmel',
       'character', 'chocolate', 'citrus', 'clean', 'clear', 'cloying',
       'color', 'creamy', 'dark', 'decent', 'didn', 'does', 'doesn',
       'drinkability', 'drinkable', 'dry', 'finish', 'flavor', 'flavors',
       'floral', 'fresh', 'fruit', 'fruity', 'glass', 'golden', 'good',
       'great', 'head', 'high', 'hint', 'hop', 'hoppy', 'hops', 'just',
       'lace', 'lacing', 'lemon', 'light', 'like', 'little', 'malt',
       'malts', 'medium', 'moderate', 'mouthfeel', 'nice', 'nose', 'note',
       'notes', 'orange', 'pale', 'pours', 'quite', 'real', 'really',
       'red', 'rich', 'smell', 'smooth', 'style', 'surface', 'sweet',
       'sweetness', 'tan', 'tap', 'taste', 'tastes', 'thing', 'think',
       'try',

In [234]:
# Percentage of positions at which both vectorizers selected the same term.
tfid_terms = tfid.get_feature_names_out()
count_terms = vectorizer.get_feature_names_out()
f'{100 * (tfid_terms == count_terms).sum() / len(tfid_terms)}%'

'100.0%'

## 2. Python functions

In [235]:
def menu_similarity_matrix(dataset, menu_df, user_id, min_rating=3, min_df=5, max_features=300):
    """Compare every beer on a menu with the beers a user liked.

    The review texts of the menu beers and of the user's liked beers are
    turned into bag-of-words count vectors, and the cosine similarity between
    each menu beer and each liked beer is computed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Reviews with the columns ``user_id``, ``beer_id``, ``overall`` and
        ``review_text``.
    menu_df : pandas.DataFrame
        One row per menu beer with the columns ``beer_id`` and ``review_text``.
    user_id
        Value matched against ``dataset['user_id']``.
    min_rating : number, default 3
        A review counts as "liked" when ``overall >= min_rating``.
    min_df : int, default 5
        Forwarded to ``CountVectorizer(min_df=...)``.
    max_features : int, default 300
        Forwarded to ``CountVectorizer(max_features=...)``.

    Returns
    -------
    pandas.DataFrame
        Rows are the menu beers (axis name ``menu_beers_id``), columns are the
        liked beers (axis name ``liked_beers_id``), values are cosine
        similarities. Liked beers are ordered by ``beer_id`` when repeated
        reviews had to be merged, otherwise in ``dataset`` order.

    Raises
    ------
    ValueError
        If ``menu_df`` is empty or the user has no review with
        ``overall >= min_rating``. scikit-learn may also raise it when the
        vectorizer settings leave no usable term.
    """
    # Menu reviews as a Series indexed by beer_id.
    menu = menu_df.set_index('beer_id')['review_text']
    m = len(menu)
    if m == 0:
        raise ValueError("menu_df is empty: there is no menu beer to score")

    # Reviews written by the user, restricted to the beers they appreciated.
    user_reviews = dataset[dataset['user_id'] == user_id]
    liked_df = user_reviews[user_reviews['overall'] >= min_rating]
    if liked_df.empty:
        # Fail early with an explicit message instead of an opaque error from
        # the similarity computation on a zero-row matrix.
        raise ValueError(
            f"user_id {user_id!r} has no liked beers (overall >= {min_rating})"
        )

    if liked_df['beer_id'].duplicated().any():
        # The user reviewed some beer more than once: merge those reviews into
        # a single space-separated text so beer_id is unique.
        liked = liked_df.groupby('beer_id')['review_text'].agg(' '.join)
    else:
        liked = liked_df.set_index('beer_id')['review_text']

    # Menu reviews first, liked reviews after: the matrix is split at row m below.
    payload = pd.concat([menu, liked])

    vectorizer = CountVectorizer(
        stop_words='english',
        min_df=min_df,
        max_features=max_features,
        token_pattern=r'(?u)\b[a-z]{2,}\b',
    )
    count_matrix = vectorizer.fit_transform(payload)

    # Menu rows against liked rows.
    cosine_sim = cosine_similarity(count_matrix[:m], count_matrix[m:])

    similarity = pd.DataFrame(cosine_sim, index=menu.index, columns=liked.index)
    return similarity.rename_axis(index='menu_beers_id', columns='liked_beers_id')

In [236]:
def score_menu(similarity):
    """Rank menu beers by their mean similarity to the liked beers.

    Takes the row-wise mean of ``similarity`` and returns it as a Series named
    ``'score'``, sorted from highest to lowest, with the index named
    ``'beer_id'``.
    """
    ranked = similarity.mean(axis='columns').sort_values(ascending=False)
    return ranked.rename('score').rename_axis('beer_id')

In [237]:
# Similarity of each beer of the sampled menu (menu_df) to the beers liked by user 246.
menu_similarity_matrix(kag, menu_df, 246)

liked_beers_id,276,798,1027,1525,1620,1621,1904,2671,3517,6549,7063,9906,13161,19765,20470,20539,20746
menu_beers_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5510,0.386658,0.160872,0.161515,0.475683,0.257775,0.29576,0.351431,0.242882,0.321176,0.317982,0.439587,0.427724,0.439672,0.177003,0.354144,0.266733,0.405054
70232,0.339457,0.232621,0.136237,0.281387,0.186371,0.246732,0.47187,0.340877,0.29554,0.41382,0.333712,0.171802,0.299183,0.073127,0.495022,0.321412,0.292854
72176,0.280976,0.30005,0.159752,0.238302,0.169975,0.212167,0.212814,0.193801,0.148522,0.161749,0.260875,0.18131,0.30697,0.064312,0.380304,0.376889,0.240381
26201,0.231793,0.077152,0.161374,0.414781,0.240381,0.327327,0.210675,0.222686,0.315063,0.343122,0.237171,0.427352,0.217061,0.151585,0.226455,0.239851,0.194257
69523,0.309058,0.282889,0.387298,0.328368,0.343401,0.363696,0.300965,0.274075,0.373408,0.152499,0.29866,0.303895,0.372104,0.30317,0.320812,0.408635,0.275198
13112,0.525657,0.262445,0.170783,0.274352,0.20767,0.268055,0.386763,0.349619,0.344016,0.374654,0.298807,0.258438,0.234404,0.137505,0.449359,0.302184,0.238623
41843,0.216295,0.062994,0.158114,0.4445,0.364502,0.267261,0.245737,0.167836,0.342997,0.342415,0.279715,0.325669,0.303822,0.247537,0.323575,0.174078,0.297394
58540,0.389643,0.332875,0.177229,0.427062,0.350202,0.342368,0.448583,0.362817,0.521773,0.328982,0.434122,0.46934,0.267577,0.0,0.466321,0.209061,0.323824
53362,0.482226,0.324102,0.21693,0.464647,0.365452,0.366679,0.488864,0.268648,0.313725,0.298957,0.531369,0.351069,0.382103,0.424522,0.396376,0.40303,0.421622
49088,0.398049,0.436436,0.228218,0.384949,0.412796,0.289319,0.510754,0.302813,0.420813,0.242624,0.577651,0.36262,0.285044,0.214373,0.380304,0.376889,0.377742


In [238]:
# Compute the matrix explicitly instead of reading IPython's last-output
# variable `_`, which only works when the previous cell was the last one run.
score_menu(menu_similarity_matrix(kag, menu_df, 246))

beer_id
53362    0.382372
49088    0.364788
27589    0.361066
45733    0.350249
58540    0.344222
66769    0.327698
5510     0.322450
16896    0.320287
69523    0.317537
13112    0.299020
70232    0.290119
70732    0.271669
59802    0.268883
41843    0.268497
26201    0.249299
72176    0.228773
61745    0.206612
65937    0.189254
65973    0.161765
65884    0.155408
Name: score, dtype: float64