# Content Based Filtering based on reviews

Please run this notebook from the beerly notebooks dir, see: https://github.com/TomsHL/beerly

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer,ENGLISH_STOP_WORDS
from pathlib import Path
# The notebook is expected to run from the `notebooks` directory, so the
# parent of the working directory is the project root holding `raw_data`.
common_dir = Path().resolve().parent
data_path = common_dir.joinpath('raw_data')

## Preprocessing 
+ import the dataset of reviews as `dataset`,
+ import `dataset_reviews`, which is grouped by `beer_id` with the reviews simply concatenated, and turn it into a Series,
+ import the output of the OCR step, a dataframe called `menu_ocr`,
+ create the `liked` Series with `beer_id` as index and reviews as content,
+ create the `menu` Series from the OCR dataframe `menu_ocr` and the matching reviews from the `beers` Series.

In [2]:
# Load the three CSV inputs, in this order: the cleaned review dataset, the
# per-beer aggregated reviews and the beer ids produced by the OCR step.
dataset, dataset_reviews, menu_ocr = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ('dataset_cleaned.csv', 'dataset_agg2.csv', 'from_ocr.csv')
)

In [3]:
dataset_reviews

Unnamed: 0,beer_id,review_text
0,3,"The label is very informative, except it didn'..."
1,4,"No dating. Had this one in the fridge, then on..."
2,5,"Compared to the previous beer I ordered, this ..."
3,6,"Smells malty. Tastes malty, then chocolate, th..."
4,7,Cloudy yellow beer with a head that goes to ab...
...,...,...
38307,77247,January seasonal at the Toronto location. Hopp...
38308,77264,A: Crystal clarity with a light oxidized orang...
38309,77293,"Dark, malty, and pleasantly less alcoholic tha..."
38310,77296,On tap at the Bull & Castle.\t\tHazy gold with...


In [6]:
# Discard the rows that have no review text. Rebinding the name (instead of
# `inplace=True`) keeps the cell a plain, re-runnable assignment.
dataset = dataset.dropna(subset=['review_text'])

In [7]:
dataset.head()

Unnamed: 0,beer_id,beer_name,brewery_name,beer_style,beer_abv,overall,aroma,appearance,palate,taste,user_id,review_text,beer_brewery
0,52159,Caldera Ginger Beer,Caldera Brewing Company,Herbed / Spiced Beer,4.7,3.0,3.5,3.5,3.0,3.5,27145,Poured from the bottle into a Chimay goblet.\t...,Caldera Ginger Beer Caldera Brewing Company
1,52159,Caldera Ginger Beer,Caldera Brewing Company,Herbed / Spiced Beer,4.7,3.5,3.5,3.5,4.0,4.0,10198,"22 oz bottle from ""Lifesource"" Salem. $3.95 Ni...",Caldera Ginger Beer Caldera Brewing Company
2,52159,Caldera Ginger Beer,Caldera Brewing Company,Herbed / Spiced Beer,4.7,3.0,2.5,3.5,2.0,3.5,13942,"Bottle says ""Malt beverage brewed with Ginger ...",Caldera Ginger Beer Caldera Brewing Company
3,52159,Caldera Ginger Beer,Caldera Brewing Company,Herbed / Spiced Beer,4.7,4.0,3.0,3.5,3.5,4.0,7687,I'm not sure why I picked this up... I like gi...,Caldera Ginger Beer Caldera Brewing Company
4,52159,Caldera Ginger Beer,Caldera Brewing Company,Herbed / Spiced Beer,4.7,4.5,3.5,5.0,4.0,4.0,14374,Poured from a 22oz bomber into my Drie Fontein...,Caldera Ginger Beer Caldera Brewing Company


In [73]:
# Series of aggregated review text indexed by `beer_id`.
# `dataset_reviews` itself is deliberately left untouched: indexing it in
# place while keeping the `beer_id` column makes `beer_id` both an index level
# and a column label, and the later `merge(..., on='beer_id')` then raises
# "ambiguous" ValueError when the notebook is run top to bottom.
beers = dataset_reviews.set_index('beer_id')['review_text']
beers.head()

beer_id
3    The label is very informative, except it didn'...
4    No dating. Had this one in the fridge, then on...
5    Compared to the previous beer I ordered, this ...
6    Smells malty. Tastes malty, then chocolate, th...
7    Cloudy yellow beer with a head that goes to ab...
Name: review_text, dtype: object

### Create `liked`, the Series containing a user's reviewed beers.

Here we will simulate an input from user with `user_id = 89_000`

`user_id = 89_000` is a lover of Belgian tripel beers,

`user_id = 90_001` is a stout lover.

In [6]:
# All the rows of the simulated user, copied so later edits never touch `dataset`.
rated = dataset.loc[dataset['user_id'] == 89_000].copy()

In [52]:
rated

Unnamed: 0,beer_id,beer_name,brewery_name,beer_style,beer_abv,overall,aroma,appearance,palate,taste,user_id,beer_brewery,review_text
0,635,Pauwel Kwak,Brouwerij Bosteels,Belgian Strong Pale Ale,8.4,4.0,4.0,4.0,4.0,4.0,89000,Pauwel Kwak Brouwerij Bosteels,"First had: bottle at Eulogy, Philly, PA\t\tPre..."
1,656,Tripel Karmeliet,Brouwerij Bosteels,Tripel,8.4,4.0,4.0,4.0,4.0,4.0,89000,Tripel Karmeliet Brouwerij Bosteels,No thanks for this one. I thought this one was...
2,1836,La Chouffe,Brasserie d'Achouffe,Belgian Strong Pale Ale,8.0,4.0,4.0,4.0,4.0,4.0,89000,La Chouffe Brasserie d'Achouffe,How does one describe La Chouffe? SPICY! The i...
3,1385,Delirium Tremens,Brouwerij Huyghe,Belgian Strong Pale Ale,8.5,4.0,4.0,4.0,4.0,4.0,89000,Delirium Tremens Brouwerij Huyghe,"A- Poured great. Nice head, golden color.\t\tS..."
4,7578,"Carolus, Der Starke",Binding-Brauerei AG,Doppelbock,7.5,4.0,4.0,4.0,4.0,4.0,89000,"Carolus, Der Starke Binding-Brauerei AG",Poured from a half litre bottle into lager gla...
5,3970,Kronenbourg 1664,Brasseries Kronenbourg,Euro Pale Lager,5.9,1.0,1.0,1.0,1.0,1.0,89000,Kronenbourg 1664 Brasseries Kronenbourg,"Pours golden yellow, good head with lacing.\tS..."
6,2512,Chimay Grande Réserve (Blue),Bières de Chimay S.A.,Belgian Strong Dark Ale,9.0,4.5,4.5,4.5,4.5,4.5,89000,Chimay Grande Réserve (Blue) Bières de Chimay ...,"Pours a dark, hazy brown, with a foamy tan hea..."
7,672,Chimay Première (Red),Bières de Chimay S.A.,Dubbel,7.0,4.0,4.0,4.0,4.0,4.0,89000,Chimay Première (Red) Bières de Chimay S.A.,Pours a clear red brown with rocky white head....
8,646,Westmalle Trappist Tripel,Brouwerij Westmalle,Tripel,9.5,4.0,4.0,4.0,4.0,4.0,89000,Westmalle Trappist Tripel Brouwerij Westmalle,Pours a deep yellow with a big fluffy white he...
9,673,Rodenbach Grand Cru,Brouwerij Rodenbach N.V.,Flanders Red Ale,6.0,4.0,4.0,4.0,4.0,4.0,89000,Rodenbach Grand Cru Brouwerij Rodenbach N.V.,"Appearance: Dark red/Brown, small, short-lived..."


In [57]:
# Swap the user's own review text for the aggregated reviews of each beer.
# The index of `dataset_reviews` is reset first: if it has been indexed by
# `beer_id` while keeping the `beer_id` column, merging on 'beer_id' is
# ambiguous for pandas and raises a ValueError.
rated = (
    rated
    .drop(columns='review_text')
    .merge(dataset_reviews.reset_index(drop=True), on='beer_id', how='left')
)

In [95]:
rated

Unnamed: 0,beer_id,beer_name,brewery_name,beer_style,beer_abv,overall,aroma,appearance,palate,taste,user_id,beer_brewery,review_text
0,635,Pauwel Kwak,Brouwerij Bosteels,Belgian Strong Pale Ale,8.4,4.0,4.0,4.0,4.0,4.0,89000,Pauwel Kwak Brouwerij Bosteels,"First had: bottle at Eulogy, Philly, PA\t\tPre..."
1,656,Tripel Karmeliet,Brouwerij Bosteels,Tripel,8.4,4.0,4.0,4.0,4.0,4.0,89000,Tripel Karmeliet Brouwerij Bosteels,No thanks for this one. I thought this one was...
2,1836,La Chouffe,Brasserie d'Achouffe,Belgian Strong Pale Ale,8.0,4.0,4.0,4.0,4.0,4.0,89000,La Chouffe Brasserie d'Achouffe,How does one describe La Chouffe? SPICY! The i...
3,1385,Delirium Tremens,Brouwerij Huyghe,Belgian Strong Pale Ale,8.5,4.0,4.0,4.0,4.0,4.0,89000,Delirium Tremens Brouwerij Huyghe,"A- Poured great. Nice head, golden color.\t\tS..."
4,7578,"Carolus, Der Starke",Binding-Brauerei AG,Doppelbock,7.5,4.0,4.0,4.0,4.0,4.0,89000,"Carolus, Der Starke Binding-Brauerei AG",Poured from a half litre bottle into lager gla...
5,3970,Kronenbourg 1664,Brasseries Kronenbourg,Euro Pale Lager,5.9,1.0,1.0,1.0,1.0,1.0,89000,Kronenbourg 1664 Brasseries Kronenbourg,"Pours golden yellow, good head with lacing.\tS..."
6,2512,Chimay Grande Réserve (Blue),Bières de Chimay S.A.,Belgian Strong Dark Ale,9.0,4.5,4.5,4.5,4.5,4.5,89000,Chimay Grande Réserve (Blue) Bières de Chimay ...,"Pours a dark, hazy brown, with a foamy tan hea..."
7,672,Chimay Première (Red),Bières de Chimay S.A.,Dubbel,7.0,4.0,4.0,4.0,4.0,4.0,89000,Chimay Première (Red) Bières de Chimay S.A.,Pours a clear red brown with rocky white head....
8,646,Westmalle Trappist Tripel,Brouwerij Westmalle,Tripel,9.5,4.0,4.0,4.0,4.0,4.0,89000,Westmalle Trappist Tripel Brouwerij Westmalle,Pours a deep yellow with a big fluffy white he...
9,673,Rodenbach Grand Cru,Brouwerij Rodenbach N.V.,Flanders Red Ale,6.0,4.0,4.0,4.0,4.0,4.0,89000,Rodenbach Grand Cru Brouwerij Rodenbach N.V.,"Appearance: Dark red/Brown, small, short-lived..."


If the user reviewed the same beer more than once, we concatenate the reviews. This lets us use `beer_id` as the Series index.

In [58]:
# Beers the user rated 3 or more overall. The left merge above leaves NaN in
# `review_text` for beers without aggregated reviews; those rows are dropped
# because neither `' '.join` nor the vectorizer accepts NaN.
liked_df = rated[rated.overall >= 3].dropna(subset=['review_text'])
if liked_df['beer_id'].duplicated().any():
    # Same beer rated several times: concatenate so `beer_id` is a unique index.
    liked = liked_df.groupby('beer_id')['review_text'].agg(' '.join)
else:
    liked = liked_df.set_index('beer_id')['review_text']
liked.head()

beer_id
635     First had: bottle at Eulogy, Philly, PA\t\tPre...
656     No thanks for this one. I thought this one was...
1836    How does one describe La Chouffe? SPICY! The i...
1385    A- Poured great. Nice head, golden color.\t\tS...
7578    Poured from a half litre bottle into lager gla...
Name: review_text, dtype: object

### Create `menu`, a dummy Series containing the beers from a hypothetical menu.

In [77]:
# Keep only the beers whose id was recognised on the menu by the OCR step.
on_menu = beers.index.isin(menu_ocr['beer_id'])
menu = beers[on_menu]

In [78]:
menu

beer_id
655      While not a top pilsner, this beer is not bad,...
678      Appearance is royal: golden liquid instantly f...
837      What can i say its the worst beer i've ever ha...
3763     500 ml bottle.\t\tA- Brown. Cloudy with a frot...
3970     Pours golden yellow, good head with lacing.\tS...
4611     Enjoyed on-tap @ the Flamingo Bowl in St. Loui...
5223     A: Poured a cloudy dark copper color with almo...
5620     Bottle date of 11/29/10, consumed 01/03/2011. ...
9085     On tap at the brewpub.\t\tCloudy bright yellow...
12705    Originally reviewed on January 24, 2005. On ta...
22956    A brewpub in Farmington? Well, I've been visit...
30717    Nitro-tap at the brewpub.\t\tPours black with ...
31366    A: Dark red/brown. Head vanished also instantl...
34590    A: Pours a golden color, not too much head at ...
39759    Look: Poured a hazy gold color with a white he...
40341    A-Pours a hazy whitish-gold w/white head, dece...
43943    Reviewed on 6/28/08. On (nitro) tap at 

## Vectorizing
+ first we'll concatenate the menu beers on top of the liked beers,
+ then we'll feed the resulting Series to a `CountVectorizer` tuned to ignore case, numbers and single-letter words.

Concatenation into `payload`

In [79]:
# Menu reviews first, liked reviews second: the similarity step below splits
# the vectorized rows by position (`[:m]` / `[m:]`), so this order matters.
payload = pd.concat([menu,liked])

Lemmatization of the reviews

In [80]:
#USELESS
#from nltk.stem import WordNetLemmatizer
#from nltk.tokenize import word_tokenize
#lemmatizer = WordNetLemmatizer()
#payload = payload.apply(lambda s:' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(s)]))

Extending the default `'english'` set of stop words

In [81]:
# Extra words to ignore, on top of scikit-learn's built-in English list.
custom_words = frozenset([
    'abv', 'adds', 'come', 'comes', 'coming', 'drink', 'drinking', 've',
    'thing', 'things', 'oz', 'think', 'thought', 'll', 'actually', 'bottle',
])
stop_words = ENGLISH_STOP_WORDS | custom_words

### Vectorization

In [82]:
# Bag-of-words counts over lowercase alphabetic tokens of two letters or more.
# `stop_words` is handed over as a list: the documented accepted values are
# 'english', a list or None, and recent scikit-learn releases reject a frozenset.
vectorizer = CountVectorizer(
    stop_words=sorted(stop_words),
    min_df=5,    # keep terms present in at least 5 documents
    max_df=40,   # drop terms present in more than 40 documents
    max_features=400,
    token_pattern=r'(?u)\b[a-z]{2,}\b',
)
count_matrix = vectorizer.fit_transform(payload)
count_df = pd.DataFrame(count_matrix.toarray(), index=payload.index.tolist())
count_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
655,3,1,4,0,2,0,1,14,28,8,...,1,4,10,4,1,0,2,8,7,29
678,0,1,1,0,0,0,0,1,5,1,...,0,0,1,0,0,0,0,9,3,12
837,44,8,91,18,16,4,11,12,169,22,...,7,41,31,32,14,15,46,31,9,523
3763,0,1,3,0,0,0,2,9,6,3,...,4,0,2,0,0,0,0,27,11,1
3970,7,5,38,3,9,3,3,21,65,11,...,1,9,23,22,4,6,18,29,7,136


In [83]:
vectorizer.get_feature_names_out()

array(['absolutely', 'acidic', 'alcohol', 'alcoholic', 'ale', 'ales',
       'amazing', 'amber', 'appearance', 'apple', 'apples', 'apricot',
       'aromas', 'average', 'away', 'awesome', 'background', 'bad',
       'balance', 'balanced', 'banana', 'bananas', 'beautiful', 'beers',
       'beige', 'belgian', 'belgians', 'belgium', 'bernardus', 'best',
       'big', 'bite', 'bitterness', 'black', 'blend', 'blue', 'booze',
       'boozy', 'bottled', 'bottles', 'bread', 'bready', 'bright',
       'brown', 'bubbles', 'bubbly', 'burn', 'burnt', 'buy', 'candi',
       'candied', 'candy', 'cap', 'caramel', 'carbonated', 'certainly',
       'chalice', 'champagne', 'cherries', 'cherry', 'chimay',
       'chocolate', 'cider', 'cinnamon', 'citrus', 'citrusy', 'class',
       'classic', 'clean', 'clear', 'cloudy', 'clove', 'cloves', 'cold',
       'colored', 'colour', 'complex', 'complexity', 'content', 'coors',
       'copper', 'coriander', 'cork', 'corn', 'couple', 'cream', 'creamy',
       'cris

## Compute similarities
+ compute a similarity matrix between beers from the menu and user's liked beers,
+ compute the mean for each menu beer to create a 'similarity' score.

### Similarity matrix

In [84]:
# `payload` was built as menu rows followed by liked rows, so the first `m`
# rows of the count matrix are the menu beers and the rest the liked beers.
m = len(menu)
menu_vectors, liked_vectors = count_matrix[:m], count_matrix[m:]
cosine_sim = cosine_similarity(menu_vectors, liked_vectors)

In [85]:
# Label the similarity matrix: one row per menu beer, one column per liked beer.
# `rename_axis` names the axes on a new frame; assigning `menu.index` /
# `liked.index` and then setting `.name` on them would also rename the index
# of `menu` and `liked`, because the Index objects are shared.
similarity = (
    pd.DataFrame(cosine_sim, index=menu.index, columns=liked.index)
    .rename_axis(index='menu_beers_id', columns='liked_beers_id')
)
similarity

liked_beers_id,635,656,1836,1385,7578,2512,672,646,673,1882,...,617,58261,7456,129,13067,722,1708,645,1696,1711
menu_beers_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
655,0.509777,0.598702,0.608604,0.602793,0.332978,0.39618,0.459959,0.550575,0.272602,0.287512,...,0.623129,0.488602,0.611801,0.568064,0.66607,0.515532,0.365335,0.340786,0.376641,0.339485
678,0.43676,0.599781,0.652332,0.618876,0.146445,0.360843,0.4326,0.572883,0.252383,0.254005,...,0.635843,0.460363,0.589856,0.558184,0.660333,0.574134,0.33039,0.285197,0.331746,0.27247
837,0.323511,0.367913,0.352983,0.424979,0.176407,0.283868,0.295254,0.352247,0.206112,0.210465,...,0.357711,0.230011,0.362559,0.358386,0.468995,0.314641,0.262247,0.256272,0.258654,0.24683
3763,0.562482,0.549092,0.522207,0.520791,0.402458,0.552601,0.610281,0.513345,0.312557,0.339123,...,0.565069,0.458905,0.534148,0.533628,0.471127,0.563294,0.561564,0.519647,0.569514,0.498309
3970,0.477831,0.570552,0.569151,0.601519,0.306635,0.367216,0.412554,0.528874,0.262482,0.272148,...,0.57361,0.418282,0.5703,0.529025,0.675046,0.484181,0.340651,0.320823,0.346483,0.323826
4611,0.377593,0.565253,0.541443,0.558411,0.151688,0.327064,0.406118,0.513602,0.199276,0.20315,...,0.556415,0.369243,0.537083,0.49001,0.522693,0.540063,0.324137,0.268084,0.308365,0.227683
5223,0.590752,0.517896,0.56988,0.517029,0.279733,0.420063,0.502007,0.5295,0.263409,0.277765,...,0.603813,0.675695,0.55828,0.642753,0.579669,0.5168,0.392189,0.360893,0.408758,0.370191
5620,0.592147,0.311945,0.312084,0.295187,0.334115,0.340999,0.418796,0.292893,0.207136,0.223541,...,0.349313,0.399353,0.333481,0.411987,0.357743,0.319377,0.327824,0.290306,0.335552,0.332293
9085,0.204479,0.344868,0.317477,0.339807,0.048541,0.175422,0.226611,0.295483,0.179062,0.181261,...,0.309738,0.172139,0.303444,0.277982,0.320539,0.302536,0.163098,0.129635,0.150817,0.121136
12705,0.20407,0.13366,0.13769,0.116873,0.410861,0.297605,0.255417,0.142837,0.169633,0.17898,...,0.18284,0.142475,0.14517,0.152911,0.139406,0.156945,0.326902,0.360925,0.353239,0.336352


### Create a similarity score by averaging along columns.

In [86]:
# Average each menu beer's similarity over all liked beers, best match first.
mean_similarity = similarity.mean(axis='columns')
out = mean_similarity.sort_values(ascending=False)

In [88]:
# Name the ranking and its index, then display it multiplied by 5
# (presumably to match the 0-5 rating scale; TODO confirm).
out = out.rename('score').rename_axis('beer_id')
out * 5

beer_id
60675    3.385128
34590    3.357626
48493    3.080062
47541    2.977820
54654    2.843908
60798    2.726645
46080    2.672466
3763     2.546639
40341    2.400369
5223     2.385072
655      2.358993
678      2.226757
3970     2.217077
60959    2.113136
39759    2.088134
4611     1.971759
58463    1.933105
5620     1.706793
837      1.516949
72684    1.506422
31366    1.491850
9085     1.123183
12705    1.107540
22956    0.996966
43943    0.969073
30717    0.839030
Name: score, dtype: float64

In [92]:
# Show a single row describing beer 3763 (first row found for that id).
dataset.loc[dataset['beer_id'] == 3763].head(1)

Unnamed: 0,beer_id,beer_name,brewery_name,beer_style,beer_abv,overall,aroma,appearance,palate,taste,user_id,review_text,beer_brewery
1350710,3763,Paulaner Hefe-Weissbier Dunkel,Paulaner Brauerei GmbH & Co. KG,Dunkelweizen,5.3,4.5,4.0,3.5,4.0,4.0,6370,500 ml bottle.\t\tA- Brown. Cloudy with a frot...,Paulaner Hefe-Weissbier Dunkel Paulaner Brauer...


# Appendix

## 1. TF-IDF vs Bag of words

In [228]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF counterpart of the bag-of-words vectorizer: same token pattern,
# `min_df` and vocabulary cap, but the built-in English stop words only and
# no `max_df` limit.
tfid = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    max_features=400,
    token_pattern=r'(?u)\b[a-z]{2,}\b',
)

tfid_count_matrix = tfid.fit_transform(payload)
tfid_df = pd.DataFrame(tfid_count_matrix.toarray(), index=payload.index.tolist())
tfid_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,88,89,90,91,92,93,94,95,96,97
5510,0.0,0.0,0.0,0.226147,0.0,0.0,0.0,0.0,0.217361,0.0,...,0.0,0.147372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.17304,0.0,0.142993,0.0,0.0
72176,0.0,0.0,0.0,0.0,0.0,0.213954,0.0,0.0,0.0,0.0,...,0.0,0.0,0.202365,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.162348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299404
69523,0.0,0.0,0.0,0.0,0.168975,0.0,0.0,0.0,0.0,0.0,...,0.0,0.114566,0.0,0.0,0.0,0.0,0.0,0.0,0.200802,0.0


In [229]:
tfid.get_feature_names_out()

array(['abv', 'aftertaste', 'alcohol', 'ale', 'amber', 'american',
       'appearance', 'apperance', 'aroma', 'banana', 'beer', 'beers',
       'belgian', 'better', 'bit', 'bitter', 'bitterness', 'bodied',
       'body', 'bottle', 'brew', 'brown', 'carbonation', 'carmel',
       'character', 'chocolate', 'citrus', 'clean', 'clear', 'cloying',
       'color', 'creamy', 'dark', 'decent', 'didn', 'does', 'doesn',
       'drinkability', 'drinkable', 'dry', 'finish', 'flavor', 'flavors',
       'floral', 'fresh', 'fruit', 'fruity', 'glass', 'golden', 'good',
       'great', 'head', 'high', 'hint', 'hop', 'hoppy', 'hops', 'just',
       'lace', 'lacing', 'lemon', 'light', 'like', 'little', 'malt',
       'malts', 'medium', 'moderate', 'mouthfeel', 'nice', 'nose', 'note',
       'notes', 'orange', 'pale', 'pours', 'quite', 'real', 'really',
       'red', 'rich', 'smell', 'smooth', 'style', 'surface', 'sweet',
       'sweetness', 'tan', 'tap', 'taste', 'tastes', 'thing', 'think',
       'try',

In [230]:
vectorizer.get_feature_names_out()

array(['abv', 'aftertaste', 'alcohol', 'ale', 'amber', 'american',
       'appearance', 'apperance', 'aroma', 'banana', 'beer', 'beers',
       'belgian', 'better', 'bit', 'bitter', 'bitterness', 'bodied',
       'body', 'bottle', 'brew', 'brown', 'carbonation', 'carmel',
       'character', 'chocolate', 'citrus', 'clean', 'clear', 'cloying',
       'color', 'creamy', 'dark', 'decent', 'didn', 'does', 'doesn',
       'drinkability', 'drinkable', 'dry', 'finish', 'flavor', 'flavors',
       'floral', 'fresh', 'fruit', 'fruity', 'glass', 'golden', 'good',
       'great', 'head', 'high', 'hint', 'hop', 'hoppy', 'hops', 'just',
       'lace', 'lacing', 'lemon', 'light', 'like', 'little', 'malt',
       'malts', 'medium', 'moderate', 'mouthfeel', 'nice', 'nose', 'note',
       'notes', 'orange', 'pale', 'pours', 'quite', 'real', 'really',
       'red', 'rich', 'smell', 'smooth', 'style', 'surface', 'sweet',
       'sweetness', 'tan', 'tap', 'taste', 'tastes', 'thing', 'think',
       'try',

In [234]:
# Share of the TF-IDF vocabulary that the bag-of-words vocabulary also holds.
# A set intersection is used because an element-wise `==` between the two
# sorted arrays is position-sensitive (a single extra word shifts every
# following entry) and fails when the two vocabularies differ in length.
tfid_features = set(tfid.get_feature_names_out())
bow_features = set(vectorizer.get_feature_names_out())
f'{100 * len(tfid_features & bow_features) / len(tfid_features)}%'

'100.0%'

## 2. Python functions

In [3]:
def predict_content(dataset: pd.DataFrame,
                    dataset_reviews: pd.DataFrame,
                    menu_ocr: pd.DataFrame, user_id: int) -> pd.Series:
    """Rank the beers of a menu by review similarity with a user's liked beers.

    Every menu beer and every liked beer is represented by its aggregated
    review text, turned into bag-of-words counts. The score of a menu beer is
    the mean cosine similarity between its counts and those of each liked beer.

    Parameters
    ----------
    dataset : pd.DataFrame
        One row per rating. The columns `user_id`, `beer_id`, `overall` and
        `review_text` are used.
    dataset_reviews : pd.DataFrame
        Aggregated reviews with the columns `beer_id` and `review_text`.
        Its index is ignored and the frame is not modified.
    menu_ocr : pd.DataFrame
        Beers found on the menu; only the `beer_id` column is used.
    user_id : int
        User whose ratings of 3 or more (`overall`) define the liked beers.

    Returns
    -------
    pd.Series
        Series named `score`, indexed by `beer_id`, sorted from the most to
        the least similar menu beer.

    Raises
    ------
    ValueError
        If no menu beer has review text, or if the user has no liked beer
        with review text. The vectorizer may raise as well, e.g. when its
        `min_df` / `max_df` pruning leaves no term.
    """
    # Only two columns are needed. Resetting the index makes the merge below
    # safe even when the caller's frame is indexed by `beer_id` while still
    # holding a `beer_id` column (ambiguous for `merge(on='beer_id')`).
    reviews = (dataset_reviews
               .reset_index(drop=True)[['beer_id', 'review_text']]
               .dropna(subset=['review_text']))

    # menu: review text of the beers recognised on the menu, indexed by beer_id
    beers = reviews.set_index('beer_id')['review_text']
    menu = beers[beers.index.isin(menu_ocr['beer_id'])]
    if menu.empty:
        raise ValueError(
            'none of the beers in `menu_ocr` has a review in `dataset_reviews`')

    # liked: aggregated review text of the beers the user rated 3 or more
    rated = dataset[dataset['user_id'] == user_id]
    rated = rated.drop(columns='review_text').merge(reviews,
                                                    on='beer_id',
                                                    how='left')
    # the left merge leaves NaN for beers without aggregated reviews
    liked_df = rated[rated['overall'] >= 3].dropna(subset=['review_text'])
    if liked_df.empty:
        raise ValueError(
            f'user {user_id} has no liked beer (overall >= 3) with review text')

    if liked_df['beer_id'].duplicated().any():
        # same beer rated several times: concatenate to get a unique index
        liked = liked_df.groupby('beer_id')['review_text'].agg(' '.join)
    else:
        liked = liked_df.set_index('beer_id')['review_text']

    # Vectorizing
    custom_words = frozenset([
        'abv', 'adds', 'come', 'comes', 'coming', 'drink', 'drinking', 've',
        'thing', 'things', 'oz', 'think', 'thought', 'll', 'actually', 'bottle'
    ])
    # passed as a list: documented values are 'english', a list or None, and
    # recent scikit-learn releases reject a frozenset
    stop_words = sorted(ENGLISH_STOP_WORDS.union(custom_words))

    # menu rows first, liked rows second: the matrix is split by position below
    payload = pd.concat([menu, liked])
    vectorizer = CountVectorizer(stop_words=stop_words,
                                 min_df=5,
                                 max_df=40,
                                 max_features=400,
                                 token_pattern=r'(?u)\b[a-z]{2,}\b')
    count_matrix = vectorizer.fit_transform(payload)

    # similarity: rows are menu beers, columns are liked beers
    m = len(menu)
    cosine_sim = cosine_similarity(count_matrix[:m], count_matrix[m:])
    similarity = pd.DataFrame(cosine_sim,
                              index=menu.index,
                              columns=liked.index)

    # ranking: mean similarity per menu beer, best first
    ranking = similarity.mean(axis=1).sort_values(ascending=False)
    return ranking.rename('score').rename_axis('beer_id')

In [4]:
predict_content(dataset, dataset_reviews, menu_ocr, 89_000)

beer_id
60675    0.677026
34590    0.671525
48493    0.616012
47541    0.595564
54654    0.568782
60798    0.545329
46080    0.534493
3763     0.509328
40341    0.480074
5223     0.477014
655      0.471799
678      0.445351
3970     0.443415
60959    0.422627
39759    0.417627
4611     0.394352
58463    0.386621
5620     0.341359
837      0.303390
72684    0.301284
31366    0.298370
9085     0.224637
12705    0.221508
22956    0.199393
43943    0.193815
30717    0.167806
Name: score, dtype: float64