# RECOMMENDER SYSTEMS

In [1]:
import numpy as np
import pandas as pd 
import statsmodels.api as st

In [None]:
beer_df = pd.read_csv("beer_reviews.csv") 
pd.set_option('display.max_rows', 500)

In [None]:
beer_df

In [None]:
beer_df = beer_df.sort_values(by=['review_time'], ascending = True).drop_duplicates(subset=['review_profilename', 'beer_beerid'] , keep="last")

In [None]:
len(beer_df)

In [None]:
#analysing the missing rows
beer_df.isnull().sum()

In [None]:
#taking only the rows with non-Null profile name 
beer_df = beer_df[beer_df['review_profilename'].notna()]

In [None]:
beer_df["brewery_name"] = beer_df["brewery_name"].fillna("unknown")
beer_df["beer_abv"] = beer_df["beer_abv"].fillna(-1)

In [None]:
len(beer_df)

In [None]:
beer_df.isnull().sum()

In [None]:
beer_df.describe()

In [None]:
#creating an additional column to create a more representative name
beer_df['beer_label'] = beer_df['brewery_name']+' // '+beer_df['beer_name']+ ' // '+beer_df['beer_style']

beer_df['beer_label']

In [None]:
#Adding myself

romans_indexes = [[16273, 5.0, 5.0, 5.0, 5.0, 5.0],
                  [5430, 3.5, 3.0, 3.0, 4.0, 3.5],
                  [1666, 3.0, 3.0, 3.0, 3.0, 3.0], 
                  [1901, 3.0, 3.0, 3.0, 3.0, 3.0],
                  [1901, 3.0, 3.0, 3.0, 3.0, 3.0],
                  [5032, 3.0, 3.0, 3.0, 3.0, 3.0],
                  [31256, 5.0, 5.0, 5.0, 5.0, 5.0],
                  [4699, 5.0, 5.0, 5.0, 5.0, 5.0],
                  [4699, 5.0, 5.0, 5.0, 5.0, 5.0],
                  [4694, 5.0, 5.0, 5.0, 5.0, 5.0],
                  [69750, 5.0, 5.0, 5.0, 5.0, 5.0],
                  [246, 4.0, 4.0, 4.0, 4.0, 4.0],
                  [5280, 3.0, 3.0, 3.0, 3.0, 3.0],
                  [6754, 3.5, 3.5, 3.0, 3.5, 3.5],
                  [2570, 3.0, 3.0, 3.0, 3.0, 3.0],
                  [5006, 4.0, 4.0, 4.0, 4.0, 4.0],
                  [55404, 4.5, 4.5, 4.5, 4.5, 4.5]
                 ]
                                 
                  
for beer in range(len(romans_indexes)):
    beer_from_db = beer_df[beer_df['beer_beerid']==romans_indexes[beer][0]].iloc[:1].copy(deep=True)
    beer_from_db['review_profilename'] = 'roman_gellert'
    beer_from_db['review_overall'] = romans_indexes[beer][1]
    beer_from_db['review_aroma'] = romans_indexes[beer][2]
    beer_from_db['review_appearance'] = romans_indexes[beer][3]
    beer_from_db['review_palate'] = romans_indexes[beer][4]
    beer_from_db['review_taste'] = romans_indexes[beer][5]
    print(beer_from_db)
    beer_df = pd.concat([beer_from_db, beer_df])


In [None]:
len(beer_df)

In [None]:
beer_df = beer_df.drop_duplicates(subset=['review_profilename', 'beer_beerid'] , keep="last")

In [None]:
len(beer_df)

In [None]:
beer_df[beer_df['review_profilename']=='roman_gellert']

In [None]:
beer_df[beer_df['beer_label'].str.lower().str.contains('guinness', regex=True, na=False)]  

In [None]:
import seaborn as sb

In [None]:
import matplotlib.pyplot as plt

# histogram of overall review
plt.title('Distibution of overall reviews')
sb.histplot(beer_df['review_overall'],bins = 12, kde=False)

In [None]:
beer_mean_overall_review = beer_df.groupby(by=['beer_style'])['beer_style'].count().sort_values(ascending = False)
len(beer_mean_overall_review)

In [None]:
import matplotlib.pyplot as plt


plt.figure(figsize=(13,4))
plt.title('Top 5 beer styles')
sb.countplot(beer_df['beer_style'], order=beer_df.beer_style.value_counts().iloc[:5].index)

In [None]:
plt.figure(figsize=(15,4))
plt.title('Top 5 reviewers based on number of reviews')
sb.countplot(beer_df['review_profilename'], order=beer_df.review_profilename.value_counts().iloc[:5].index)

In [None]:
user_export_review_count = beer_df.groupby(by='review_profilename').count()['review_overall'].sort_values(ascending = False)

plt.title('Number of reviewes by user boxtplot')
plt.xlabel('Number of reviews by user')
sb.boxplot(x=user_export_review_count)

In [None]:
user_export_review_count.describe()

In [None]:
beer_export_review_count = beer_df.groupby(by='beer_beerid').count()['review_overall'].sort_values(ascending = False)


plt.title('Number of reviewes by beer id boxtplot')
plt.xlabel('Number of reviews by beer id')
sb.boxplot(x=beer_export_review_count)

In [None]:
beer_export_review_count.describe()

In [None]:
plt.figure(figsize=(15,4))
plt.title('Top 5 mosed reviewed beers')
sb.countplot(beer_df['beer_beerid'], order=beer_df.beer_beerid.value_counts().iloc[:5].index)

In [None]:
beer_df[beer_df['beer_beerid']==92]

## Non - rersonalized recommendations

## Simple mean score

In [9]:
#as the simplest recommender system we can sort the ratings and take the top

beer_mean_overall_review = beer_df.groupby(by=['beer_beerid']).mean()['review_overall'].sort_values(ascending = False)

beer_mean_overall_review

beer_beerid
52714    5.0
6445     5.0
49011    5.0
14574    5.0
49023    5.0
        ... 
53222    1.0
52745    1.0
59853    1.0
31594    1.0
32505    1.0
Name: review_overall, Length: 66051, dtype: float64

## Confidence interval mean

In [7]:
#we count the sample size for each beer 
beer_n = beer_df.groupby(by='beer_beerid').count()['review_overall'].sort_values(ascending = False)

beer_n

beer_beerid
2093     3206
412      3038
1904     2929
4083     2644
92       2633
         ... 
53109       1
22609       1
53095       1
53038       1
77317       1
Name: review_overall, Length: 66051, dtype: int64

In [10]:
#we merge mean and sample size together in one dataset 
beer_mean_and_n = pd.merge(beer_mean_overall_review, beer_n, how='inner', on = 'beer_beerid')
beer_mean_and_n.columns = ['p','n']

beer_mean_and_n

Unnamed: 0_level_0,p,n
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1
52714,5.0,1
6445,5.0,1
49011,5.0,1
14574,5.0,1
49023,5.0,1
...,...,...
53222,1.0,1
52745,1.0,1
59853,1.0,1
31594,1.0,1


In [11]:
#we transform the mean value in order for it to fit wilson interval
beer_mean_and_n['p_prob'] = (beer_mean_and_n['p']-1)/4

beer_mean_and_n

Unnamed: 0_level_0,p,n,p_prob
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52714,5.0,1,1.0
6445,5.0,1,1.0
49011,5.0,1,1.0
14574,5.0,1,1.0
49023,5.0,1,1.0
...,...,...,...
53222,1.0,1,0.0
52745,1.0,1,0.0
59853,1.0,1,0.0
31594,1.0,1,0.0


In [12]:
#function to calculate wilson interval
from math import sqrt

def find_lower_ci_boundry(p, n):
    z = 1.96
    denom = 1 + z**2/n
    cp = p + z*z / (2*n)
    ad = sqrt((p*(1 - p) + z*z / (4*n)) / n)
    
    lower_bound = (cp - z*ad) / denom
    return lower_bound

In [13]:
#we zip n and p to run apply on a pandas dataset without issues
beer_mean_and_n['zip_p_n'] = beer_mean_and_n[['p_prob', 'n']].apply(tuple, axis=1)

beer_mean_and_n

Unnamed: 0_level_0,p,n,p_prob,zip_p_n
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
52714,5.0,1,1.0,"(1.0, 1.0)"
6445,5.0,1,1.0,"(1.0, 1.0)"
49011,5.0,1,1.0,"(1.0, 1.0)"
14574,5.0,1,1.0,"(1.0, 1.0)"
49023,5.0,1,1.0,"(1.0, 1.0)"
...,...,...,...,...
53222,1.0,1,0.0,"(0.0, 1.0)"
52745,1.0,1,0.0,"(0.0, 1.0)"
59853,1.0,1,0.0,"(0.0, 1.0)"
31594,1.0,1,0.0,"(0.0, 1.0)"


In [14]:
#we calculate lower confidence interval boundry
beer_mean_and_n['lower_ci'] = beer_mean_and_n['zip_p_n'].apply(lambda x:find_lower_ci_boundry(x[0],x[1]))

beer_mean_and_n.sort_values(ascending = False, by='lower_ci').describe()

Unnamed: 0,p,n,p_prob,lower_ci
count,66051.0,66051.0,66051.0,66051.0
mean,3.656132,23.792145,0.664033,0.241871
std,0.621987,109.224932,0.155497,0.177039
min,1.0,1.0,0.0,0.0
25%,3.416667,1.0,0.604167,0.117906
50%,3.75,2.0,0.6875,0.185313
75%,4.0,7.0,0.75,0.343225
max,5.0,3206.0,1.0,0.887089


In [15]:
beer_df[beer_df['beer_beerid']==47658]

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,beer_label,user_id
16039,1199,Founders Brewing Company,2009-06-12 19:33:00,4.5,4.5,4.5,tedpeer,American Double / Imperial Stout,5.0,4.5,Founders CBS Imperial Stout,10.6,47658,Founders Brewing Company // Founders CBS Imper...,2071
16098,1199,Founders Brewing Company,2011-11-20 19:43:45,4.5,4.5,4.5,ColForbinBC,American Double / Imperial Stout,5.0,4.5,Founders CBS Imperial Stout,10.6,47658,Founders Brewing Company // Founders CBS Imper...,135
16099,1199,Founders Brewing Company,2011-11-20 18:42:49,5.0,4.5,5.0,MrVonzipper,American Double / Imperial Stout,5.0,5.0,Founders CBS Imperial Stout,10.6,47658,Founders Brewing Company // Founders CBS Imper...,33
16100,1199,Founders Brewing Company,2011-11-20 07:11:54,5.0,5.0,4.5,MattyG85,American Double / Imperial Stout,4.5,5.0,Founders CBS Imperial Stout,10.6,47658,Founders Brewing Company // Founders CBS Imper...,2693
16101,1199,Founders Brewing Company,2011-11-20 04:43:56,4.5,4.5,4.0,litheum94,American Double / Imperial Stout,4.0,4.5,Founders CBS Imperial Stout,10.6,47658,Founders Brewing Company // Founders CBS Imper...,492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16725,1199,Founders Brewing Company,2011-11-24 07:11:20,5.0,5.0,4.0,UCLABrewN84,American Double / Imperial Stout,4.0,5.0,Founders CBS Imperial Stout,10.6,47658,Founders Brewing Company // Founders CBS Imper...,17
16726,1199,Founders Brewing Company,2011-11-22 02:27:52,5.0,4.5,5.0,DarthKostrizer,American Double / Imperial Stout,5.0,5.0,Founders CBS Imperial Stout,10.6,47658,Founders Brewing Company // Founders CBS Imper...,32
16727,1199,Founders Brewing Company,2011-11-21 14:29:29,5.0,5.0,5.0,scott9890,American Double / Imperial Stout,5.0,5.0,Founders CBS Imperial Stout,10.6,47658,Founders Brewing Company // Founders CBS Imper...,4218
16728,1199,Founders Brewing Company,2011-11-20 22:15:31,5.0,5.0,5.0,rtaps,American Double / Imperial Stout,5.0,5.0,Founders CBS Imperial Stout,10.6,47658,Founders Brewing Company // Founders CBS Imper...,4219


In [16]:
#we sort by lower confidence interval boundry
beer_mean_and_n.sort_values(ascending = False, by='lower_ci')

Unnamed: 0_level_0,p,n,p_prob,zip_p_n,lower_ci
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1545,4.617925,1272,0.904481,"(0.9044811320754718, 1272.0)",0.887089
7971,4.590461,2432,0.897615,"(0.8976151315789473, 2432.0)",0.884933
16814,4.618510,443,0.904628,"(0.904627539503386, 443.0)",0.873693
21690,4.595439,592,0.898860,"(0.8988597972972974, 592.0)",0.871942
47658,4.591424,618,0.897856,"(0.8978559870550162, 618.0)",0.871469
...,...,...,...,...,...
18351,1.000000,1,0.000000,"(0.0, 1.0)",0.000000
42646,1.000000,1,0.000000,"(0.0, 1.0)",0.000000
54166,1.000000,1,0.000000,"(0.0, 1.0)",0.000000
37224,1.000000,1,0.000000,"(0.0, 1.0)",0.000000


In [17]:
beer_lower_ci = beer_mean_and_n['lower_ci']

beer_lower_ci

beer_beerid
52714    0.206543
6445     0.206543
49011    0.206543
14574    0.206543
49023    0.206543
           ...   
53222    0.000000
52745    0.000000
59853    0.000000
31594    0.000000
32505    0.000000
Name: lower_ci, Length: 66051, dtype: float64

## Preprocessing data to insert into the DB

In [None]:
beer_df.head()

In [18]:
beer_export_review_count = beer_df.groupby(by='beer_beerid').count()['review_overall']
beer_export_review_overall_mean = beer_df.groupby(by='beer_beerid').mean()['review_overall']
beer_export_review_aroma_mean = beer_df.groupby(by='beer_beerid').mean()['review_aroma']
beer_export_review_appearance_mean = beer_df.groupby(by='beer_beerid').mean()['review_appearance']
beer_export_review_palate_mean = beer_df.groupby(by='beer_beerid').mean()['review_palate']
beer_export_review_taste_mean = beer_df.groupby(by='beer_beerid').mean()['review_taste']

beer_export = pd.merge(beer_export_review_count, beer_export_review_overall_mean
                       , how='inner', on = 'beer_beerid')
beer_export = pd.merge(beer_export, beer_export_review_aroma_mean
                       , how='inner', on = 'beer_beerid')
beer_export = pd.merge(beer_export, beer_export_review_appearance_mean
                       , how='inner', on = 'beer_beerid')
beer_export = pd.merge(beer_export, beer_export_review_palate_mean
                       , how='inner', on = 'beer_beerid')
beer_export = pd.merge(beer_export, beer_export_review_taste_mean
                       , how='inner', on = 'beer_beerid')
beer_export = pd.merge(beer_export, beer_lower_ci
                       , how='inner', on = 'beer_beerid')

beer_export = beer_export.rename(columns={'review_overall_x': 'review_num', 
                                          'review_overall_y': 'overall_mean',
                                          'review_aroma': 'aroma_mean',
                                          'review_appearance': 'appearance_mean',
                                          'review_palate': 'palate_mean',
                                          'review_taste': 'taste_mean',
                                          'lower_ci': 'overall_mean_lower_ci_bounry'
                                         })

In [19]:
beer_export = beer_export.reset_index()

In [20]:
beer_export

Unnamed: 0,beer_beerid,review_num,overall_mean,aroma_mean,appearance_mean,palate_mean,taste_mean,overall_mean_lower_ci_bounry
0,3,3,4.166667,4.000000,3.833333,4.166667,4.166667,0.282306
1,4,10,3.700000,3.800000,3.950000,3.700000,3.450000,0.374946
2,5,420,3.551190,3.207143,3.485714,3.320238,3.342857,0.590773
3,6,871,3.706659,3.514925,3.836969,3.512055,3.643513,0.644883
4,7,655,3.269466,3.177863,3.249618,3.109160,3.101527,0.529140
...,...,...,...,...,...,...,...,...
66046,77313,1,3.000000,3.000000,4.000000,3.000000,3.000000,0.054619
66047,77314,1,3.500000,3.500000,4.000000,3.000000,4.000000,0.083321
66048,77315,1,3.500000,4.500000,3.000000,3.000000,3.500000,0.083321
66049,77316,1,3.500000,3.500000,3.500000,3.500000,3.500000,0.083321


In [21]:
beer_df

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,beer_label,user_id
0,946,Windward & Leeward Brewery Ltd.,2010-12-02 16:17:25,4.5,4.5,4.5,roman_gellert,Foreign / Export Stout,4.5,4.5,Guinness Foreign Extra (St. Lucia Version),7.5,55404,Windward & Leeward Brewery Ltd. // Guinness Fo...,0
1,1374,Heineken St.Petersburg Brewery Ltd.,2010-07-28 15:46:23,4.0,4.0,4.0,roman_gellert,Euro Strong Lager,4.0,4.0,Ohota Krepkoye (Strong),8.0,5006,Heineken St.Petersburg Brewery Ltd. // Ohota K...,0
2,672,Elbrewery Co. Ltd. Sp. z o.o.,2005-09-29 17:22:56,3.0,3.0,3.0,roman_gellert,Euro Pale Lager,3.0,3.0,E.B. Specjal,5.4,2570,Elbrewery Co. Ltd. Sp. z o.o. // E.B. Specjal ...,0
3,2656,Browary Warka Sp. z o.o.,2005-02-08 00:14:04,3.5,3.5,3.0,roman_gellert,Märzen / Oktoberfest,3.5,3.5,Warka,5.6,6754,Browary Warka Sp. z o.o. // Warka // Märzen / ...,0
4,1941,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,2009-06-23 22:50:40,3.0,3.0,3.0,roman_gellert,Euro Pale Lager,3.0,3.0,Tyskie Gronie,5.6,5280,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571490,14359,The Defiant Brewing Company,2006-11-05 00:01:32,5.0,4.0,3.5,maddogruss,Pumpkin Ale,4.0,4.0,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,5887
1571491,14359,The Defiant Brewing Company,2006-10-17 01:29:26,4.0,5.0,2.5,yelterdow,Pumpkin Ale,2.0,4.0,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,707
1571492,14359,The Defiant Brewing Company,2006-10-13 01:21:53,4.5,3.5,3.0,TongoRad,Pumpkin Ale,3.5,4.0,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,7046
1571493,14359,The Defiant Brewing Company,2006-10-05 04:37:24,4.0,4.5,4.5,dherling,Pumpkin Ale,4.5,4.5,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,3979


In [24]:
brewery_name_dict = {}

def populate_brewery_name_dict(row, brewery_name_dict):
    brewery_name_dict[row['beer_beerid']] = row['brewery_name']
    
beer_df.apply(lambda row : populate_brewery_name_dict(row, brewery_name_dict), axis = 1)

print(1)

beer_style_dict = {}

def populate_beer_style_dict(row, beer_style_dict):
    beer_style_dict[row['beer_beerid']] = row['beer_style']
    
beer_df.apply(lambda row : populate_beer_style_dict(row, beer_style_dict), axis = 1)

print(2)

beer_name_dict = {}

def populate_beer_name_dict(row, beer_name_dict):
    beer_name_dict[row['beer_beerid']] = row['beer_name']
    
beer_df.apply(lambda row : populate_beer_name_dict(row, beer_name_dict), axis = 1)

print(3)

beer_abv_dict = {}

def populate_beer_abv_dict(row, beer_abv_dict):
    beer_abv_dict[row['beer_beerid']] = row['beer_abv']
    
beer_df.apply(lambda row : populate_beer_abv_dict(row, beer_abv_dict), axis = 1)

print(4)

def populate_brewery_name(row, brewery_name_dict):
    current_id = row['beer_beerid']
    
    return brewery_name_dict[current_id]


def populate_beer_style(row, beer_style_dict):
    current_id = row['beer_beerid']
    
    return beer_style_dict[current_id]


def populate_beer_name(row, beer_name_dict):
    current_id = row['beer_beerid']
    
    return beer_name_dict[current_id]


def populate_beer_abv(row, beer_abv_dict):
    current_id = row['beer_beerid']
    
    return beer_abv_dict[current_id]



beer_export['brewery_name'] = beer_export.apply(lambda row 
                                                : populate_brewery_name(row, brewery_name_dict), axis=1)
print(1)
beer_export['beer_style'] = beer_export.apply(lambda row 
                                                : populate_beer_style(row, beer_style_dict), axis=1)
print(2)
beer_export['beer_name'] = beer_export.apply(lambda row 
                                                : populate_beer_name(row, beer_name_dict), axis=1)
print(3)
beer_export['beer_abv'] = beer_export.apply(lambda row 
                                                : populate_beer_abv(row, beer_abv_dict), axis=1)

1
2
3
4
1
2
3


In [25]:
beer_export

Unnamed: 0,beer_beerid,review_num,overall_mean,aroma_mean,appearance_mean,palate_mean,taste_mean,overall_mean_lower_ci_bounry,brewery_name,beer_style,beer_name,beer_abv
0,3,3,4.166667,4.000000,3.833333,4.166667,4.166667,0.282306,Yellow Rose Brewing Company,American IPA,Cactus Queen IPA,-1.0
1,4,10,3.700000,3.800000,3.950000,3.700000,3.450000,0.374946,Yellow Rose Brewing Company,American Stout,Wildcatter's Crude Stout,-1.0
2,5,420,3.551190,3.207143,3.485714,3.320238,3.342857,0.590773,Abita Brewing Co.,Vienna Lager,Amber,4.5
3,6,871,3.706659,3.514925,3.836969,3.512055,3.643513,0.644883,Abita Brewing Co.,English Brown Ale,Turbodog,5.6
4,7,655,3.269466,3.177863,3.249618,3.109160,3.101527,0.529140,Abita Brewing Co.,Fruit / Vegetable Beer,Purple Haze,4.2
...,...,...,...,...,...,...,...,...,...,...,...,...
66046,77313,1,3.000000,3.000000,4.000000,3.000000,3.000000,0.054619,Aass Brewery,American Blonde Ale,Aass Gourmet Pale Ale,4.7
66047,77314,1,3.500000,3.500000,4.000000,3.000000,4.000000,0.083321,Lervig Aktiebryggeri AS,American Brown Ale,Betty Brown Norwegian Brwon Ale,4.7
66048,77315,1,3.500000,4.500000,3.000000,3.000000,3.500000,0.083321,Einstök Ölgerð,Witbier,Icelandic White Beer,5.2
66049,77316,1,3.500000,3.500000,3.500000,3.500000,3.500000,0.083321,Eddyline Restaurant & Brewery,American IPA,Crank Yanker IPA,7.8


In [None]:
import json

beer_export.to_json('beer_export.json')

In [None]:
beer_df.head()

In [None]:
user_export = pd.DataFrame(beer_df['review_profilename'].unique(), columns = ['review_profilename'])

In [None]:
user_export['user_id'] = user_export.index

In [None]:
user_export

In [None]:
len(user_export['user_id'].unique())

In [None]:
user_export_review_count = beer_df.groupby(by='review_profilename').count()['review_overall']
user_export_review_overall_mean = beer_df.groupby(by='review_profilename').mean()['review_overall']
user_export_review_aroma_mean = beer_df.groupby(by='review_profilename').mean()['review_aroma']
user_export_review_appearance_mean = beer_df.groupby(by='review_profilename').mean()['review_appearance']
user_export_review_palate_mean = beer_df.groupby(by='review_profilename').mean()['review_palate']
user_export_review_taste_mean = beer_df.groupby(by='review_profilename').mean()['review_taste']
user_export_review_abv_mean = beer_df.groupby(by='review_profilename').mean()['beer_abv']

user_export = pd.merge(user_export, user_export_review_count
                       , how='inner', on = 'review_profilename')
user_export = pd.merge(user_export, user_export_review_overall_mean
                       , how='inner', on = 'review_profilename')
user_export = pd.merge(user_export, user_export_review_aroma_mean
                       , how='inner', on = 'review_profilename')
user_export = pd.merge(user_export, user_export_review_appearance_mean
                       , how='inner', on = 'review_profilename')
user_export = pd.merge(user_export, user_export_review_palate_mean
                       , how='inner', on = 'review_profilename')
user_export = pd.merge(user_export, user_export_review_taste_mean
                       , how='inner', on = 'review_profilename')
user_export = pd.merge(user_export, user_export_review_abv_mean
                       , how='inner', on = 'review_profilename')

user_export = user_export.rename(columns={'review_overall_x': 'review_num', 
                                          'review_overall_y': 'overall_mean',
                                          'review_aroma': 'aroma_mean',
                                          'review_appearance': 'appearance_mean',
                                          'review_palate': 'palate_mean',
                                          'review_taste': 'taste_mean',
                                          'beer_abv': 'beer_abv_mean'
                                         })

In [None]:
user_export

In [None]:
user_export.to_json('user_export.json')

## Collaborative filtering (User - User)

In [None]:
import time
beer_df.head()

In [None]:
#a class to zip all required dictionaries together (we use dictionaries for performance sake)
from sklearn.model_selection import train_test_split

class Beer_data:
    #initiating to empty dictionaries
    def __init__(self):
        self.user_to_beer = {}
        self.beer_to_user = {}
        self.user_beer_rating = {}
        self.user_beer_rating_test = {}
        
    
    #getters to get dictionaries out of the object
    def get_U2B(self):
        return self.user_to_beer

    
    def get_B2U(self):
        return self.beer_to_user
    
    
    def get_UBR(self):
        return self.user_beer_rating
    
    
    def get_UBRT(self):
        return self.user_beer_rating_test
        
        
    #printing the size of dictionaries
    def print_size(self):
        print(f'Number of users: {len(self.user_to_beer)}')
        print(f'Number of beers: {len(self.beer_to_user)}')
        print(f'Number of training reviews: {len(self.user_beer_rating)}')
        print(f'Number of test reviews: {len(self.user_beer_rating_test)}')
    
    
    #filling dictionaries with data from pandas dataframe
    def fill(self, reviewer_max, beer_max, beer_df, test_size):
        #determining top [reviewer_max] reviewers
        reviewer_top = beer_df.groupby(by='review_profilename')['review_overall'].count().sort_values(ascending = False)
        reviewer_top = reviewer_top[0:reviewer_max]
        
        #determining top [beer_max] most reviewed beers
        beer_top = beer_df.groupby(by='beer_beerid')['review_overall'].count().sort_values(ascending = False)
        beer_top = beer_top[0:beer_max]
        
        #taking top [reviewer_max] reviewers and top [beer_max] most reviewed beer
        beer_df_top= beer_df.loc[beer_df['review_profilename'].isin(reviewer_top.index) & beer_df['beer_beerid'].isin(beer_top.index)] 
        
        #shuffling the rows
        beer_df_top.sample(frac=1)
        beer_df_top_train, beer_df_top_test = train_test_split(beer_df_top, test_size = test_size)
        
        #filling test and train sets
        for pos in range(beer_df_top_train.shape[0]):
            i = beer_df_top_train.iloc[pos]['review_profilename']
            j = beer_df_top_train.iloc[pos]['beer_beerid']	

            if i not in self.user_to_beer:
                self.user_to_beer[i] = [j]
            else:
                self.user_to_beer[i].append(j)

            if j not in self.beer_to_user:
                self.beer_to_user[j] = [i]
            else:
                self.beer_to_user[j].append(i)

            self.user_beer_rating[(i,j)] = beer_df_top_train.iloc[pos]['review_overall']
            
        for pos in range(beer_df_top_test.shape[0]):
            i = beer_df_top_test.iloc[pos]['review_profilename']
            j = beer_df_top_test.iloc[pos]['beer_beerid']	

            self.user_beer_rating_test[(i,j)] = beer_df_top_test.iloc[pos]['review_overall']

In [None]:
beer_data = Beer_data()
beer_data.fill(3000, 1000, beer_df, 0.2)

In [None]:
beer_data.print_size()

In [None]:
from scipy import spatial
from scipy.stats import pearsonr


class UU_collaborative_filtering:
    def __init__(self, beer_data, min_corr_items = 15, top_neighbours = 25):
        self.user_to_beer = beer_data.get_U2B()
        self.beer_to_user = beer_data.get_B2U()
        self.user_beer_rating = beer_data.get_UBR()
        self.user_beer_rating_test = beer_data.get_UBRT()
        
        self.user_mean_score = {}
        self.beer_mean_score = {}
        self.corr = {}
        
        self.predictions_train = {}
        self.predictions_test = {}
        
        self.MSE_train = {}
        self.MSE_test = {}
        
        self.min_corr_items = min_corr_items
        self.top_neighbours = top_neighbours
    
    
    #calculating mean scores for all users in train set
    def calculate_user_mean_scores(self):
        for user in self.user_to_beer.keys():
            mean_sum = 0
            count = 0
    
            for beer in self.user_to_beer.get(user):
                beer_rating = self.user_beer_rating[(user,beer)]
                mean_sum =  mean_sum + beer_rating
                count = count + 1

            self.user_mean_score[user] = mean_sum/count
    
    
    #calculating mean scores for all users in test set
    def calculate_beer_mean_score(self):
        for beer in self.beer_to_user.keys():
            mean_sum = 0
            count = 0
    
            for user in self.beer_to_user.get(beer):
                beer_rating = self.user_beer_rating[(user,beer)]
                mean_sum =  mean_sum + beer_rating
                count = count + 1
        
            self.beer_mean_score[beer] = mean_sum/count
    
    
    #function to calculate corellation beetween current user and the other users 
    def calculate_corr(self):
        current_user = 0
        #loop through all existing users
        for users in self.user_to_beer.keys():
            corr_list = []
            #getting beers reviewed by user
            user_reviwed_beer = set(self.user_to_beer.get(users))
            
            #skip for user himself
            for curr_users in self.user_to_beer.keys():
                if users == curr_users:
                    continue
                
                #getting beers reviewed by another user
                database_reviewed_beers = set(self.user_to_beer.get(curr_users))

                #calculating intersecting tastes
                intersecting_tastes = user_reviwed_beer.intersection(database_reviewed_beers)

                #if the number of intersecting tastes is insufficient skip the current user
                if len(intersecting_tastes) < self.min_corr_items:
                    continue

                #placing tastes in array to calculate the similarity
                user1_ratings = []
                user2_ratings = []

                for common_beer in list(intersecting_tastes):
                    user1_ratings.append(self.user_beer_rating[(users,common_beer)])
                    user2_ratings.append(self.user_beer_rating[(curr_users,common_beer)])

                #calculating similarity
                corr_list.append((curr_users, 1 - spatial.distance.cosine(user1_ratings, user2_ratings)
                                              ))
            
            #sort the list to get top correlating neighbours and make a dict to append to global dict
            corr_list = sorted(corr_list, key=lambda x: x[1], reverse = True)[:self.top_neighbours]
            corr_list = dict(corr_list)
            
            #appending top user correlations dictionary to internal dictionary of all correlations
            self.corr[users] = corr_list
            
            #for debugging purposes
#             current_user = current_user + 1
#             print(f'Calculated corr for user nr: {current_user}')

            
    #function returning pediction
    def predict(self, user, beer):
        weights_sum = 0
        weighted_numerator = 0
        
        #loop through correlated users 
        for corr_user in self.corr[user].keys():
            
            if (len(self.corr[user]) == 0):
                continue
            
            #look if correlated user has reviewed the product
            if beer in self.user_to_beer[corr_user]:
                weight = self.corr[user][corr_user]

                weighted_numerator = weighted_numerator + (self.user_beer_rating[(corr_user, beer)] 
                                    - self.user_mean_score[corr_user]) * weight
                weights_sum = weights_sum + weight
        
        # if there are no users who reviewed the product prediction is just the mean score of 
        # ratings of a particular beer
        if weights_sum == 0:
            return self.beer_mean_score[beer]
        else:
            return self.beer_mean_score[beer] + weighted_numerator/abs(weights_sum)
           
        
    #filling up predictions
    def predict_all(self):
        self.calculate_user_mean_scores()
        self.calculate_beer_mean_score()
        self.calculate_corr()
        
        counter_train = 0
        counter_test = 0
        
        for (user, beer) in self.user_beer_rating.keys():
            self.predictions_train[(user, beer)] = self.predict(user, beer)
            counter_train = counter_train + 1
            print(f'current train set prediction nr:{counter_train}')
        
        for (user, beer) in self.user_beer_rating_test.keys():
            self.predictions_test[(user, beer)] = self.predict(user, beer)
            counter_test = counter_test + 1
            print(f'current test set prediction nr:{counter_test}')
        
        
    #getting mean sqared error for both train and test 
    def get_MSE(self):
        n = 0
        SE = 0
        
        for (user, beer) in self.user_beer_rating.keys():
           SE = SE + (self.predictions_train[(user, beer)] - self.user_beer_rating[(user, beer)])**2
           n = n + 1  
                
        self.MSE_train = SE/n

        n = 0
        SE = 0
        
        for (user, beer) in self.user_beer_rating_test.keys():
           SE = SE + (self.predictions_test[(user, beer)] - self.user_beer_rating_test[(user, beer)])**2
           n = n + 1  
                
        self.MSE_test = SE/n
        
        print(f'\n\nMSE_train = {self.MSE_train}\nMSE_test = {self.MSE_test}')
        
        
    #getting sqare root mean sqared error for both train and test     
    def get_sqrt_of_MSE(self):
        self.get_MSE()
        
        print(f'\n\nMSE_train sqrt = {sqrt(self.MSE_train)}\nMSE_test sqrt = {sqrt(self.MSE_test)}')
        
        
    def get_predictions_test(self):
        return self.predictions_test

In [None]:
model_UU_Col = UU_collaborative_filtering(beer_data, min_corr_items = 5, top_neighbours = 40)

time1 = time.process_time()
model_UU_Col.predict_all()
time2 = time.process_time() - time1

model_UU_Col.get_predictions_test()
model_UU_Col.get_sqrt_of_MSE()
print(time2)

## Matrix factorization (Keras version) // Gradient descent training

In [4]:
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Embedding, Dot, Add, Flatten
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from keras.metrics import RootMeanSquaredError

In [None]:
beer_df.head()

In [None]:
beer_df

In [None]:
user_id_dict = {}

def populate_user_id_dict(row, user_id_dict):
#     print(f'{row.name}')
    user_id_dict[row['review_profilename']] = row['user_id']
    
user_export.apply(lambda row : populate_user_id_dict(row, user_id_dict), axis = 1)

len(user_id_dict)

In [None]:
def populate_user_id(row, user_id_dict):
#     print(f'{row.name}')
    return user_id_dict[row['review_profilename']]

beer_df['user_id'] = beer_df.apply(lambda row:populate_user_id(row, user_id_dict), axis=1)

In [None]:
user_num = beer_df['user_id'].max() + 1

In [None]:
user_num

In [None]:
label_num = beer_df['beer_beerid'].max() + 1

In [None]:
label_num

In [None]:
beer_df.reset_index(drop=True, inplace=True)

beer_df.to_json('beer_df_updated.json')

In [None]:
beer_df

In [9]:
from sklearn.model_selection import train_test_split
test_size = 0.2
beer_train, beer_test = train_test_split(beer_df, test_size = test_size)

In [10]:
K = 50 #dimensions of feature vector
r = 0 #regularization term
mu = beer_train.review_overall.mean() #global mean

#input layers
u = Input(shape=(1, )) #users
b = Input(shape=(1, )) #beers

#embedded layers
u_embedded = Embedding(user_num, K, embeddings_regularizer=l2(r))(u) 
b_embedded = Embedding(label_num, K, embeddings_regularizer=l2(r))(b) 

#embedded layers for bias terms
u_bias = Embedding(user_num, 1, embeddings_regularizer=l2(r))(u) 
b_bias = Embedding(label_num, 1, embeddings_regularizer=l2(r))(b)

R = Dot(axes=2)([u_embedded, b_embedded]) #prediction rating
R = Add()([R, u_bias, b_bias]) 
R = Flatten()(R)

NameError: name 'user_num' is not defined

In [None]:
callback = EarlyStopping(monitor='val_loss', patience=2)
mf_model = Model(inputs = [u, b], outputs = R)
mf_model.compile(
  loss='mse',
  optimizer=SGD(learning_rate = 0.1, momentum = 0.9), #using gradient descent
  metrics=['mse'],
)

time1 = time.process_time()
res = mf_model.fit(
  x = [beer_train.user_id.values, beer_train.beer_beerid.values],
  y = beer_train.review_overall.values - mu,
  callbacks=[callback],
  epochs = 10,
  batch_size = 200,
  validation_data=(
    [beer_test.user_id.values, beer_test.beer_beerid.values],
    beer_test.review_overall.values - mu
  )
)
time2 = time.process_time() - time1

print(time2)

In [None]:
mf_model.save('mf_keras_gd')

In [None]:
time1 = time.process_time()
time2 = time.process_time()-time1
print(time2)

In [None]:
3700/60

In [6]:
mf_model = keras.models.load_model('mf_keras_gd')

In [None]:
import matplotlib.pyplot as plt
np.sqrt(res.history['loss'])

In [None]:
plt.plot(np.sqrt(res.history['loss']))
plt.plot(np.sqrt(res.history['val_loss']))
plt.title('Keras MF gradient descent accuracy')
plt.ylabel('RMSE')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper right')

In [None]:
#predicting a single value
mf_model.predict([beer_train.user_id.values[0:1], beer_train.beer_beerid.values[0:1]]) + mu

In [None]:
mf_model.predict([beer_test.user_id.values[0:1], beer_test.beer_beerid.values[0:1]]) + mu

In [None]:
#summary of the whole model
mf_model.summary()

In [None]:
beer_df

In [None]:
lower_boundry_user = 20
higher_boundry_user = 1000

#Taking a subset of users for predictions 
beer_review_count_by_user = beer_df.groupby(by='user_id').count()
beer_review_count_by_user_diminished = beer_review_count_by_user[(beer_review_count_by_user['review_overall'] > lower_boundry_user) & (beer_review_count_by_user['review_overall'] < higher_boundry_user)]
beer_review_count_by_user_diminished = beer_review_count_by_user_diminished.reset_index()

beer_df_diminished = beer_df[beer_df.user_id.isin(beer_review_count_by_user_diminished.user_id)]
beer_df_diminished



ids = beer_df_diminished['user_id'].unique()[200:260]

ids = np.append(ids, 0)
ids


In [None]:
len(beer_df['beer_beerid'].unique())

In [4]:
user_count = beer_df.groupby(by='user_id').count()['review_overall'].sort_values(ascending = True)

In [5]:
user_count

user_id
33387       1
24378       1
24377       1
24372       1
24370       1
         ... 
199      3489
155      3517
64       4559
129      4653
43       5782
Name: review_overall, Length: 33388, dtype: int64

In [10]:
user_with_prediction_ids = []
users_for_surp_knn = []

user_with_prediction_ids.extend(list(user_count[(user_count < 500) & (user_count > 450)].index)[:15])
user_with_prediction_ids.extend(list(user_count[(user_count < 5) & (user_count > 2)].index)[:15])
user_with_prediction_ids.extend(list(user_count[(user_count < 50) & (user_count > 45)].index)[:15])
user_with_prediction_ids.append(0)
user_with_prediction_ids

[2028,
 865,
 1281,
 2310,
 1369,
 6048,
 3693,
 594,
 1122,
 2579,
 2183,
 1895,
 5861,
 3946,
 236,
 17089,
 29201,
 29539,
 22160,
 22221,
 10192,
 21429,
 6544,
 12570,
 29853,
 21435,
 6657,
 21438,
 26310,
 9570,
 9164,
 3401,
 11168,
 6240,
 11022,
 10347,
 5478,
 1964,
 5803,
 9788,
 6416,
 7788,
 6785,
 9142,
 5263,
 0]

In [15]:
user_count_array = []

for user in range(len(user_with_prediction_ids)):
    user_count_array.append(user_count[user_with_prediction_ids[user]])
    
user_count_array


user_and_review_count_df = pd.DataFrame()
user_and_review_count_df['user_id'] = user_with_prediction_ids
user_and_review_count_df['count'] = user_count_array
user_and_review_count_df

user_and_review_count_df.to_json('user_and_review_count.json')

In [12]:
beer_df[beer_df['review_profilename'] == 'roman_gellert']

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,beer_label,user_id
0,946,Windward & Leeward Brewery Ltd.,2010-12-02 16:17:25,4.5,4.5,4.5,roman_gellert,Foreign / Export Stout,4.5,4.5,Guinness Foreign Extra (St. Lucia Version),7.5,55404,Windward & Leeward Brewery Ltd. // Guinness Fo...,0
1,1374,Heineken St.Petersburg Brewery Ltd.,2010-07-28 15:46:23,4.0,4.0,4.0,roman_gellert,Euro Strong Lager,4.0,4.0,Ohota Krepkoye (Strong),8.0,5006,Heineken St.Petersburg Brewery Ltd. // Ohota K...,0
2,672,Elbrewery Co. Ltd. Sp. z o.o.,2005-09-29 17:22:56,3.0,3.0,3.0,roman_gellert,Euro Pale Lager,3.0,3.0,E.B. Specjal,5.4,2570,Elbrewery Co. Ltd. Sp. z o.o. // E.B. Specjal ...,0
3,2656,Browary Warka Sp. z o.o.,2005-02-08 00:14:04,3.5,3.5,3.0,roman_gellert,Märzen / Oktoberfest,3.5,3.5,Warka,5.6,6754,Browary Warka Sp. z o.o. // Warka // Märzen / ...,0
4,1941,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,2009-06-23 22:50:40,3.0,3.0,3.0,roman_gellert,Euro Pale Lager,3.0,3.0,Tyskie Gronie,5.6,5280,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,0
5,81,Heineken Nederland B.V.,2010-07-07 16:19:53,4.0,4.0,4.0,roman_gellert,Euro Pale Lager,4.0,4.0,Heineken Lager Beer,5.0,246,Heineken Nederland B.V. // Heineken Lager Beer...,0
6,401,Baltika Breweries,2011-06-05 20:51:58,5.0,5.0,5.0,roman_gellert,Keller Bier / Zwickel Bier,5.0,5.0,Baltika Razlivnoe,5.3,69750,Baltika Breweries // Baltika Razlivnoe // Kell...,0
7,401,Baltika Breweries,2005-09-24 17:59:54,5.0,5.0,5.0,roman_gellert,Baltic Porter,5.0,5.0,Baltika #6 Porter,7.0,4694,Baltika Breweries // Baltika #6 Porter // Balt...,0
8,401,Baltika Breweries,2008-09-21 01:12:57,5.0,5.0,5.0,roman_gellert,Euro Pale Lager,5.0,5.0,Baltika #5 Gold,5.3,4699,Baltika Breweries // Baltika #5 Gold // Euro P...,0
9,1941,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,2009-08-28 05:06:59,5.0,5.0,5.0,roman_gellert,Czech Pilsener,5.0,5.0,Tyskie Ksiazece,5.7,31256,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,0


In [None]:
user_count[user_count < 50]

In [None]:
beer_ids_temp = beer_df['beer_beerid'].unique()

ratings = []
for curr_id in ids:
    ratings.append(mf_model.predict([np.array([curr_id]*len(beer_ids_temp)), beer_ids_temp]) + mu)

In [None]:
ratings = []
user_ids = []
beer_ids = []
is_prediction = []

top_boundry = 200

# curr = 0
# for row in range(beer_df_diminished.shape[0]):
#     curr+=1
# #     print(curr)
#     ratings.append(beer_df_diminished['review_overall'].iloc[row])
#     user_ids.append(beer_df_diminished['user_id'].iloc[row])
#     beer_ids.append(beer_df_diminished['beer_beerid'].iloc[row])
#     is_prediction.append(0)
    

beer_ids_temp = beer_df['beer_beerid'].unique()

curr = 0
for user_id in user_with_prediction_ids:
    curr+=0
    top_recommendations = []
    
    reviwed_beer_list = beer_df[beer_df['user_id'] == user_id]['beer_beerid'].values
    beer_ids_not_reviewed = [beer for beer in beer_ids_temp if beer not in reviwed_beer_list]
    
    predictions = mf_model.predict([np.array([user_id] * len(beer_ids_not_reviewed)), np.array(beer_ids_not_reviewed)]) + mu
    for prediction in range(len(predictions)):
        top_recommendations.append((beer_ids_not_reviewed[prediction], predictions[prediction]))
        
    top_recommendations = sorted(top_recommendations, key=lambda x: x[1], reverse = True)[:top_boundry]
    
    for recommendation in range(len(top_recommendations)):
        ratings.append(top_recommendations[recommendation][1][0])
        user_ids.append(user_id)
        beer_ids.append(top_recommendations[recommendation][0])
        is_prediction.append(1)
        
#     print((curr/len(user_ids_temp))*100)
    
    
ratings_export = pd.DataFrame()
ratings_export['ratings'] = ratings
ratings_export['user_id'] = user_ids
ratings_export['beer_id'] = beer_ids
ratings_export['is_prediction'] = is_prediction



ratings_export.to_json('keras_mf_esimations.json')

ratings_export



In [None]:
ratings_export[ratings_export['user_id'] == 0]

In [None]:
beer_df[beer_df['beer_beerid']==8626]

## Surpise models

In [5]:
beer_df=pd.read_json('beer_df_updated.json')

In [3]:
beer_df

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,beer_label,user_id
0,946,Windward & Leeward Brewery Ltd.,2010-12-02 16:17:25,4.5,4.5,4.5,roman_gellert,Foreign / Export Stout,4.5,4.5,Guinness Foreign Extra (St. Lucia Version),7.5,55404,Windward & Leeward Brewery Ltd. // Guinness Fo...,0
1,1374,Heineken St.Petersburg Brewery Ltd.,2010-07-28 15:46:23,4.0,4.0,4.0,roman_gellert,Euro Strong Lager,4.0,4.0,Ohota Krepkoye (Strong),8.0,5006,Heineken St.Petersburg Brewery Ltd. // Ohota K...,0
2,672,Elbrewery Co. Ltd. Sp. z o.o.,2005-09-29 17:22:56,3.0,3.0,3.0,roman_gellert,Euro Pale Lager,3.0,3.0,E.B. Specjal,5.4,2570,Elbrewery Co. Ltd. Sp. z o.o. // E.B. Specjal ...,0
3,2656,Browary Warka Sp. z o.o.,2005-02-08 00:14:04,3.5,3.5,3.0,roman_gellert,Märzen / Oktoberfest,3.5,3.5,Warka,5.6,6754,Browary Warka Sp. z o.o. // Warka // Märzen / ...,0
4,1941,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,2009-06-23 22:50:40,3.0,3.0,3.0,roman_gellert,Euro Pale Lager,3.0,3.0,Tyskie Gronie,5.6,5280,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571490,14359,The Defiant Brewing Company,2006-11-05 00:01:32,5.0,4.0,3.5,maddogruss,Pumpkin Ale,4.0,4.0,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,5887
1571491,14359,The Defiant Brewing Company,2006-10-17 01:29:26,4.0,5.0,2.5,yelterdow,Pumpkin Ale,2.0,4.0,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,707
1571492,14359,The Defiant Brewing Company,2006-10-13 01:21:53,4.5,3.5,3.0,TongoRad,Pumpkin Ale,3.5,4.0,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,7046
1571493,14359,The Defiant Brewing Company,2006-10-05 04:37:24,4.0,4.5,4.5,dherling,Pumpkin Ale,4.5,4.5,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,3979


In [3]:
beer_df_surp = beer_df[['user_id','beer_beerid', 'review_overall']]

In [None]:
beer_df_surp.to_json('')

In [7]:
beer_df_surp

Unnamed: 0,user_id,beer_beerid,review_overall
0,0,55404,4.5
1,0,5006,4.0
2,0,2570,3.0
3,0,6754,3.5
4,0,5280,3.0
...,...,...,...
1571490,5887,33061,5.0
1571491,707,33061,4.0
1571492,7046,33061,4.5
1571493,3979,33061,4.0


In [8]:
users_for_surp_knn = []

users_for_surp_knn.extend(list(user_count[(user_count < 2000) & (user_count > 30)].index)[:5000])
beer_df_surp_knn = beer_df_surp[(beer_df_surp['user_id'].isin(user_with_prediction_ids))|(beer_df_surp['user_id'].isin(users_for_surp_knn))]
beer_df_initial_knn = beer_df[(beer_df['user_id'].isin(user_with_prediction_ids))|(beer_df_surp['user_id'].isin(users_for_surp_knn))]
beer_df_surp_knn

Unnamed: 0,user_id,beer_beerid,review_overall
0,0,55404,4.5
1,0,5006,4.0
2,0,2570,3.0
3,0,6754,3.5
4,0,5280,3.0
...,...,...,...
1571481,1913,33061,3.5
1571482,1896,33061,4.5
1571484,1790,33061,4.0
1571485,3363,33061,4.0


In [9]:
beer_df_initial_knn = beer_df[(beer_df['user_id'].isin(user_with_prediction_ids))|(beer_df['user_id'].isin(users_for_surp_knn))]
beer_df_initial_knn

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,beer_label,user_id
0,946,Windward & Leeward Brewery Ltd.,2010-12-02 16:17:25,4.5,4.5,4.5,roman_gellert,Foreign / Export Stout,4.5,4.5,Guinness Foreign Extra (St. Lucia Version),7.5,55404,Windward & Leeward Brewery Ltd. // Guinness Fo...,0
1,1374,Heineken St.Petersburg Brewery Ltd.,2010-07-28 15:46:23,4.0,4.0,4.0,roman_gellert,Euro Strong Lager,4.0,4.0,Ohota Krepkoye (Strong),8.0,5006,Heineken St.Petersburg Brewery Ltd. // Ohota K...,0
2,672,Elbrewery Co. Ltd. Sp. z o.o.,2005-09-29 17:22:56,3.0,3.0,3.0,roman_gellert,Euro Pale Lager,3.0,3.0,E.B. Specjal,5.4,2570,Elbrewery Co. Ltd. Sp. z o.o. // E.B. Specjal ...,0
3,2656,Browary Warka Sp. z o.o.,2005-02-08 00:14:04,3.5,3.5,3.0,roman_gellert,Märzen / Oktoberfest,3.5,3.5,Warka,5.6,6754,Browary Warka Sp. z o.o. // Warka // Märzen / ...,0
4,1941,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,2009-06-23 22:50:40,3.0,3.0,3.0,roman_gellert,Euro Pale Lager,3.0,3.0,Tyskie Gronie,5.6,5280,Tyskie Browary Ksi&#261;&#380;&#281;ce (SABMil...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571481,14359,The Defiant Brewing Company,2011-10-02 23:44:13,3.5,4.0,3.5,ummswimmin,Pumpkin Ale,3.0,3.5,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,1913
1571482,14359,The Defiant Brewing Company,2010-12-02 02:35:38,4.5,4.0,4.0,pzrhsau,Pumpkin Ale,4.5,4.0,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,1896
1571484,14359,The Defiant Brewing Company,2010-11-10 05:16:44,4.0,4.5,4.0,infi,Pumpkin Ale,4.0,4.0,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,1790
1571485,14359,The Defiant Brewing Company,2010-11-04 17:03:26,4.0,4.5,4.5,njmoons,Pumpkin Ale,3.5,3.5,The Horseman's Ale,5.2,33061,The Defiant Brewing Company // The Horseman's ...,3363


In [10]:
from surprise import Dataset
from surprise.dataset import Reader
from surprise.model_selection import train_test_split

In [11]:
reader = Reader(rating_scale=(0, 5))

In [12]:
data_surprise = Dataset.load_from_df(beer_df_surp, reader)
data_surprise_knn = Dataset.load_from_df(beer_df_surp_knn, reader)

# K-NN surprise 

In [13]:
from surprise.prediction_algorithms.knns import KNNWithZScore
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.knns import KNNWithMeans
from surprise.prediction_algorithms.knns import KNNBaseline
from surprise.model_selection.search import GridSearchCV
from surprise.model_selection.validation import cross_validate
from surprise import SVD, accuracy
from surprise.model_selection import train_test_split

In [None]:
param_grid = {'k': [10, 40], 'min_k': [1,  10]}

## K-NN

In [16]:
sim_options = {
    "name": "cosine",
    "user_based": True,
}
k = 40
min_k = 5

In [14]:
trainset_surprise, testset_surprise = train_test_split(data_surprise, test_size=0.2)
trainset_surprise_knn, testset_surprise_knn = train_test_split(data_surprise_knn, test_size=0.2)
trainset_surprise_knn_full = data_surprise_knn.build_full_trainset()
trainset_surprise_full = data_surprise.build_full_trainset()

In [48]:
knn_basic_model = KNNBasic(k = k, min_k = min_k, sim_option = sim_options, verbose=True)
knn_means_model = KNNWithMeans(k = k, min_k = min_k, sim_option = sim_options, verbose=True)
knn_z_model = KNNWithZScore(k = k, min_k = min_k, sim_option = sim_options, verbose=True)
knn_baseline_model = KNNBaseline(k = k, min_k = min_k, sim_option = sim_options, verbose=True)

In [15]:
from time import perf_counter
from surprise import trainset

In [31]:
def predict_surprise(model, top_boundary, initial_df, file_name, users_with_predictions):
    ratings = []
    user_ids = []
    beer_ids = []
    is_prediction = []
    beer_ids_temp = initial_df['beer_beerid'].unique()
    

    for user_id in users_with_predictions:
        top_recommendations = []

        reviwed_beer_list = initial_df[initial_df['user_id'] == user_id]['beer_beerid'].values
        beer_ids_not_reviewed = [beer for beer in beer_ids_temp if beer not in reviwed_beer_list]

        predictions = []
        for beer in range(len(beer_ids_not_reviewed)):

            try:
                user_raw_id = trainset.to_raw_uid(user_id)
            except:
                user_raw_id = user_id

            try:
                beer_raw_id = trainset.to_raw_iid(beer_ids_not_reviewed[beer])
            except:
                beer_raw_id = beer_ids_not_reviewed[beer]
            
            predictions.append(model.predict(user_raw_id, beer_raw_id, verbose=False).est)
            
        for prediction in range(len(predictions)):
            top_recommendations.append((beer_ids_not_reviewed[prediction], predictions[prediction]))

        top_recommendations = sorted(top_recommendations, key=lambda x: x[1], reverse = True)[:top_boundary]

        for recommendation in range(len(top_recommendations)):
            ratings.append(top_recommendations[recommendation][1])
            user_ids.append(int(user_id))
            beer_ids.append(int(top_recommendations[recommendation][0]))
            is_prediction.append(1)

    #     print((curr/len(user_ids_temp))*100)


    ratings_export = pd.DataFrame()
    ratings_export['ratings'] = ratings
    ratings_export['user_id'] = user_ids
    ratings_export['beer_id'] = beer_ids
    ratings_export['is_prediction'] = is_prediction



    ratings_export.to_json(file_name+'.json')

    ratings_export

In [23]:
start = perf_counter()
knn_basic_model.fit(trainset_surprise_knn)
stop = perf_counter()-start
print(stop)

Computing the msd similarity matrix...
Done computing similarity matrix.
6.438866399999995


In [24]:
predictions_test = knn_basic_model.test(testset_surprise_knn)
accuracy.rmse(predictions_test)

RMSE: 0.6390


0.6389650463845626

In [26]:
knn_basic_model.fit(trainset_surprise_knn_full)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x20b1594bf70>

In [32]:
predict_surprise(knn_basic_model, 200, beer_df_initial_knn, "surprise_knn_basic", user_with_prediction_ids)

In [38]:
del(knn_basic_model)

In [33]:
start = perf_counter()
knn_means_model.fit(trainset_surprise_knn)
stop = perf_counter()-start
print(stop)

Computing the msd similarity matrix...
Done computing similarity matrix.
6.635482799999977


In [34]:
predictions_test = knn_means_model.test(testset_surprise_knn)
accuracy.rmse(predictions_test)

RMSE: 0.6318


0.631750003542497

In [35]:
knn_means_model.fit(trainset_surprise_knn_full)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x20c0e154100>

In [36]:
predict_surprise(knn_means_model, 200, beer_df_initial_knn, "surprise_knn_means", user_with_prediction_ids)

In [37]:
del(knn_means_model)

In [39]:
start = perf_counter()
knn_z_model.fit(trainset_surprise_knn)
stop = perf_counter()-start
print(stop)

Computing the msd similarity matrix...
Done computing similarity matrix.
7.035629299999982


In [41]:
predictions_test = knn_z_model.test(testset_surprise_knn)
accuracy.rmse(predictions_test)

RMSE: 0.6353


0.6352560142030818

In [49]:
knn_z_model.fit(trainset_surprise_knn_full)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x20b2c791280>

In [50]:
predict_surprise(knn_z_model, 200, beer_df_initial_knn, "surprise_knn_z", user_with_prediction_ids)

In [51]:
del(knn_z_model)

In [45]:
start = perf_counter()
knn_baseline_model.fit(trainset_surprise_knn)
stop = perf_counter()-start
print(stop)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
8.145165899999938


In [46]:
predictions_test = knn_baseline_model.test(testset_surprise_knn)
accuracy.rmse(predictions_test)

RMSE: 0.6158


0.6158084377176758

In [52]:
knn_baseline_model.fit(trainset_surprise_knn_full)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x20b2d03d3d0>

In [53]:
predict_surprise(knn_baseline_model, 200, beer_df_initial_knn, "surprise_knn_baseline", user_with_prediction_ids)

In [54]:
del(knn_baseline_model)

## MF

In [82]:
from surprise.prediction_algorithms.matrix_factorization import SVD

In [89]:
mf_param_grid = {'n_epochs': [10, 20, 30], 
                  'biased': [True, False],
                  'n_factors': [10, 25, 50, 100]}

In [90]:
mf_grid = GridSearchCV(SVD, mf_param_grid, measures=['RMSE'])

In [91]:
mf_grid.fit(data_surprise)

In [94]:
mf_grid.best_score["rmse"]

0.6031426070888231

In [93]:
mf_grid.best_params["rmse"]

{'n_epochs': 30, 'biased': True, 'n_factors': 10}

In [44]:
mf_model = SVD(n_epochs = 30, biased=True, n_factors=10)

In [45]:
start = perf_counter()
mf_model.fit(trainset_surprise)
stop = perf_counter()-start
print(stop)

14.03780209999968


In [50]:
predictions_test = mf_model.test(testset_surprise)
accuracy.rmse(predictions_test)

RMSE: 0.6045


0.6044526464063864

In [46]:
predict_surprise(mf_model, 200, beer_df, "surprise_mf_basic", user_with_prediction_ids)

In [103]:
del(mf_model)
del(mf_grid)

In [16]:
from surprise.prediction_algorithms.matrix_factorization import NMF

In [27]:
nmf_param_grid = {'n_epochs': [30], 
                  'biased': [True],
                  'n_factors': [10, 25, 50, 100]}

In [28]:
nmf_grid = GridSearchCV(NMF, nmf_param_grid, measures=['RMSE'])

In [None]:
nmf_grid.fit(data_surprise)

In [None]:
nmf_grid.best_score["rmse"]

In [None]:
nmf_grid.best_params["rmse"]

In [47]:
nmf_model = NMF(n_epochs = 30, biased=True, n_factors=10)

In [48]:
start = perf_counter()
nmf_model.fit(trainset_surprise)
stop = perf_counter()-start
print(stop)

18.314723100000265


In [51]:
predictions_test = nmf_model.test(testset_surprise)
accuracy.rmse(predictions_test)

RMSE: 0.6108


0.6108353666846058

In [49]:
predict_surprise(nmf_model, 200, beer_df, "surprise_mf_nn", user_with_prediction_ids)

## SVD++

In [33]:
from surprise.prediction_algorithms.matrix_factorization import SVDpp

In [37]:
svd_pp = SVDpp(n_epochs = 30, n_factors=10, verbose=True)

In [38]:
start = perf_counter()
svd_pp.fit(trainset_surprise)
stop = perf_counter()-start
print(stop)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
1877.3336154999997


In [39]:
predictions_test = svd_pp.test(testset_surprise)
accuracy.rmse(predictions_test)

RMSE: 0.6037


0.6037032105171044

In [40]:
predict_surprise(svd_pp, 200, beer_df, "surprise_mf_svdpp", user_with_prediction_ids)

## Co-clustering

In [52]:
from surprise.prediction_algorithms.co_clustering import CoClustering

In [57]:
cc_model = CoClustering(n_cltr_u=10, n_cltr_i=10, n_epochs=30, random_state=None, verbose=True)

In [58]:
start = perf_counter()
cc_model.fit(trainset_surprise)
stop = perf_counter()-start
print(stop)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
116.6840514000005


In [59]:
predictions_test = cc_model.test(testset_surprise)
accuracy.rmse(predictions_test)

RMSE: 0.6807


0.6807203751804353

In [56]:
predict_surprise(cc_model, 200, beer_df, "surprise_cc", user_with_prediction_ids)