## Imports and Preprocessing

In [1]:
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import seaborn as sns
sns.set()

# Import CSV
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# local copy of the data set before running the notebook.
file_path = '/Users/phani/Downloads/beer_reviews/beer_reviews.csv'

# `beerData` is the complete review table. The analysis cells below all read
# from `beerData` (and still expect the `brewery_id` / `review_time` columns to
# be present), so it is loaded under that name and never mutated here.
beerData = pd.read_csv(file_path, delimiter=',', encoding='utf-8')

# Column groups of the review table
_reviews = ['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste']
_attribs = ['beer_style', 'beer_abv']
_ids = ['brewery_name', 'beer_name', 'beer_beerid']
_users = ['review_profilename']

# `beer_data` is a trimmed copy without the two columns the column groups above
# do not cover. A non-mutating drop keeps `beerData` intact and makes this cell
# safe to re-run.
beer_data = beerData.drop(['brewery_id', 'review_time'], axis=1)

## General Recommendation

### Recommend beers based on overall reviews from all reviewers

A simple ordering of the data by review_overall would give us a ranked list of beers. However, the review_overall values for each beer are scattered, and not every beer is rated by every user. 
Hence, we calculate the means of review_overall values for a given beer and assign that value as the overall rating for that beer. Then we rank the beers based on review_overall and then pick the top few beers as recommendations based on this data.

#### Data Cleaning 
The number of reviews is not the same for all beers, so we compute the sample mean of the reviews gathered for each beer and assign it as that beer's single overall rating. However, some beers have only one review whereas others have many. Hence, we need to clean the data to include only those beers whose mean can be estimated within a given margin of error. 

#### Choosing Sample Means: 
The statistical way to choose the threshold number of reviews (the minimum sample size) is to compute the minimum number of reviews a beer needs for its mean to be estimated within the chosen margin of error at a 95% confidence level. 

We use this formula: $n = \frac{\sigma^2 Z^2}{m^2}$, where $n$ is the minimum number of reviews, $\sigma$ is the standard deviation of the sample, $Z = 1.96$ is the Z-score for a 95% confidence level, and $m$ is the allowed margin of error. 

In [None]:
# Work on the four columns needed to rate each beer
samplesDF = beerData[["beer_beerid","beer_name","review_overall", "review_profilename"]]

# keep a single review per (beer, reviewer) pair
samplesDF = samplesDF.drop_duplicates(["beer_beerid","review_profilename"])

# index by (beer id, beer name) so that level 0 -- beer_beerid -- is the grouping key
samplesDF = samplesDF.set_index(["beer_beerid","beer_name"])

# Per-beer statistics: nSamples, sampleMeans, sampleStdDev
reviewsByBeer = samplesDF.groupby(level=0)

# count() is defined for every column, so it keeps the original layout:
# {column name: {beer_beerid: number of non-null values}}
nSamples = reviewsByBeer.count().to_dict()

# mean()/std() are only meaningful for the numeric rating. `review_profilename`
# is text, and pandas >= 2.0 raises a TypeError on it instead of silently
# dropping it, so the rating column is selected explicitly. The double brackets
# keep a DataFrame, so both results are still keyed by "review_overall".
sampleMeans = reviewsByBeer[["review_overall"]].mean().to_dict()
# NOTE: the sample standard deviation is NaN for a beer with a single review
sampleStdDev = reviewsByBeer[["review_overall"]].std()

# Define Margin of Error and Z-score for 95% confidence interval
mError = 0.1
zScore = 1.96

#### 1. filter out sampleMeans with fewer reviews than the minimum required for a 95% confidence level 
#### 2. sort sampleMeans and rank beer_ids from the sorted sampleMeans
#### 3. reject samples with std dev = 0.0

In [None]:
# Keep only the beers whose mean overall review is statistically trustworthy.
# Only the "review_overall" statistics are of interest here.
overallCounts = nSamples["review_overall"]      # {beer_beerid: number of reviews}
overallMeans = sampleMeans["review_overall"]    # {beer_beerid: mean overall review}
overallStdDev = sampleStdDev["review_overall"]  # Series indexed by beer_beerid

sampleMeansTemp = {}
for beerID, nReviews in overallCounts.items():
    stdDev = overallStdDev.loc[beerID]
    # Reject beers whose std dev is 0.0 (all reviews identical) or NaN (single
    # review). Without this guard such a beer would be compared against the
    # threshold computed for the *previous* beer.
    if not stdDev > 0:
        continue
    # minimum sample size for the requested margin of error: (sigma * Z / m)^2
    nSamplesRequired = (stdDev * zScore / mError) ** 2
    if nReviews > nSamplesRequired:
        sampleMeansTemp[beerID] = overallMeans[beerID]

# redefine sampleMeans as a list of (beer_beerid, mean) pairs, best rated first
sampleMeans = sorted(sampleMeansTemp.items(), key=lambda x: x[1] , reverse=True)

1. Filter out the beerIDs that are not included in sampleMeans list
2. make a new dataframe 

#### Note: appending rows to make a new data frame takes a lot of time. 
3. So take the original data frame and drop the rows by comparing beerIDs

In [None]:
# beer ids that passed the screening, in ranking order
reviewBeerIDs = [beerKey[0] for beerKey in sampleMeans]
# set copy for constant-time membership tests (the list scan was quadratic)
keptBeerIDs = set(reviewBeerIDs)

# one row per beer
newBeerDF = beerData.drop_duplicates(["beer_beerid"])
beerIDsAll = newBeerDF.beer_beerid.tolist()

# list the ids that we need to discard (kept as a list: it is reused later as
# the argument of DataFrame.drop)
discardBeerIDs = [beerID for beerID in beerIDsAll if beerID not in keptBeerIDs]
newBeerDF = newBeerDF.set_index(["beer_beerid"])
newBeerDF = newBeerDF.drop(discardBeerIDs)

# drop other labels and leave only a few for visualization.
# errors='ignore' keeps the cell working if some of these columns were already
# removed from the source frame.
newBeerDF = newBeerDF.drop(['brewery_id','review_time', 'review_overall','review_aroma','review_taste',
                    'review_palate','review_profilename','beer_abv','review_appearance'],
                           axis=1, errors='ignore')

In [None]:
# Create a column review_overall with values from sampleMeans.
# sampleMeans is a list of (beer_beerid, mean) pairs; turning it into a dict
# gives one direct lookup per beer instead of a scan over the whole list.
meanByBeerID = dict(sampleMeans)

# One value per row of newBeerDF, in row order. A beer id missing from
# sampleMeans raises a KeyError rather than silently shifting the column.
review_overall = [meanByBeerID[beerIndex] for beerIndex in newBeerDF.index.tolist()]

# add the column review_overall values from sampleMeans list
newBeerDF['review_overall'] = review_overall

### Print the top ten beers in the list - General Recommendations

In [None]:
# Rank the beers from the highest to the lowest mean overall review,
# then display the ten leaders of that ranking
newBeerDF = newBeerDF.sort_values('review_overall', ascending=False)
newBeerDF.iloc[:10]

## Feature importance

### Find out which features are important in determining the overall quality of the beer

In the reviews data set, we have four features listed for each beer: Appearance (or body), Aroma, Palate and Taste

Here we will try to correlate the overall rating for a given beer with each of the four features and then decide which of the above four features is the most important in determining the overall quality of the beer.

Since there are many users (reviewers) for each beer and the ratings are spread over a range of values, we will use the sample means of the ratings for each beer as representative values when correlating the features with the overall rating. 

We can use two approaches: 
1. Compare the standardized $R^2$ values from the four features and rank them
2. Starting with a model with n-1 features, compare the $\Delta$ in $R^2$ when the left-out feature is added, and then choose the feature that results in the greatest $\Delta$

#### Data Cleaning 
We will use the same procedure for cleaning the data and choosing sample means to build the data and feature matrices. 

In [None]:
# Features and Data Matrix
# Columns needed for the feature study: beer id, reviewer, the four feature
# ratings and the overall rating.
featureColumns = ["beer_beerid", "review_profilename",
                  'review_appearance', 'review_aroma',
                  'review_palate', 'review_taste', 'review_overall']

# One review per (beer, reviewer) pair; then remove every beer that failed the
# 95% confidence screening (discardBeerIDs comes from the cells above) and
# turn the beer id back into an ordinary column.
featureDF = (
    beerData[featureColumns]
    .drop_duplicates(["beer_beerid", "review_profilename"])
    .set_index("beer_beerid")
    .drop(discardBeerIDs)
    .reset_index()
)

# Ascending list of the beer ids that pass the screening criterion.
beerIDList = sorted(featureDF.beer_beerid.unique())

# Index by (beer, reviewer) so that level 0 can be used to group by beer.
featureDF = featureDF.set_index(["beer_beerid", "review_profilename"])

In [None]:
# Per-beer sample means of every review column:
# {column name: {beer_beerid: mean rating}}
featuresDict = featureDF.groupby(level=0).mean().to_dict()

#Appearance
appearanceSampleMeans = featuresDict['review_appearance']
#Aroma
aromaSampleMeans = featuresDict['review_aroma']
#Palate
palateSampleMeans = featuresDict['review_palate']
#Taste
tasteSampleMeans = featuresDict['review_taste']
#Overall review for Data Matrix
reviewSampleMeans = featuresDict['review_overall']

# Column order of the feature matrix (after the leading beer id column)
featuresMeansDicts = [appearanceSampleMeans,aromaSampleMeans,
                      palateSampleMeans,tasteSampleMeans]

nBeers = len(beerIDList)

# Feature matrix: one row per beer -> [beerID, appearance, aroma, palate, taste]
featureMatrix = np.zeros((nBeers, 5))
featureMatrix[:, 0] = beerIDList
for featureIndex, featureDict in enumerate(featuresMeansDicts, start=1):
    # Direct dict lookup per beer instead of scanning all keys for each beer.
    # A beer without an entry keeps the 0.0 the matrix was initialised with.
    featureMatrix[:, featureIndex] = [featureDict.get(beerID, 0.0)
                                      for beerID in beerIDList]

# Data matrix: one row per beer -> [beerID, mean overall review]
dataMatrix = np.zeros((nBeers, 2))
dataMatrix[:, 0] = beerIDList
dataMatrix[:, 1] = [reviewSampleMeans.get(beerID, 0.0) for beerID in beerIDList]

In [None]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
fig = plt.figure(figsize=(10,10))

# Columns 1..4 of featureMatrix hold the feature means (column 0 is the beer id).
# Standardize the feature matrix: subtract mean and make it unit variance. $X = \frac{X-\mu}{\sigma}$
X = StandardScaler().fit_transform(featureMatrix[:, 1:])
# Column 1 of dataMatrix holds the mean overall review of each beer
y = dataMatrix[:, 1].copy()

# Transpose to index individual columns: X[i] is now the i-th feature for all
# beers. NOTE: X stays transposed after this cell.
X = X.T

# Feature names in the column order of the feature matrix
featureNames = ('Appearance', 'Aroma', 'Palate', 'Taste')

for index, featureName in enumerate(featureNames):
    # scikit-learn expects a 2-D (n_samples, 1) design matrix
    X_feature = X[index][:, np.newaxis]
    regressor = LinearRegression()
    regressor.fit(X_feature, y)
    score = regressor.score(X_feature, y)

    # End points of the fitted line over the observed feature range
    xEnds = np.array([X_feature.min(), X_feature.max()])
    yEnds = regressor.predict(xEnds[:, np.newaxis])

    ax = fig.add_subplot(2, 2, index+1)
    ax.plot(xEnds, yEnds, linewidth=3.0, color='k')
    ax.plot(X[index], y, 'ro')

    # The feature is on the x axis and the overall review on the y axis, so the
    # feature name labels x (it was previously attached to the y axis).
    ax.set_xlabel(featureName + ' (standardized)')
    ax.set_ylabel('Overall review')
    ax.text(0, 2, '$R^2$ = %.2f' % score)


In [None]:
# Feature names keyed by their column position in the feature matrix
features = {0:"Appearance",1:"Aroma",2:"Palate",3:"Taste"}

# X was left transposed (one row per feature) by the previous cell, so X.T is
# the (n_beers, n_features) design matrix.
X_all = X.T

# R^2 of the model with all four features. It does not depend on which feature
# is left out, so it is fitted once instead of once per loop iteration.
regressor = LinearRegression()
regressor.fit(X_all, y)
scoreTwo = regressor.score(X_all, y)

deltaR2 = []
for index in range(4):
    # Model with every feature except the one at `index`
    X_feature = np.delete(X_all, index, axis=1)
    regressor = LinearRegression()
    regressor.fit(X_feature, y)
    scoreOne = regressor.score(X_feature, y)
    # Gain in R^2 obtained by adding the left-out feature back
    deltaR2.append(scoreTwo - scoreOne)
    print("Change in $R^2$ value upon including ", features[index]," = ", "%.3f"%deltaR2[index])
    
    

In [None]:
# Bar chart of the R^2 gain contributed by each feature
fig = plt.figure(figsize=(6,2))
ax = fig.add_subplot(111)
width = 0.5
barPositions = list(features.keys())
ax.bar(barPositions, deltaR2, width, color='r')
# Bars are centre-aligned on their x position by default, so the ticks go on
# the positions themselves (an extra width/2 offset pushes the labels to the
# right edge of each bar).
ax.set_xticks(barPositions)
ax.set_xticklabels(('Appearance', 'Aroma', 'Palate', 'Taste'))
ax.set_xlabel("Beer Charateristics")
# Raw strings: "\D" and "\p" are not valid Python escapes and trigger
# invalid-escape warnings in regular string literals.
ax.set_ylabel(r"$\Delta R^2$")
ax.set_title(r"Feature Importance ($ \propto \Delta R^2$)")