## Important characteristics/features that determine the overall rating

In this notebook, we plot the correlation between each review feature and the overall rating. Using simple leave-one-out cross-validation, we then determine which of the four features (appearance, aroma, palate, taste) contributes most to the overall rating.

This is an extension to the previous notebook. Some of the code is repeated at the top to build from scratch.

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [3]:
# Load the raw beer reviews CSV into a DataFrame; every later cell derives
# its frames from beerData.
beerReviewsPath = '/Users/phani/Downloads/beer_reviews/beer_reviews.csv'
beerData = pd.read_csv(
    beerReviewsPath,
    delimiter=",",
    encoding='utf-8',
)
# To list the available columns while exploring:
#for i in range(len(beerData.columns)):
#    print("Column",i, ": ", beerData.columns[i])

In [4]:
# Same logic as in the previous notebook: keep only the beers whose overall
# rating has enough reviews for the sample mean to be trusted.
samplesDF = beerData[["beer_beerid","beer_name","review_overall", "review_profilename"]]
# drop duplicate reviews for the same beer
samplesDF = samplesDF.drop_duplicates(["beer_beerid","review_profilename"])
samplesDF = samplesDF.set_index(["beer_beerid","beer_name"])

# Per-beer review counts, keyed by column name and then by beer ID.
nSamples = samplesDF.groupby(level=0).count().to_dict()
# Mean and standard deviation are computed on the rating column only.
# review_profilename is presumably a non-numeric column; averaging such a
# column raises TypeError on pandas >= 2.0 (older versions dropped it silently).
overallByBeer = samplesDF[["review_overall"]].groupby(level=0)
sampleMeans = overallByBeer.mean().to_dict()
sampleStdDev = overallByBeer.std()

# Define Margin of Error and Z-score for 95% confidence interval
mError = 0.1
zScore = 1.96

# Keep a beer only if its review count exceeds the minimum sample size
#     n_required = (stdDev * zScore / mError) ** 2
# Beers whose standard deviation is 0.0 or NaN are rejected outright, so the
# threshold is never carried over from a previously visited beer.
overallCounts = nSamples["review_overall"]
overallMeans = sampleMeans["review_overall"]
overallStdDev = sampleStdDev["review_overall"].to_dict()

sampleMeansTemp = {}
for beerID, nReviews in overallCounts.items():
    stdDev = overallStdDev[beerID]
    if not stdDev > 0:  # also False for NaN
        continue
    nSamplesRequired = (stdDev * zScore / mError) ** 2
    if nReviews > nSamplesRequired:
        sampleMeansTemp[beerID] = overallMeans[beerID]

# redefine sampleMeans as a list of (beerID, mean overall review) pairs,
# sorted from the highest mean to the lowest
sampleMeans = sorted(sampleMeansTemp.items(), key=lambda x: x[1] , reverse=True)

# Work out which beer IDs must be discarded: every beer present in the raw
# data whose ID is not among the (beerID, mean) pairs in sampleMeans.
# Appending kept rows one by one to a new data frame takes a lot of time, so
# the rows to remove are identified by beer ID and dropped instead.

reviewBeerIDs = [beerKey[0] for beerKey in sampleMeans]
# one row per beer ID in the raw data
samplesDF = beerData.drop_duplicates(["beer_beerid"])
beerIDsAll = samplesDF.beer_beerid.tolist()

# list the IDs that we need to discard; the membership test runs against a set
# so the comprehension stays linear in the number of beers
keepBeerIDs = set(reviewBeerIDs)
discardBeerIDs = [beerID for beerID in beerIDsAll if beerID not in keepBeerIDs]


# Build the per-review feature frame: one row per (beer, reviewer) pair,
# restricted to the beers that are not listed in discardBeerIDs.
reviewKeyColumns = ["beer_beerid", "review_profilename"]
ratingColumns = ['review_appearance', 'review_aroma',
                 'review_palate', 'review_taste', 'review_overall']
featureDF = (
    beerData[reviewKeyColumns + ratingColumns]
    .drop_duplicates(reviewKeyColumns)
    .set_index("beer_beerid")
    .drop(discardBeerIDs)
    .reset_index()
)

# Lookup lists whose positions match the rows of the data matrices
profileList = featureDF["review_profilename"].unique()
beerIDList = sorted(featureDF["beer_beerid"].unique())

# Index by beer ID and reviewer so that grouping on the first index level
# yields one mean per rating column for every beer.
featureDF = featureDF.set_index(reviewKeyColumns)
featuresDict = featureDF.groupby(level=0).mean().to_dict()

# Per-feature dictionaries mapping beer ID -> sample mean.
# Note: sampleMeans is rebound here to a dict.
appearanceSampleMeans = featuresDict['review_appearance']
aromaSampleMeans = featuresDict['review_aroma']
palateSampleMeans = featuresDict['review_palate']
tasteSampleMeans = featuresDict['review_taste']
sampleMeans = featuresDict['review_overall']

# dataMatrix:    one row per beer -> [beerID, mean overall review]
# featureMatrix: one row per beer -> [beerID, appearance, aroma, palate, taste]
# Rows follow the order of beerIDList.
nBeers = len(beerIDList)
dataMatrix = np.zeros((nBeers, 2))
featureMatrix = np.zeros((nBeers, 5))
featuresMeansDicts = [appearanceSampleMeans,aromaSampleMeans,
                      palateSampleMeans,tasteSampleMeans]

for beerIndex, beerID in enumerate(beerIDList):
    dataMatrix[beerIndex, 0] = beerID
    featureMatrix[beerIndex, 0] = beerID
    # Direct dictionary lookups replace a scan over every key; a beer that
    # is missing from a dictionary keeps the 0.0 the matrix was created with.
    dataMatrix[beerIndex, 1] = sampleMeans.get(beerID, 0.0)
    # column 0 holds the beer ID, so the feature columns start at 1
    for featureIndex, featureDict in enumerate(featuresMeansDicts, start=1):
        featureMatrix[beerIndex, featureIndex] = featureDict.get(beerID, 0.0)

# Keep only the beers with a positive mean overall review (NaN and 0.0 are
# excluded). X holds the four feature means, y the overall mean.
ratedRows = dataMatrix[:, 1] > 0
X = featureMatrix[ratedRows, 1:]
y = dataMatrix[ratedRows, 1]

