## Importing the early dependencies and data

In [1]:
import pandas as pd
import numpy as np

In [None]:
# %load get_data.py
def get_data(path="/home/grimoire/Projects/BeerRatings/rating_update.csv"):
    """Load the beer-ratings CSV into a DataFrame.

    The default file, ``rating_update.csv``, is the original data
    ``beer_review.csv`` with ``beer_style`` updated to match the styles that
    are given in ``beer_description.csv``.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV to read. Defaults to the original hard-coded
        location so existing ``get_data()`` calls keep working; pass another
        path to run the notebook from a different machine or directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    # Local import keeps the definition self-contained when loaded on its own.
    import pandas as pd

    # read_csv already returns a DataFrame, so no extra DataFrame() wrap is needed.
    return pd.read_csv(path)


In [None]:
# Load the ratings table; the cells below inspect and clean this frame.
beer_ratings = get_data()

# The purpose of this notebook will be to:
* Verify all values fall into appropriate ranges
* Find values that may cause issues
* Ameliorate or remove bad values if necessary

# The hunt for bad review_profilename

In [None]:
# Per-column dtype and non-null count, to spot columns with missing values.
beer_ratings.info()

The RangeIndex gives us a total of 1,586,614 observations. We have 1,586,266 non-null object observations in review_profilename, so 348 profile names are missing.

In [None]:
# Distinct profile names, materialised as a plain Python list.
profilename_array = list(beer_ratings.review_profilename.unique())

Building an array of the unique/distinct profile names

In [None]:
# Does any entry compare equal to the literal string 'nan'?
print(any(name == 'nan' for name in profilename_array))

Checking for 'nan' values in the array

In [None]:
# Same check, but with every entry cast to str before comparing.
print(any(str(name) == 'nan' for name in profilename_array))

* Profile names should be type str, so we map the array so that all values are strings.
* After doing this we recheck for 'nan' and find that there are nan values.
In Python, a missing value (nan) is a float, not a string. The first check compared each name against the literal string 'nan', which requires an exact str match, so the float nan was not detected.

Checking for these values as an array can save time before doing a large conversion over a dataframe.

In [None]:
# Cast the whole column to str, then preview rows whose name is now the text 'nan'.
beer_ratings['review_profilename'] = beer_ratings.review_profilename.astype('str')
print(beer_ratings.loc[beer_ratings.review_profilename == 'nan'].head(5))

In [None]:
# Row labels of every observation whose profile name is the text 'nan'.
bad_ProfileIndices = beer_ratings.index[beer_ratings['review_profilename'].eq('nan').to_numpy()]

Collect the indices of the 'nan' entries in review_profilename

In [None]:
# Relabel all flagged rows in one vectorised assignment instead of issuing
# a separate .loc write for every row label.
beer_ratings.loc[bad_ProfileIndices, 'review_profilename'] = 'OTHER'

Set the values to 'OTHER'. In instances where we're not using the profile names, we still have the rest of the data. In cases where we are going to use profile names, we can find the old 'nan' values under 'OTHER'.

In [None]:
# Preview the rows that now carry the 'OTHER' placeholder.
print(beer_ratings.loc[beer_ratings.review_profilename == 'OTHER'].head(5))

In [None]:
# Re-inspect dtypes and non-null counts after the replacement.
beer_ratings.info()

All of the review_profilename anomalies have been ameliorated. On to the next section...

# The hunt for beer_abv null (bad) values

In [None]:
# Non-null counts again, this time to see how many beer_abv values are present.
beer_ratings.info()

In [None]:
# Distinct beer_abv values present in the data.
beer_ratings.beer_abv.unique()

In [None]:
# Work on a copy so the changes made to beer2 below do not touch beer_ratings.
beer2 = beer_ratings.copy()

In [None]:
# Summary of the working copy before any columns are added.
beer2.info()

In [None]:
# Text form of beer_abv, so missing values can be matched as the string 'nan'.
beer2['abv'] = beer2['beer_abv'].to_numpy().astype('str')

In [None]:
# Summary again, now including the new abv column.
beer2.info()

In [None]:
# Number of observations whose ABV is missing.
len(beer2.loc[beer2.abv == 'nan', 'beer_name'])

In [None]:
# Number of distinct beer names among the observations whose ABV is missing.
len(beer2.loc[beer2.abv == 'nan', 'beer_name'].unique())

There are a total of 67,785 observations in the beer_abv column that have 'nan' values.
These 67,785 observations cover 14,110 distinct beer names with 'nan' values in beer_abv.

In [None]:
# Largest known ABV. Series.max() is vectorised, whereas the builtin max()
# walks the whole column one element at a time in Python.
beer2.loc[beer2.abv != 'nan', 'beer_abv'].max()

In [None]:
# Mean of the known ABV values, leaving out observations equal to 57.7.
# A row mask on beer_abv replaces the earlier comparison of the whole frame
# (text columns included) against 57.7, which only produced the right answer
# through DataFrame.where() and NaN-skipping.
beer2.loc[(beer2.abv != 'nan') & (beer2.beer_abv != 57.7), 'beer_abv'].mean()

In [None]:
# Mean of all known ABV values, for comparison.
beer2.loc[beer2.abv != 'nan', 'beer_abv'].mean()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.hist(beer2[beer2.abv !='nan'].beer_abv, bins = 100)
plt.show()

In [None]:
# Column labels of the working copy.
beer2.keys()

In [None]:
# Keep only the beer name and the text form of its ABV.
beer1 = beer2.loc[:,['beer_name', 'abv']]

In [None]:
# Placeholder column so the groupby/count that follows has a column to count.
beer1['count'] = 0

In [None]:
# Number of observations per (beer_name, abv) pair. abv is the text column,
# so observations with a missing ABV are grouped under the string 'nan'.
# The groupby `axis` keyword is deprecated in recent pandas and rows are the
# default, so it is omitted.
beer1 = beer1.groupby(['beer_name', 'abv']).count()

In [None]:
# Column labels of the original frame.
beer_ratings.keys()

In [None]:
# Keep only the observations with a known ABV.
beer2 = beer2.loc[beer2.abv != 'nan']

In [None]:
# Number of observations whose abv text is not 'nan'.
len(beer2.loc[beer2.abv != 'nan', 'beer_abv'])

In [None]:
# Summary of the filtered working copy.
beer2.info()

In [None]:
# Keep observations below 15% ABV. The cutoff is arbitrary: it is roughly the
# point where a beer is basically carbonated malt wine.
beer2 = beer2.loc[beer2.beer_abv.lt(15)]

In [None]:
import matplotlib.pyplot as plt

# Distribution of ABV in the current beer2 frame.
# Title and axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(beer2.beer_abv.values, bins=20)
ax.set(title='Distribution of beer_abv (filtered)',
       xlabel='beer_abv',
       ylabel='Number of observations')
plt.show()