# This notebook is the second part of the data cleaning, focusing on the bad beer_abv values

## Import dependencies

In [1]:
import pandas as pd
import numpy as np

# Loading the data

In [2]:
# %load get_data.py
def get_data(path="/home/grimoire/Projects/BeerRatings/rating_update.csv"):
    """Load the beer ratings table from a CSV file.

    The default file, 'rating_update.csv', is the original 'beer_review.csv'
    with bad brewery and profile names updated, and with beer_style changed
    to match the styles that are given in 'beer_description.csv'.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the ratings CSV. Defaults to the original hard-coded
        location, so existing ``get_data()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    # Imported locally so the function does not rely on a module-level
    # pandas import being present wherever it is loaded.
    import pandas as pd

    # read_csv already returns a DataFrame; no extra wrapping is needed.
    return pd.read_csv(path)


In [3]:
# Ratings table that this notebook cleans.
beer_ratings = get_data()

# Per-style reference table; read_csv already yields a DataFrame.
descriptions = pd.read_csv('/home/grimoire/Projects/BeerRatings/beer_description.csv')

# The hunt for beer_abv null (bad) values

In [4]:
# Column overview: the non-null counts show which columns have missing values.
beer_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
brewery_id            1586614 non-null int64
brewery_name          1586614 non-null object
review_time           1586614 non-null int64
review_overall        1586614 non-null float64
review_aroma          1586614 non-null float64
review_appearance     1586614 non-null float64
review_profilename    1586614 non-null object
beer_style            1586614 non-null object
review_palate         1586614 non-null float64
review_taste          1586614 non-null float64
beer_name             1586614 non-null object
beer_abv              1518829 non-null float64
beer_beerid           1586614 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


## Converting beer_abv to strings, same as in the first Data_cleaning notebook
* This is done so the missing values can be found by comparing against the string 'nan'

In [5]:
# Stringify ABV through NumPy so that missing entries become the text 'nan'.
beer_ratings['beer_abv'] = beer_ratings['beer_abv'].to_numpy().astype(str)

## A look at how many bad observations we have, 
## and how many unique bad beer entries there are

In [6]:
# Filter once: every review whose ABV is the text 'nan'.
nan_abv_rows = beer_ratings.loc[beer_ratings['beer_abv'] == 'nan']

# Number of affected reviews, then number of distinct beer names among them.
print(nan_abv_rows.shape[0])
print(nan_abv_rows['beer_name'].unique().size)

67785
14110


There are a total of 67,785 observations in the beer_abv column that have 'nan' values.

Of these 67,785 observations, there are 14,110 distinct beer names with 'nan' values in beer_abv.

## To correct this, we're going to find the average ABV per style (the midpoint of each style's low and high ABV)

In [7]:
# Midpoint of each style's ABV range, used as that style's mean ABV.
descriptions['style_mean_ABV'] = descriptions['abv_low'].add(descriptions['abv_high']).div(2)

In [8]:
# Preview the first rows to confirm the new style_mean_ABV column.
descriptions.head()

Unnamed: 0,style,description,abv_low,abv_high,ibu_low,ibu_high,style_mean_ABV
0,German Bock,Bock is a bottom fermenting lager that general...,6.3,7.6,20,30,6.95
1,German Doppelbock,"“Doppel” meaning “double,” this style is a big...",6.6,7.9,17,27,7.25
2,German Eisbock,Eisbock is an extremely strong beer with a typ...,7.0,14.0,25,35,10.5
3,German Maibock,Also called “Heller Bock” (meaning “Pale Bock”...,6.3,8.1,20,38,7.2
4,German Weizenbock,The German-style Weizenbock is a wheat version...,7.0,9.5,15,35,8.25


# We will fill the 'nan' values with the mean ABV of each beer's style
* This takes A LONG time using this method

In [9]:
# Rows needing a fill: ABV is the text 'nan' (after the string conversion
# earlier in the notebook) or an actual missing value.
abv_missing = (beer_ratings['beer_abv'] == 'nan') | beer_ratings['beer_abv'].isna()

# Build the lookup once: style name -> mean ABV for that style.
# Keeping the first row per style mirrors taking the first matching row
# of `descriptions` for each beer.
style_mean_abv = (
    descriptions
    .drop_duplicates(subset='style')
    .set_index('style')['style_mean_ABV']
)

# Look up every missing row's style in a single vectorized pass instead of
# scanning `descriptions` once per row.
missing_styles = beer_ratings.loc[abv_missing, 'beer_style']
fill_values = missing_styles.map(style_mean_abv)

# Fail loudly, before anything is modified, if a style has no usable mean ABV.
unmatched_styles = missing_styles[fill_values.isna()].unique()
if len(unmatched_styles) > 0:
    raise ValueError(
        "No style_mean_ABV available for beer styles: "
        + ", ".join(map(str, unmatched_styles))
    )

# Columns are addressed by name rather than by position, and the values are
# assigned positionally within the selected rows, so this does not depend on
# the frame having a default integer index.
beer_ratings.loc[abv_missing, 'beer_abv'] = fill_values.to_numpy()

# Checking to see if we have any remaining 'nan' values

In [10]:
# Any rows still holding the text 'nan'? An empty Series means none remain.
beer_ratings.loc[beer_ratings['beer_abv'] == 'nan', 'beer_abv']

Series([], Name: beer_abv, dtype: object)

# Ensuring the values are back as float type numbers (Decimals)

In [29]:
# Turn the text/filled ABV values back into 64-bit floats.
beer_ratings['beer_abv'] = beer_ratings['beer_abv'].astype('float64')

# Final verification of data

In [30]:
# beer_abv should now be float64 with a non-null count equal to the row count.
beer_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
brewery_id            1586614 non-null int64
brewery_name          1586614 non-null object
review_time           1586614 non-null int64
review_overall        1586614 non-null float64
review_aroma          1586614 non-null float64
review_appearance     1586614 non-null float64
review_profilename    1586614 non-null object
beer_style            1586614 non-null object
review_palate         1586614 non-null float64
review_taste          1586614 non-null float64
beer_name             1586614 non-null object
beer_abv              1586614 non-null float64
beer_beerid           1586614 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


# Saving the data

In [31]:
# NOTE(review): this writes to the same path that get_data() reads from, so the
# input file is overwritten with the cleaned data — confirm that is intended.
beer_ratings.to_csv("/home/grimoire/Projects/BeerRatings/rating_update.csv", index=False)