In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt 





# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A quick EDA of a Scotch Whisky Dataset.


I wanted to clean up this dataset in a way that lets me find good-value whiskies, especially from the distilleries and regions I prefer.

Let's start with a quick look at the data.

In [None]:
# Read the review CSV from the Kaggle input directory, then show the frame's
# (rows, columns) shape as the cell output.
reviews_csv = '../input/22000-scotch-whisky-reviews/scotch_review.csv'
scotch_data = pd.read_csv(reviews_csv)
scotch_data.shape

In [None]:
# List the column labels of the loaded frame.
scotch_data.columns


Ultimately I would like to find Scotch with good score to price ratios.

In [None]:
# Preview the first rows; the observation about the currency column that
# follows is based on what this preview shows.
scotch_data.head(10)

It appears as though every entry in the "currency" column is in dollars; this should be easy to confirm.

In [None]:
# Count how often each distinct value occurs in the 'currency' column.
scotch_data['currency'].value_counts()

All 2247 rows contain '$', so the column carries no information and we can drop it.

In [None]:
# 'currency' holds the same value in every row, so remove it and preview the result.
scotch_data = scotch_data.drop(columns=['currency'])
scotch_data.head()


It appears as though we have a redundant built-in index stored in an unnamed column; we can drop this too.

In [None]:
# 'Unnamed: 0' is a leftover copy of the row index; remove it.
scotch_data = scotch_data.drop(columns=['Unnamed: 0'])


I think ABV should be its own column. It appears that most of the name entries end with the ABV%; let's see if this is consistent throughout the dataset.


In [None]:
# Tally how many names do and do not end with a percent sign.
ends_with_percent = scotch_data['name'].str.endswith('%')
print(ends_with_percent.value_counts())


Our data contains 2247 rows but only 2186 end with %. Let's find those which don't before we try to separate out the ABV.

In [None]:
# Collect the names that do NOT end with '%' and print their row labels.
name_ends_with_pct = scotch_data['name'].str.endswith('%')
problem_locations = scotch_data['name'][name_ends_with_pct.eq(False)]
problems = problem_locations.index.to_numpy()
print(problems)


In [None]:
# Widen the column display so long names are not cut off, then show the
# names that break the "ends with %" pattern.
pd.set_option('display.max_colwidth', 150)
display(problem_locations)

It would have been nice to have some sort of uniformity in the name syntax, but we can make this work.

In [None]:
# We already know the percentages are ABV, and a dedicated column is created
# for them anyway, so the literal "ABV" label is removed from the names.
# Series.str methods return a NEW Series; the result must be assigned back,
# otherwise the names stay exactly as they were.
scotch_data['name'] = (
    scotch_data['name']
    .str.replace("ABV", "", regex=False)  # plain-text match, no regex needed
    .str.rstrip(" ")                      # whitespace left over from removing "ABV"
)
scotch_data['name']


The column is now looking slightly more uniform. We can remove the ABVs from the name and move them into their own column, where they belong.

Note: For anyone wondering why ABV is relevant, I generally prefer bottlings to be ≥ 43% to allow for the addition of water to open them up. Additionally, cask-strength bottlings, like those with ABVs in the 50s or 60s, are usually watered down to a more palatable ABV, which means more pours from the same-sized bottle.

In [None]:
# Split every name once at its LAST comma: the text before it is the bottling
# name, the text after it is the ABV.
name_parts = scotch_data['name'].str.rpartition(',')
has_comma = name_parts[1] == ','

# When a name has no comma, rpartition returns ('', '', <whole name>). For
# those rows keep the original name and leave ABV missing, rather than
# blanking the name and storing the whole name as the ABV.
scotch_data['ABV'] = name_parts[2].str.strip().where(has_comma)
scotch_data['name'] = name_parts[0].where(has_comma, scotch_data['name'])

scotch_data.head()

The rpartition function handled the ABV problem beautifully. The dataset is looking cleaner, but there are still some ugly column names like review.point, we can easily clean these up.

In [None]:
# Move the review points into a column called 'Score': pop() removes
# 'review.point' and the assignment appends its values as the last column.
scotch_data['Score'] = scotch_data.pop('review.point')


In [None]:
# Capitalize the remaining lower-case column labels.
capitalized_labels = {
    label: label.capitalize()
    for label in ('name', 'category', 'price', 'description')
}
scotch_data = scotch_data.rename(columns=capitalized_labels)

At this point I downloaded the cleaned dataset into an Excel file to give it a quick look over and see if I spotted any inconsistencies or major discrepancies. I noticed a few of the very expensive bottlings had strange formatting, so I fixed them via their indices; more information is provided below.

This clearly would not be a viable option in a larger dataset, but it took me only a minute to look over this particular set. 

In [None]:
# A few very expensive bottlings had oddly formatted prices; overwrite them by
# row label. Prices are still strings at this point.
manual_fix_rows = [19, 95, 410, 1000, 1215]
scotch_data.loc[manual_fix_rows, 'Price'] = '15000'

# Row 576 was listed at 44/liter; a standard bottle is 750 ml, so 44 * 0.75 gives $33.
per_liter_row = 576
scotch_data.loc[per_liter_row, 'Price'] = '33'



When I tried converting the prices into integers, I ran into trouble with a few very expensive scotches which had prices written with commas. I considered using pd.to_numeric with errors= 'coerce', but that would mean either manually re-entering the prices or removing those scotches from the list, which I don't want to do at this time (Even though I assume they will be major outliers in the future). Instead, we can just remove the commas from all entries. 

I need to create a list of the indexes of these problematic prices.

In [None]:
# Select the rows whose price text contains "/set" and print their row labels.
is_set_price = scotch_data['Price'].str.contains("/set")
messed_up_entries = scotch_data[is_set_price]
print(messed_up_entries.index)

In [None]:
# Remove the thousands separators from the price text, then convert to float.
price_without_commas = scotch_data['Price'].str.replace(',', '')
scotch_data['Price'] = price_without_commas.astype(float)


In [None]:
# Series.round returns a NEW Series; assign it back so the whole-dollar
# rounding is actually stored in the frame.
scotch_data['Price'] = scotch_data['Price'].round(decimals=0)
scotch_data.head()

In [None]:

# Apply the seaborn theme and font scale BEFORE the figure exists: the axes
# take their styling from the settings in force when they are created, so
# setting it afterwards leaves the first figure of a fresh kernel unstyled.
sns.set(font_scale=2)

# One swarm of review scores per whisky category.
fig, ax = plt.subplots(figsize=(25, 15))
ax.set_title('Score distribution for different types of Whisky', fontdict={'fontsize': 40})
sns.swarmplot(x='Category', y='Score', data=scotch_data, ax=ax);



The vast majority of our data consists of Single Malts, which are bottlings that come from one particular distillery. We do, however, have enough data to see that all types of whisky have a somewhat similar distribution, with a slight exception for Single Grain Whisky, which lacks any bottling that received a score greater than 95.

We can take a look to see if there is any clear relationship between Score and Price, which seems like a reasonable assumption to make. 

In [None]:
# Score against price, coloured by category; the x axis is capped at 150 to
# keep the very expensive bottlings from flattening the plot.
fig, ax = plt.subplots(figsize=(25, 15))
ax.set_title('Score vs Price', fontdict={'fontsize': 40})
sns.set(font_scale=2)
b = sns.scatterplot(x='Price', y='Score', data=scotch_data, hue='Category', s=75, ax=ax)
b.set(xlim=(0, 150))

I limited the X axis here due to some extremely high priced Whiskys, which were clear outliers; likely a rare old bottling sold at auction and not a bottle which is available in stores. 

From this data there really doesn't seem to be much of a relationship between Price and Score. If we look at the cheaply priced Whiskys we can see that it's dominated mostly by blends, which is not surprising. At all prices there are bottles with scores from around 80 to 85, but there seems to be an increase in quality initially at around \\$30, at which point we start to see an increase in scores between 85 and 90 points, and again at a \\$50 price point where we start to see multiple scores greater than 90. 

As I had expected, there isn't much of a relationship between price and score, beyond a particular price point.

I'm interested in taking a look at just the entries from Islay, as this is my preferred region due to the smoky profile of the whiskies distilled there. The data isn't organized in a way that lets me select by region, so I'll need to get a bit creative.

In [None]:
# A list of all of the Islay distilleries. Although Port Charlotte and Port
# Ellen are not currently producing, there may be an old bottle or two
# kicking around.
# NOTE(review): 'Kilchomin' looks like a misspelling of 'Kilchoman'. Correcting
# it changes which rows match, so the distillery-labelling step that checks
# for the same spelling must be updated at the same time.

non_islay = []  # not referenced by any visible cell

islay_dist = ['Ardbeg', 'Bowmore', 'Bruichladdich', 'Bunnahabhain', 'Caol Ila', 'Kilchomin', 'Lagavulin', 'Laphroaig', 'Port Charlotte', 'Port Ellen']

# Build the search pattern from the list so the names live in one place
# (none of them contains a regex metacharacter). Take a copy so columns added
# to df_filtered later are not written into a slice of scotch_data.
islay_pattern = '|'.join(islay_dist)
df_filtered = scotch_data[scotch_data['Name'].str.contains(islay_pattern)].copy()

df_filtered.sample(10)

In [None]:
"""Creating a new column to measure a Score to Price ratio."""

# Review points per dollar, rounded to four decimals, for the full dataset.
scotch_data['Score to Price'] = (scotch_data['Score'] / scotch_data['Price']).round(4)

# Same ratio for the Islay subset. assign() returns a new frame, so the column
# is never written into a slice of another frame.
islay_ratio = (df_filtered['Score'] / df_filtered['Price']).round(4)
df_filtered = df_filtered.assign(**{'Score to Price': islay_ratio})

# I decided I was unwilling to spend more than $300 on a bottle, so the more
# expensive bottlings are filtered out.
max_price = 300
df_filtered = df_filtered[df_filtered['Price'] <= max_price].copy()
df_filtered.sample(10)

At this point, I want to graph my filtered dataframe, which contains Islay whiskies only. I dropped the outliers (the collector bottles over \\$10,000) so that we can look at bottles within a reasonable price range: \\$300 and below.

In [None]:
"""We are going to create a new column by iterating over the Name column to find Distilleries."""

# Label each bottling with the first distillery, in `islay_dist` order, whose
# name occurs in its Name (the same priority an if/elif chain over that list
# would give). One vectorised containment test per distillery replaces the
# row-by-row loop.
bottle_names = df_filtered['Name']
name_matches = [
    bottle_names.str.contains(distillery, regex=False, na=False).to_numpy()
    for distillery in islay_dist
]

# np.select picks the first matching distillery per row; a row matching none
# is labelled 'Unknown' instead of causing a length mismatch on assignment.
distillery_labels = np.select(name_matches, islay_dist, default='Unknown')

# assign() returns a new frame, so the column is not written into a slice.
df_filtered = df_filtered.assign(Distillery=distillery_labels)

df_filtered.sample(10)


In [None]:


# Score against price for the Islay bottlings, coloured by distillery; the
# x axis is capped at 150.
fig, ax = plt.subplots(figsize=(25, 15))
ax.set_title('Score vs Price', fontdict={'fontsize': 40})
sns.set(font_scale=2)
g = sns.scatterplot(x='Price', y='Score', data=df_filtered, hue='Distillery', s=125, ax=ax)
g.set(xlim=(0, 150))


One last quick look at the top 21 whiskies from Islay with a Score to Price ratio of 1.5 or greater.

In [None]:
# The text promises ratios of "1.5 or greater", so the bound is inclusive.
# Sorting from best to worst value makes "top 21" well defined instead of
# depending on the original row order.
min_ratio = 1.5
df_top = (
    df_filtered[df_filtered['Score to Price'] >= min_ratio]
    .sort_values('Score to Price', ascending=False)
)
df_top.head(21)

Ultimately I am happy with this cleaned up data as it will help me find the best value Whiskys by distillery whenever I am in the market for a new bottle.

I would certainly appreciate constructive comments and insights, as I am relatively new to this sort of work and looking to start a career in data science.