# Data Analysis - Cocktails [APPENDIX]

Thomas BORDES, Damien WILLETT.

## Appendix: 'cocktails' dataset

In [None]:
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import linalg
from scipy.stats import norm
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.manifold import MDS
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

sns.set_palette(sns.color_palette('hls'))

: 

In [2]:
# Raw TidyTuesday (2020-05-26) cocktails table, read straight from GitHub.
COCKTAILS_CSV_URL = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/cocktails.csv'
cocktails = pd.read_csv(COCKTAILS_CSV_URL)

In [None]:
# Dimensions of the freshly loaded table, as (rows, columns).
cocktails.shape

In [None]:
# Number of distinct ingredients in the raw table.
# `nunique` is the pandas built-in for this count; unlike `np.unique` it does not
# have to sort the values, so a missing ingredient (NaN mixed with strings) cannot
# make the cell fail - missing values are simply not counted.
cocktails['ingredient'].nunique()

### Data cleaning

Columns that do not provide interesting information are removed.

In [5]:
# Remove the columns that are not used in the analysis.
# The name is re-bound (no in-place mutation) and errors='ignore' is passed so that
# re-running this cell after the columns are already gone does not raise a KeyError.
cocktails = cocktails.drop(columns=['date_modified', 'drink_thumb', 'video', 'iba'], errors='ignore')

We convert the values of the `alcoholic`, `category` and `glass` columns to lowercase.

In [6]:
# Normalise the case of the three label columns.
for label_column in ('alcoholic', 'category', 'glass'):
    cocktails[label_column] = cocktails[label_column].str.lower()

Some measurements are corrected (in particular input errors).

In [7]:
# Exact-match corrections for the `measure` column, as (raw value, corrected value).
# They are applied one after another, so the order matters: '1-2 Shot' and '1-2 shot'
# first become '2 shot', which the entry right after them turns into '2 shots'.
MEASURE_CORRECTIONS = [
    ('1-2 dash', '1.5 dashes'),
    ('2 Dashes', '2 dashes'),

    ('1/2 Shot', '1/2 shot'),
    ('1/8 Shot', '1/8 shot'),
    ('1/4 Shot', '1/4 shot'),
    ('1/2 shot Bacardi', '1/2 shot'),
    ('1 shot Bacardi', '1 shot'),
    ('1 shot Jamaican', '1 shot'),
    ('1 Shot', '1 shot'),
    ('1-2 Shot', '2 shot'),
    ('1-2 shot', '2 shot'),
    ('2 shot', '2 shots'),

    ('dash', '1 dash'),
    ('1 .75 oz', '1.75 oz'),
    ('6 Fresh', '6'),
    ('splash', '1 splash'),
]

for raw_value, corrected_value in MEASURE_CORRECTIONS:
    cocktails.loc[cocktails['measure'] == raw_value, 'measure'] = corrected_value

Some units of measurement are converted to oz.

In [8]:
# Exact-match conversion of dash / tbsp / shot / cl measures to ounces, as
# (measure, equivalent in oz). The table uses 1 dash = .02 oz, 1 tbsp = .5 oz and
# 1 shot = 1.5 oz; the cl values are rounded.
#
# Corrections made to the table:
#   * '1.5 dashes' is now '.03 oz' (1.5 x .02), consistent with the other dash rows.
#   * '1/4 cl' is now '0.08 oz' (0.25 cl is about 0.085 oz); it used to be '0.01 oz'.
#   * the table used to contain '1 cl' twice ('0.33 oz', then '0.5 oz'); the second
#     entry could never match because the first had already rewritten those rows.
#     0.5 oz is about 1.5 cl, so that entry is presumably meant for 1.5 cl
#     - TODO confirm the exact spelling used in the data.
MEASURE_TO_OZ = [
    ('1 dash', '.02 oz'),
    ('1.5 dashes', '.03 oz'),
    ('2 dashes', '.04 oz'),
    ('3 dashes', '.06 oz'),
    ('4 dashes', '.08 oz'),

    ('1 tbsp', '.5 oz'),
    ('2 tbsp', '1 oz'),

    ('1/8 shot', '0.1875 oz'),
    ('1/4 shot', '0.375 oz'),
    ('1/3 shot', '0.5 oz'),
    ('1/2 shot', '0.75 oz'),
    ('3/4 shot', '1.125 oz'),
    ('1 shot', '1.5 oz'),
    ('1 1/2 shot', '2.25 oz'),
    ('1 3/4 shot', '2.625 oz'),
    ('2 shots', '3 oz'),
    ('2 1/2 shots', '3.75 oz'),
    ('3 shots', '4.5 oz'),
    ('4 shots', '6 oz'),
    ('5 shots', '7.5 oz'),

    ('1/4 cl', '0.08 oz'),
    ('1 cl', '0.33 oz'),
    ('1.5 cl', '0.5 oz'),
    ('1 1/2 cl', '0.5 oz'),
    ('2 cl', '0.67 oz'),
    ('3 cl', '1 oz'),
    ('4 cl', '1.35 oz'),
    ('5 cl', '1.69 oz'),
    ('6 cl', '2 oz'),
    ('7 cl', '2.36 oz'),
    ('8 cl', '2.7 oz'),
    ('10 cl', '3.36 oz'),
    ('15 cl', '5 oz'),
    ('16 cl', '5.4 oz'),
]

for measure, measure_in_oz in MEASURE_TO_OZ:
    cocktails.loc[cocktails['measure'] == measure, 'measure'] = measure_in_oz

We replace fractional notation in the measurements with decimal notation.

In [9]:
# Ounce measures written as fractions, mapped to the same quantity in decimal form.
# Only exact matches of the whole `measure` string are rewritten.
FRACTION_TO_DECIMAL_OZ = {
    '1/4 oz': '0.25 oz',
    '1/3 oz': '0.33 oz',
    '1/2 oz': '0.5 oz',
    '3/4 oz': '0.75 oz',
    '1 1/2 oz': '1.5 oz',
    '1 1/4 oz': '1.25 oz',
    '2 1/2 oz': '2.5 oz',
}

for fraction_form, decimal_form in FRACTION_TO_DECIMAL_OZ.items():
    cocktails.loc[cocktails['measure'] == fraction_form, 'measure'] = decimal_form

We create a new column in which we extract the numerical value of the measurement.

In [10]:
# Extract the first number found in each measure string - a decimal (the leading
# digit is optional, e.g. '.5') or else a plain integer - and store it as a float.
# Measures that contain no number end up as NaN.
# The pattern is a raw string so that `\d` and `\.` reach the regex engine unchanged
# instead of being reported by Python as invalid string escape sequences.
cocktails['measure_number'] = cocktails['measure'].str.extract(r'(\d*\.\d+|\d+)', expand=False).astype(float)

In [None]:
# Preview the first rows of the cleaned table, including the new `measure_number` column.
cocktails.head()

In [None]:
# Dimensions of the table after cleaning, as (rows, columns).
cocktails.shape

In [None]:
# Number of distinct ingredients after cleaning.
# `nunique` is the pandas built-in for this count; unlike `np.unique` it does not
# have to sort the values, so a missing ingredient (NaN mixed with strings) cannot
# make the cell fail - missing values are simply not counted.
cocktails['ingredient'].nunique()

### Exploratory analysis

In [None]:
# Number of distinct ingredients available for the exploratory analysis.
# `nunique` is the pandas built-in for this count; unlike `np.unique` it does not
# have to sort the values, so a missing ingredient (NaN mixed with strings) cannot
# make the cell fail - missing values are simply not counted.
cocktails['ingredient'].nunique()

We plot histograms of the cocktail types (alcoholic, non-alcoholic or optionally alcoholic), of the glasses used for the cocktails, and of the cocktail categories.

In [15]:
# One row per (drink, label) pair, ordered by the label, for each of the three
# label columns. The remaining columns hold per-group non-null counts.
cocktails_sorted_type = (
    cocktails
    .groupby(['id_drink', 'alcoholic'], as_index=False)
    .count()
    .sort_values(by='alcoholic')
)

cocktails_sorted_verre = (
    cocktails
    .groupby(['id_drink', 'glass'], as_index=False)
    .count()
    .sort_values(by='glass')
)

cocktails_sorted_categ = (
    cocktails
    .groupby(['id_drink', 'category'], as_index=False)
    .count()
    .sort_values(by='category')
)

In [None]:
# Preview of the per-drink table grouped by glass ("verre" is French for glass).
cocktails_sorted_verre.head()

In [None]:
# Make sure the output directory exists, so that savefig below does not fail with
# FileNotFoundError when the notebook is run from a fresh checkout.
os.makedirs('imgs', exist_ok=True)


def annotate_bar_widths(ax, dx, dy):
    """Label every horizontal bar of ``ax`` with its width (the bar's count).

    The text is placed at the end of the bar, shifted by ``dx`` along x and by
    ``dy`` from the bar's lower edge along y (both in data coordinates).
    """
    for bar in ax.patches:
        ax.annotate(bar.get_width(), (bar.get_width() + dx, bar.get_y() + dy))


overview_figure = plt.figure(figsize=(10, 7))
overview_figure.subplots_adjust(wspace=0.6, hspace=0.4)

# Top-right panel: cocktail types.
fig1 = sns.histplot(y=cocktails_sorted_type['alcoholic'], ax=plt.subplot(222))
fig1.set(title="Types of cocktails", ylabel="Alcohol level", xlabel="Frequency", xlim=[0,500])
annotate_bar_widths(fig1, 2.8, 0.6)

# Left panel (full height): glasses.
fig2 = sns.histplot(y=cocktails_sorted_verre['glass'], ax=plt.subplot(121))
fig2.set(title="Glasses used for cocktails", ylabel="Glass", xlabel="Frequency", xlim=[0,130])
annotate_bar_widths(fig2, 0.8, 0.8)

# Bottom-right panel: categories.
fig3 = sns.histplot(y=cocktails_sorted_categ['category'], ax=plt.subplot(224))
fig3.set(title="Cocktail categories", ylabel="Category", xlabel="Frequency", xlim=[0,300])
annotate_bar_widths(fig3, 1.6, 0.8)

plt.savefig('imgs/exploratory_analysis_cocktails.pdf', bbox_inches='tight')
plt.show()
plt.close()

We plot the histogram of the number of ingredients per cocktail.

In [None]:
# Make sure the output directory exists, so that savefig below does not fail with
# FileNotFoundError when the notebook is run from a fresh checkout.
os.makedirs('imgs', exist_ok=True)

# NOTE(review): `A` is not read anywhere in this cell; kept in case a later cell uses it.
A = cocktails.groupby(['row_id'], as_index=False)['ingredient_number'].max()

# Rows belonging to alcoholic / non-alcoholic cocktails (reused by later cells).
cocktails_alcohol = cocktails[cocktails['alcoholic'].isin(["alcoholic"])]
cocktails_wt_alcohol = cocktails[cocktails['alcoholic'].isin(["non alcoholic"])]


def plot_ingredient_counts(drinks, ax, title, label_offset):
    """Draw on ``ax`` how many cocktails in ``drinks`` have each number of ingredients.

    The number of ingredients of a cocktail is taken as the largest
    ``ingredient_number`` among its rows (rows are grouped by ``row_id``).
    Every bar is labelled with its height, centred ``label_offset`` above the
    top of the bar. Returns the axes that were drawn on.
    """
    ingredients_per_drink = drinks.groupby(['row_id'], as_index=False)['ingredient_number'].max()
    sns.countplot(data=ingredients_per_drink, x='ingredient_number', ax=ax)
    ax.set(title=title, xlabel="Number of ingredients", ylabel="Number of cocktails")
    ax.xaxis.set_major_locator(plticker.MultipleLocator(base=1))
    ax.yaxis.set_major_locator(plticker.MultipleLocator(base=50))
    for bar in ax.patches:
        label_position = (bar.get_x() + bar.get_width() / 2., bar.get_height() + label_offset)
        ax.annotate(bar.get_height(), label_position, ha='center', va='center')
    return ax


plt.figure(figsize=(10,5))

fig1 = plot_ingredient_counts(cocktails_alcohol, plt.subplot(1, 2, 1), "Alcoholic cocktails", 4)
fig2 = plot_ingredient_counts(cocktails_wt_alcohol, plt.subplot(1, 2, 2), "Non-alcoholic cocktails", 0.35)

plt.savefig('imgs/nb_ingredients_cocktails_alcohol_and_non_alcohol.pdf', bbox_inches='tight')
plt.show()
plt.close()

We look at the 3 most important ingredients shared by cocktails with and without alcohol, taken from the 10 most frequently used ingredients of each group.

In [None]:
# How often each ingredient appears in each group of cocktails
# (value_counts orders the result from most to least frequent).
freq_alcohol = cocktails_alcohol.loc[:, 'ingredient'].value_counts()
freq_wt_alcohol = cocktails_wt_alcohol.loc[:, 'ingredient'].value_counts()

def common_member(a, b):
    """Return the elements that appear in both ``a`` and ``b``.

    Parameters
    ----------
    a, b : iterables of hashable items.

    Returns
    -------
    set
        The intersection of the two inputs (empty when nothing is shared).
        Returning it lets callers store the result; previously the function
        only printed and therefore always returned ``None``.

    The intersection - or "No common elements" when it is empty - is still
    printed, so the cell output is unchanged.
    """
    shared = set(a) & set(b)

    if shared:
        print(shared)
    else:
        print("No common elements")

    return shared

# Take the 10 most used ingredients of each group and work out which ones they share.
top_alcohol = freq_alcohol.index[:10].tolist()
top_wt_alcohol = freq_wt_alcohol.index[:10].tolist()
commun = common_member(top_alcohol, top_wt_alcohol)