Correspondence Analysis of alcoholic beverages and where to drink them

In [1]:
# Importing libs
import numpy as np
import pandas as pd
from prince import CA
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'prince'

Importing and processing data

In [None]:

# Data from a survey that asked which place (from a list of 11 places) was
# the best one to drink each of 12 types of alcoholic beverages.
#
# weight: variable used to match the sociodemographic characteristics of the
#         sample to the population.
# Variables with "place" and "drink" in their name: each column is a
#         beverage type and its value is the name of the place if it was
#         selected, or 0 if it wasn't.
# Some of the beverages in the survey weren't asked in this part, so some of
# the variables have to be dropped later on.
#
# The path uses a forward slash so it resolves on Windows, Linux and macOS
# alike (a backslash is only a path separator on Windows).
df = pd.read_excel("data/data.xlsx", sheet_name="data")
df.head(5)

In [None]:
# Exclusive upper bounds for the id ranges used in the survey column names.
places = 12  # place ids run over range(1, places)  -> 1..11
drinks = 24  # drink ids run over range(11, drinks) -> 11..23

# Survey column names, one per (place, drink) pair.
var_names = [
    f"place_{iplaces}_drink_{idrinks}"
    for iplaces in range(1, places)
    for idrinks in range(11, drinks)
]

# Long format: one row per respondent and (place, drink) variable.
# dropna() removes rows with any missing value.
# NOTE(review): presumably this is what removes the beverages that were not
# asked in this part of the survey - confirm against the data.
df2 = pd.melt(
    df,
    id_vars=['ID', 'weight'],
    value_vars=var_names,
    value_name='place_and_drink',
).dropna()

# Binary flag: 1 if the respondent picked this place for this drink, 0
# otherwise. Building it as an integer column (instead of writing 1 into the
# original mixed-type column) keeps the weighted values numeric, so the sum
# in the contingency table does not have to work on an object column.
df2['place_and_drink'] = (df2['place_and_drink'] != 0).astype(int)

# Applying weights
df2['place_and_drink_weighted'] = df2['place_and_drink'] * df2['weight']

# Place and drink ids, parsed once from the variable name instead of
# scanning the whole frame for every (place, drink) combination.
pair_ids = df2['variable'].str.extract(r'^place_(\d+)_drink_(\d+)$').astype(int)
df2['drink'] = pair_ids[1]
df2['place'] = pair_ids[0]

In [None]:
# Creating the contingency table: weighted counts, places in rows and
# drinks in columns.
# aggfunc is given as the string 'sum': passing the np.sum callable is
# deprecated in recent pandas versions and emits a FutureWarning.
contingency_table = pd.pivot_table(
    df2,
    values='place_and_drink_weighted',
    index=['place'],
    columns=['drink'],
    aggfunc='sum',
)

# Labeling. The labels are applied by position, so they must be listed in
# ascending id order of the drinks / places present in the table.
column_names = [
    'Beer 1',
    'Cheap Spirit',
    'Beer 2',
    'Flavoured Spirit 4',
    'Flavoured Spirit 1',
    'Beer 3',
    'Flavoured Beer 1',
    'Flavoured Beer 2',
    'Flavoured Spirit 2',
    'Flavoured Spirit 3',
]
row_names = [
    'Night club',
    'Restaurant',
    'Grocery Shop',
    'Friend\'s house',
    'Home',
    'Show',
    'House party',
    'Pub',
    'Bar',
    'Beach',
    'Pool',
]
contingency_table.columns = column_names
contingency_table.index = row_names

# Last expression of the cell, so the notebook renders it as a table.
contingency_table

Preparing correspondence analysis

In [None]:
# Fit a two-component correspondence analysis on the contingency table.
# random_state is fixed so the fit is reproducible.
ca = CA(n_components=2, n_iter=10, random_state=18012021)
ca.fit(contingency_table)

# Only the last expression of a cell is rendered automatically, so the
# coordinates are shown explicitly; display() is provided by the IPython
# kernel that runs the notebook.
display(ca.row_coordinates(contingency_table))
display(ca.column_coordinates(contingency_table))

# Variation explained by the CA.
# NOTE(review): the attribute name may differ between prince releases -
# confirm against the installed version.
ca.explained_inertia_


Creating Map

In [None]:
# Creating datasets with the coordinates
coord_x = ca.row_coordinates(contingency_table)
coord_y = ca.column_coordinates(contingency_table)
coord_x.columns = ['x', 'y']
coord_y.columns = ['x', 'y']

# Tagging each coordinate as place/drink and stacking both sets; places
# come first, drinks after them.
coord = pd.concat([coord_x.assign(series='place'), coord_y.assign(series='drink')])

plt.figure(figsize=(10, 10), dpi=1000, frameon=False)
sns.set_theme(style="whitegrid")
plot = sns.scatterplot(
    x='x', y='y', data=coord, hue='series', s=150, style='series',
    legend=False, ax=None, palette=sns.color_palette("flare", 2),
    markers=['o', 'D'],
)

# Adding labels to points.
# The two right-most points get their label shifted to the left so that all
# labels fit in the plot. The two largest x values are looked up once, by
# position (.iloc): the Series is indexed by place/drink name, so integer
# keys would rely on a deprecated positional fallback.
n_places = len(coord_x)  # rows before this position are places, the rest drinks
drink_color = sns.color_palette("flare", 2)[1]
top_two_x = coord['x'].nlargest(2)
largest_x = top_two_x.iloc[0]
second_largest_x = top_two_x.iloc[1]

for i, (label, x, y) in enumerate(zip(coord.index, coord['x'], coord['y'])):
    color = drink_color if i >= n_places else 'black'  # drinks get their own color
    if x == largest_x:
        x_offset = -0.055
    elif x == second_largest_x:
        x_offset = -0.145
    else:
        x_offset = 0.015
    plt.text(x=x + x_offset, y=y - 0.0025, s=label, weight='bold', color=color)

# Cleaning axes
plot.set(xticklabels=[], xlabel=None, yticklabels=[], ylabel=None)

# Setting title
plt.title('Association map', weight='bold', size=20)
plt.show()


