In [1]:
# Title: "Create and save exploratory data analysis figures of coffee quality dataset"
# Authors: "Arlin Cherian, Kristin Bunyan, Michelle Wang, Berkay Bulut"
# date: 2021-11-19

In [2]:
# loading packages and set up

import numpy as np
import pandas as pd
import altair as alt
from sklearn import datasets
# from sklearn.pipeline import Pipeline, make_pipeline
# from sklearn.compose import ColumnTransformer, make_column_transformer
# from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Read the coffee ratings CSV straight from the TidyTuesday GitHub repository
coffee_df = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-07/coffee_ratings.csv'
)
# Show (number of rows, number of columns) as the cell output
coffee_df.shape

(1339, 43)

In [4]:
# Structure of the dataset: column names, non-null counts and dtypes
coffee_df.info()

# Summary statistics of the numerical features, shown as the cell output
coffee_numeric_summary = coffee_df.describe()
coffee_numeric_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 43 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_cup_points       1339 non-null   float64
 1   species                1339 non-null   object 
 2   owner                  1332 non-null   object 
 3   country_of_origin      1338 non-null   object 
 4   farm_name              980 non-null    object 
 5   lot_number             276 non-null    object 
 6   mill                   1021 non-null   object 
 7   ico_number             1182 non-null   object 
 8   company                1130 non-null   object 
 9   altitude               1113 non-null   object 
 10  region                 1280 non-null   object 
 11  producer               1107 non-null   object 
 12  number_of_bags         1339 non-null   int64  
 13  bag_weight             1339 non-null   object 
 14  in_country_partner     1339 non-null   object 
 15  harv

Unnamed: 0,total_cup_points,number_of_bags,aroma,flavor,aftertaste,acidity,body,balance,uniformity,clean_cup,sweetness,cupper_points,moisture,category_one_defects,quakers,category_two_defects,altitude_low_meters,altitude_high_meters,altitude_mean_meters
count,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1338.0,1339.0,1109.0,1109.0,1109.0
mean,82.089851,154.182972,7.566706,7.520426,7.401083,7.535706,7.517498,7.518013,9.834877,9.835108,9.856692,7.503376,0.088379,0.479462,0.173393,3.556385,1750.713315,1799.347775,1775.030545
std,3.500575,129.987162,0.37756,0.398442,0.404463,0.379827,0.370064,0.408943,0.554591,0.763946,0.616102,0.473464,0.048287,2.549683,0.832121,5.312541,8669.440545,8668.805771,8668.62608
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,81.08,14.0,7.42,7.33,7.25,7.33,7.33,7.33,10.0,10.0,10.0,7.25,0.09,0.0,0.0,0.0,1100.0,1100.0,1100.0
50%,82.5,175.0,7.58,7.58,7.42,7.58,7.5,7.5,10.0,10.0,10.0,7.5,0.11,0.0,0.0,2.0,1310.64,1350.0,1310.64
75%,83.67,275.0,7.75,7.75,7.58,7.75,7.67,7.75,10.0,10.0,10.0,7.75,0.12,0.0,0.0,4.0,1600.0,1650.0,1600.0
max,90.58,1062.0,8.75,8.83,8.67,8.75,8.58,8.75,10.0,10.0,10.0,10.0,0.28,63.0,11.0,55.0,190164.0,190164.0,190164.0


According to the dataset description, our target, `total_cup_points`, is measured on a 0-100 point scale. In this dataset it has a mean rating of 82.1 points, a minimum of 0.0 points and a maximum of 90.6 points.

In [16]:
# How many distinct countries of origin are in the dataset, and which are they?
origin_countries = coffee_df['country_of_origin']
print(origin_countries.nunique())  # number of distinct countries
print(origin_countries.unique())   # the distinct values themselves

36
['Ethiopia' 'Guatemala' 'Brazil' 'Peru' 'United States'
 'United States (Hawaii)' 'Indonesia' 'China' 'Costa Rica' 'Mexico'
 'Uganda' 'Honduras' 'Taiwan' 'Nicaragua' 'Tanzania, United Republic Of'
 'Kenya' 'Thailand' 'Colombia' 'Panama' 'Papua New Guinea' 'El Salvador'
 'Japan' 'Ecuador' 'United States (Puerto Rico)' 'Haiti' 'Burundi'
 'Vietnam' 'Philippines' 'Rwanda' 'Malawi' 'Laos' 'Zambia' 'Myanmar'
 'Mauritius' 'Cote d?Ivoire' nan 'India']


In [18]:
# Which regions of coffee production are included in the dataset?
coffee_regions = coffee_df['region']
# Print the count explicitly: a notebook cell only auto-displays its final
# expression, so a bare `.nunique()` on a non-final line is computed and then
# silently discarded.
print(coffee_regions.nunique())
# Distinct region values, shown as the cell output
coffee_regions.unique()

array(['guji-hambela', nan, 'oromia', 'oromiya',
       'snnp/kaffa zone,gimbowereda', 'antioquia', 'kona', 'sulawesi',
       'yirgacheffe', 'yunnan', 'gedio', 'san ramon', 'xalapa', 'sidamo',
       'south of minas', 'kapchorwa eastern', 'comayagua',
       'leye, alishan township, chiayi county', 'vale da grama',
       'west and central valley', 'muranga', 'chiang rai',
       'sul de minas - carmo de minas', 'natou county', 'nyeri',
       'eastern uganda', 'tolima', 'kiambu', 'sipi, mt elgon',
       'nuevo oriente', 'eastern', 'huila', 'boquete', 'acatenango',
       'addis ababa', 'cajamarca', 'eastern highlands province',
       'apaneca', 'ataco, apaneca - ilamatepec mountain range',
       'kirinyaga', 'bulambuli eastern region', 'huehuetenango',
       'kapchorwa', 'west valley', 'central kenya', 'oriente',
       'santander', 'lintong',
       'kefa zone, gimbo distict, at a place called woka araba, south west ethiopia.',
       'pasto', 'aricha', 'cundinamarca', 'tarrazu'

In [7]:
# Which bean colours occur in the dataset? (distinct values of the `color` column)
bean_colors = coffee_df['color']
bean_colors.unique()

array(['Green', nan, 'Bluish-Green', 'None', 'Blue-Green'], dtype=object)

In [8]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split identical on every run.
coffee_train_df, coffee_test_df = train_test_split(
    coffee_df,
    test_size=0.2,
    random_state=123,
)

In [9]:
# Report (rows, columns) for the train partition, then the test partition
for split_df in (coffee_train_df, coffee_test_df):
    print(split_df.shape)

(1071, 43)
(268, 43)


In [10]:
# Summary statistics of the numerical features, training split only
train_numeric_summary = coffee_train_df.describe()
train_numeric_summary

Unnamed: 0,total_cup_points,number_of_bags,aroma,flavor,aftertaste,acidity,body,balance,uniformity,clean_cup,sweetness,cupper_points,moisture,category_one_defects,quakers,category_two_defects,altitude_low_meters,altitude_high_meters,altitude_mean_meters
count,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0,1070.0,1071.0,884.0,884.0,884.0
mean,82.045285,156.030812,7.569944,7.519636,7.39788,7.527264,7.516321,7.512446,9.83141,9.829356,9.847077,7.493697,0.087395,0.479925,0.179439,3.547152,1865.307274,1912.705093,1889.006184
std,3.704791,128.330262,0.380755,0.410109,0.416852,0.397568,0.386557,0.425025,0.573938,0.809033,0.657785,0.484516,0.048594,2.693592,0.883032,5.310136,9705.302479,9704.503133,9704.452384
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,81.08,15.0,7.42,7.33,7.25,7.33,7.33,7.33,10.0,10.0,10.0,7.25,0.08,0.0,0.0,0.0,1100.0,1100.0,1100.0
50%,82.5,200.0,7.58,7.58,7.42,7.5,7.5,7.5,10.0,10.0,10.0,7.5,0.11,0.0,0.0,2.0,1310.64,1350.0,1310.64
75%,83.58,275.0,7.75,7.75,7.58,7.75,7.67,7.75,10.0,10.0,10.0,7.75,0.12,0.0,0.0,4.0,1600.0,1642.75,1600.0
max,90.58,600.0,8.75,8.83,8.67,8.75,8.58,8.75,10.0,10.0,10.0,10.0,0.28,63.0,11.0,47.0,190164.0,190164.0,190164.0


In [53]:
# Cleaning: keep only the training rows whose `color` value is not missing
has_color = coffee_train_df['color'].notna()
coffee_train_df_clean = coffee_train_df.loc[has_color]

In [49]:
# Our first exploratory question: does the colour of the coffee bean
# affect the total quality rating?

In [93]:
# Figure 1: box plot of total cup points for each bean colour, with the
# per-colour mean overlaid as a small black dot.
#
# The colour filter is applied here under its own name instead of reading
# `coffee_train_df_clean`: that name is re-bound further down to a
# country-of-origin filter, so once that cell has run this chart would silently
# be built from rows whose colour is missing.
coffee_train_color_df = coffee_train_df[coffee_train_df['color'].notna()]

coffee_color_boxplot = alt.Chart(
    coffee_train_color_df,
    title = "Figure 1. Coffee quality rating by coffee bean colour"
).mark_boxplot().encode(
    x = alt.X('total_cup_points', title = "Total quality rating (0-100 scale)"),
    y = alt.Y('color', title = "Colour of coffee beans"))

# Same data and y encoding as the box plot, but drawn as circles at the mean rating
coffee_color_mean_points = coffee_color_boxplot.mark_circle(fill='black',size=11).encode(x='mean(total_cup_points)')

# Layer the mean markers on top of the box plot and display the result
coffee_color_boxplot_mean = coffee_color_boxplot + coffee_color_mean_points
coffee_color_boxplot_mean

In [73]:
# Write Figure 1 (box plot plus mean markers) to an SVG file in the working directory
color_figure_path = "coffeebean_color_rating.svg"
coffee_color_boxplot_mean.save(color_figure_path)

In [74]:
# Our second exploratory question: does the country of origin of the coffee
# bean affect the total quality rating?
# Keep only the training rows with a known country of origin.
# NOTE(review): this re-binds `coffee_train_df_clean`, replacing the
# colour-filtered frame assigned to the same name earlier in the notebook.
has_country = coffee_train_df['country_of_origin'].notna()
coffee_train_df_clean = coffee_train_df.loc[has_country]

In [94]:
# Figure 2: box plot of total cup points for each country of origin, with the
# per-country mean overlaid as a small black dot.
#
# The country filter is applied here under its own name instead of reading
# `coffee_train_df_clean`: that name is also bound to a colour filter in an
# earlier cell, so which rows this chart sees would otherwise depend on which
# cleaning cell happened to run last.
coffee_train_coo_df = coffee_train_df[coffee_train_df['country_of_origin'].notna()]

coffee_coo_boxplot = alt.Chart(
    coffee_train_coo_df,
    title = "Figure 2. Coffee quality rating by coffee bean country of origin"
).mark_boxplot().encode(
    # The x axis is limited to 55-100 on this layer.
    # NOTE(review): the training data has a minimum total_cup_points of 0,
    # which lies outside this domain - confirm that is intended.
    x = alt.X('total_cup_points', scale = alt.Scale(domain=(55, 100)),
              title = "Total quality rating (0-100 scale)"),
    y = alt.Y('country_of_origin', title = "Country of Origin"))

# Same data and y encoding as the box plot, but drawn as circles at the mean rating
coffee_coo_mean_points = coffee_coo_boxplot.mark_circle(fill='black',size=11).encode(x='mean(total_cup_points)')

# Layer the mean markers on top of the box plot and display the result
coffee_coo_boxplot_mean = coffee_coo_boxplot + coffee_coo_mean_points
coffee_coo_boxplot_mean

In [77]:
# Write Figure 2 (box plot plus mean markers) to an SVG file in the working directory
coo_figure_path = "coffee_coo_boxplot_mean.svg"
coffee_coo_boxplot_mean.save(coo_figure_path)

In [95]:
# We may want to investigate the relationships between numerical features to
# better understand which features might be correlated with the target.

# Columns repeated along the x axis: every numeric column except
#   * the target itself (a target-vs-target panel carries no information), and
#   * the low/high altitude bounds, leaving altitude_mean_meters as the only
#     altitude column.
# `include='number'` selects numeric dtypes directly instead of relying on
# "everything that is not object".
numeric_cols = list(
    coffee_train_df
    .select_dtypes(include='number')
    .drop(columns=['total_cup_points', 'altitude_low_meters', 'altitude_high_meters'])
    .columns
)

# One small scatter panel per numeric feature (x) against the target (y),
# laid out three panels per row.
splom = (
    alt.Chart(coffee_train_df)
    .mark_point(opacity = 0.5, size = 0.5)
    .encode(
        # alt.repeat("repeat") is filled in with each entry of numeric_cols in turn
        x = alt.X(alt.repeat("repeat"), type = "quantitative", scale = alt.Scale(zero = False)),
        y = alt.Y("total_cup_points", scale = alt.Scale(zero = False))
    )
    .properties(height = 80, width = 80)
    .repeat(numeric_cols, columns = 3)
    .configure_axis(labelFontSize = 11)
)

splom

In [None]:
# # We may want to investigate the relationships between numerical features to better understand which features are correlated to each other.
# numeric_cols = list(coffee_train_df.select_dtypes(exclude='object').drop(columns =['altitude_low_meters', 'altitude_high_meters', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body', 'balance', 'uniformity', 'sweetness']))
# splom = (
#     alt.Chart(coffee_train_df)
#     .mark_point(opacity = 0.5, size = 0.5)
#     .encode(
#         x = alt.X(alt.repeat("column"), type = "quantitative", scale = alt.Scale(zero =  False)),
#         y = alt.Y(alt.repeat("row"), type = "quantitative", scale = alt.Scale(zero = False))
#     )
#     .properties(height = 80, width = 80)
#     .repeat(column = numeric_cols, row = numeric_cols)
#     .configure_axis(labelFontSize = 11)

# )

# splom

#'aroma', 'flavor', 'aftertaste', 'acidity', 'body', 'balance', 'uniformity', 'sweetness'