In [1]:
# Title: "Create and save exploratory data analysis figures of coffee quality dataset"
# Authors: "Arlin Cherian, Kristin Bunyan, Michelle Wang, Berkay Bulut"
# date: 2021-11-19

In [2]:
# loading packages and set up

from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [3]:
# Load the coffee ratings data set
coffee_raw_df = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-07/coffee_ratings.csv')
# Check the number of rows and columns of the raw data. A bare `.shape` expression
# in the middle of a cell is evaluated and discarded, so print it to actually see it.
print(coffee_raw_df.shape)
# Keep only the rows whose species is Arabica; the later cells work on this subset.
# The raw frame stays available under its own name instead of being overwritten.
coffee_df = coffee_raw_df.query("species == 'Arabica'")
coffee_df

Unnamed: 0,total_cup_points,species,owner,country_of_origin,farm_name,lot_number,mill,ico_number,company,altitude,...,color,category_two_defects,expiration,certification_body,certification_address,certification_contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,90.58,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,0,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.00,2200.00,2075.00
1,89.92,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,1,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.00,2200.00,2075.00
2,89.75,Arabica,grounds for health admin,Guatemala,"san marcos barrancas ""san cristobal cuch",,,,,1600 - 1800 m,...,,0,"May 31st, 2011",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,m,1600.00,1800.00,1700.00
3,89.00,Arabica,yidnekachew dabessa,Ethiopia,yidnekachew dabessa coffee plantation,,wolensu,,yidnekachew debessa coffee plantation,1800-2200,...,Green,2,"March 25th, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1800.00,2200.00,2000.00
4,88.83,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,2,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.00,2200.00,2075.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1306,68.33,Arabica,juan carlos garcia lopez,Mexico,el centenario,,"la esperanza, municipio juchique de ferrer, ve...",1104328663,terra mia,900,...,,20,"September 17th, 2013",AMECAFE,59e396ad6e22a1c22b248f958e1da2bd8af85272,0eb4ee5b3f47b20b049548a2fd1e7d4a2b70d0a7,m,900.00,900.00,900.00
1307,67.92,Arabica,myriam kaplan-pasternak,Haiti,200 farms,,coeb koperativ ekselsyo basen (350 members),,haiti coffee,~350m,...,Blue-Green,16,"May 24th, 2013",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,m,350.00,350.00,350.00
1308,63.08,Arabica,"exportadora atlantic, s.a.",Nicaragua,finca las marías,017-053-0211/ 017-053-0212,beneficio atlantic condega,017-053-0211/ 017-053-0212,exportadora atlantic s.a,1100,...,Green,5,"June 6th, 2018",Instituto Hondureño del Café,b4660a57e9f8cc613ae5b8f02bfce8634c763ab4,7f521ca403540f81ec99daec7da19c2788393880,m,1100.00,1100.00,1100.00
1309,59.83,Arabica,juan luis alvarado romero,Guatemala,finca el limon,,beneficio serben,11/853/165,unicafe,4650,...,Green,4,"May 24th, 2013",Asociacion Nacional Del Café,b1f20fe3a819fd6b2ee0eb8fdc3da256604f1e53,724f04ad10ed31dbb9d260f0dfd221ba48be8a95,ft,1417.32,1417.32,1417.32


In [4]:
# Column names, non-null counts and dtypes (printed by `info` itself)
coffee_df.info()

# Summary statistics of the numeric columns, shown as the cell's output
numeric_summary = coffee_df.describe()
numeric_summary

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1311 entries, 0 to 1310
Data columns (total 43 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_cup_points       1311 non-null   float64
 1   species                1311 non-null   object 
 2   owner                  1304 non-null   object 
 3   country_of_origin      1310 non-null   object 
 4   farm_name              955 non-null    object 
 5   lot_number             270 non-null    object 
 6   mill                   1001 non-null   object 
 7   ico_number             1165 non-null   object 
 8   company                1102 non-null   object 
 9   altitude               1088 non-null   object 
 10  region                 1254 non-null   object 
 11  producer               1081 non-null   object 
 12  number_of_bags         1311 non-null   int64  
 13  bag_weight             1311 non-null   object 
 14  in_country_partner     1311 non-null   object 
 15  harv

Unnamed: 0,total_cup_points,number_of_bags,aroma,flavor,aftertaste,acidity,body,balance,uniformity,clean_cup,sweetness,cupper_points,moisture,category_one_defects,quakers,category_two_defects,altitude_low_meters,altitude_high_meters,altitude_mean_meters
count,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1310.0,1311.0,1084.0,1084.0,1084.0
mean,82.115927,153.887872,7.563806,7.51807,7.397696,7.533112,7.517727,7.517506,9.833394,9.83312,9.903272,7.497864,0.088863,0.426392,0.177099,3.591915,1759.548954,1808.843803,1784.196379
std,3.515761,129.733734,0.378666,0.399979,0.405119,0.381599,0.359213,0.406316,0.559343,0.77135,0.530832,0.47461,0.047957,1.832415,0.840583,5.350371,8767.847252,8767.187498,8767.016913
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,81.17,14.5,7.42,7.33,7.25,7.33,7.33,7.33,10.0,10.0,10.0,7.25,0.09,0.0,0.0,0.0,1100.0,1100.0,1100.0
50%,82.5,175.0,7.58,7.58,7.42,7.5,7.5,7.5,10.0,10.0,10.0,7.5,0.11,0.0,0.0,2.0,1310.64,1350.0,1310.64
75%,83.67,275.0,7.75,7.75,7.58,7.75,7.67,7.75,10.0,10.0,10.0,7.75,0.12,0.0,0.0,4.0,1600.0,1650.0,1600.0
max,90.58,1062.0,8.75,8.83,8.67,8.75,8.58,8.75,10.0,10.0,10.0,10.0,0.28,31.0,11.0,55.0,190164.0,190164.0,190164.0


According to the dataset description, our target, `total_cup_points`, is scored on a 0-100 point scale. In this dataset it has a mean rating of 82.1 points, a minimum of 0.0 points and a maximum of 90.6 points.

In [5]:
# How many distinct countries of origin are in the dataset, and which ones?
# `nunique` ignores missing values, whereas `unique` lists `nan` as an entry.
origin_countries = coffee_df['country_of_origin']
print(origin_countries.nunique())
print(origin_countries.unique())

36
['Ethiopia' 'Guatemala' 'Brazil' 'Peru' 'United States'
 'United States (Hawaii)' 'Indonesia' 'China' 'Costa Rica' 'Mexico'
 'Uganda' 'Honduras' 'Taiwan' 'Nicaragua' 'Tanzania, United Republic Of'
 'Kenya' 'Thailand' 'Colombia' 'Panama' 'Papua New Guinea' 'El Salvador'
 'Japan' 'Ecuador' 'United States (Puerto Rico)' 'Haiti' 'Burundi'
 'Vietnam' 'Philippines' 'Rwanda' 'Malawi' 'Laos' 'Zambia' 'Myanmar'
 'Mauritius' 'Cote d?Ivoire' nan 'India']


In [6]:
# Which coffee-producing regions are included in the dataset, and how many are there?
# `nunique` ignores missing values, whereas `unique` lists `nan` as an entry.
production_regions = coffee_df['region']
print(production_regions.nunique())
print(production_regions.unique())

343
['guji-hambela' nan 'oromia' 'oromiya' 'snnp/kaffa zone,gimbowereda'
 'antioquia' 'kona' 'sulawesi' 'yirgacheffe' 'yunnan' 'gedio' 'san ramon'
 'xalapa' 'sidamo' 'south of minas' 'kapchorwa eastern' 'comayagua'
 'leye, alishan township, chiayi county' 'vale da grama'
 'west and central valley' 'muranga' 'chiang rai'
 'sul de minas - carmo de minas' 'natou county' 'nyeri' 'eastern uganda'
 'tolima' 'kiambu' 'sipi, mt elgon' 'nuevo oriente' 'eastern' 'huila'
 'boquete' 'acatenango' 'addis ababa' 'cajamarca'
 'eastern highlands province' 'apaneca'
 'ataco, apaneca - ilamatepec mountain range' 'kirinyaga'
 'bulambuli eastern region' 'huehuetenango' 'kapchorwa' 'west valley'
 'central kenya' 'oriente' 'santander' 'lintong'
 'kefa zone, gimbo distict, at a place called woka araba, south west ethiopia.'
 'pasto' 'aricha' 'cundinamarca' 'tarrazu' 'veracruz' 'grama valley'
 'mexico' 'chuva, san marcos' 'mbeya' 'coatepec'
 'dongshan dist., tainan city 臺南市東山區' 'thailand' 'marcala' 'santa ana'

In [7]:
# Distinct values recorded for the colour of the coffee beans (missing values included)
bean_colors = coffee_df['color']
bean_colors.unique()

array(['Green', nan, 'Bluish-Green', 'None', 'Blue-Green'], dtype=object)

In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
coffee_train_df, coffee_test_df = train_test_split(
    coffee_df,
    test_size=0.2,
    random_state=123,
)

In [9]:
# Shapes of the training and test splits, in that order
for split_df in (coffee_train_df, coffee_test_df):
    print(split_df.shape)

(1048, 43)
(263, 43)


In [10]:
# Summary statistics of the numeric columns, this time for the training split only
train_numeric_summary = coffee_train_df.describe()
train_numeric_summary

Unnamed: 0,total_cup_points,number_of_bags,aroma,flavor,aftertaste,acidity,body,balance,uniformity,clean_cup,sweetness,cupper_points,moisture,category_one_defects,quakers,category_two_defects,altitude_low_meters,altitude_high_meters,altitude_mean_meters
count,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1047.0,1048.0,858.0,858.0,858.0
mean,82.178588,156.144084,7.559666,7.519179,7.399714,7.531097,7.522338,7.517948,9.84124,9.869513,9.92229,7.495363,0.088702,0.458969,0.17574,3.582061,1532.257826,1580.123429,1556.190628
std,2.542936,130.775664,0.317137,0.340838,0.352207,0.31711,0.289447,0.350441,0.48868,0.547691,0.380899,0.43517,0.04821,1.990962,0.862244,5.4439,6473.547509,6472.8937,6472.806952
min,63.08,1.0,5.08,6.08,6.17,6.08,6.33,6.08,6.0,5.33,6.0,5.17,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,81.08,15.0,7.42,7.33,7.17,7.33,7.33,7.33,10.0,10.0,10.0,7.25,0.09,0.0,0.0,0.0,1100.0,1100.0,1100.0
50%,82.5,200.0,7.58,7.5,7.42,7.5,7.5,7.5,10.0,10.0,10.0,7.5,0.11,0.0,0.0,2.0,1300.0,1320.0,1310.64
75%,83.58,275.0,7.75,7.75,7.58,7.75,7.67,7.75,10.0,10.0,10.0,7.75,0.12,0.0,0.0,4.0,1600.0,1650.0,1600.0
max,90.58,1062.0,8.75,8.83,8.67,8.75,8.58,8.75,10.0,10.0,10.0,10.0,0.28,31.0,11.0,55.0,190164.0,190164.0,190164.0


In [11]:
# Clean the colour column of the training split:
# first drop the rows where `color` is missing ...
coffee_train_df_clean = coffee_train_df.dropna(subset=['color'])
# ... then drop the rows where it holds the literal string 'None'
has_real_color = coffee_train_df_clean['color'] != 'None'
coffee_train_df_cleaned = coffee_train_df_clean[has_real_color]

In [12]:
# One exploratory question we wanted to answer was whether the colour
# of the coffee bean affects the total quality rating.

In [13]:
# Box plot of the total rating for each bean colour
coffee_color_boxplot = (
    alt.Chart(coffee_train_df_cleaned)
    .mark_boxplot()
    .encode(
        x=alt.X('total_cup_points', title="Total quality rating (0-100 scale)"),
        y=alt.Y('color', title="Colour of coffee beans"),
    )
)

# Same chart re-marked as small black circles placed at the mean rating of each colour
color_mean_points = coffee_color_boxplot.mark_circle(fill='black', size=11).encode(
    x='mean(total_cup_points)'
)

# Overlay the mean markers on the box plot and display the layered chart
coffee_color_boxplot_mean = alt.layer(coffee_color_boxplot, color_mean_points)
coffee_color_boxplot_mean

In [14]:
# Create the output folder if it is not there yet, so the save below
# also works on a fresh checkout where the folder does not exist.
Path("coffee_quality_eda_images").mkdir(parents=True, exist_ok=True)
# Write the colour box plot (with mean markers) to an SVG file
coffee_color_boxplot_mean.save("coffee_quality_eda_images/coffeebean_color_rating.svg")

In [15]:
# A second exploratory question: does the country of origin of the coffee bean
# affect the total quality rating?
# Rows with a missing country of origin are dropped from the training split first.
coffee_train_df_clean = coffee_train_df.dropna(subset=['country_of_origin'])

In [16]:
# Box plot of the total rating for each country of origin; the x-axis is limited to 55-100
coffee_coo_boxplot = (
    alt.Chart(coffee_train_df_clean)
    .mark_boxplot()
    .encode(
        x=alt.X(
            'total_cup_points',
            scale=alt.Scale(domain=(55, 100)),
            title="Total quality rating (0-100 scale)",
        ),
        y=alt.Y('country_of_origin', title="Country of Origin"),
    )
)

# Same chart re-marked as small black circles placed at the mean rating of each country
coo_mean_points = coffee_coo_boxplot.mark_circle(fill='black', size=11).encode(
    x='mean(total_cup_points)'
)

# Overlay the mean markers on the box plot and display the layered chart
coffee_coo_boxplot_mean = alt.layer(coffee_coo_boxplot, coo_mean_points)
coffee_coo_boxplot_mean

In [17]:
# Create the output folder if it is not there yet, so the save below
# also works on a fresh checkout where the folder does not exist.
Path("coffee_quality_eda_images").mkdir(parents=True, exist_ok=True)
# Write the country-of-origin box plot (with mean markers) to an SVG file
coffee_coo_boxplot_mean.save("coffee_quality_eda_images/coffee_coo_boxplot_mean.svg")

In [18]:
# We may want to investigate the relationships between numerical features to better
# understand which features might be correlated to the target.

# Non-object columns of the training split, minus the columns listed in `excluded_cols`
excluded_cols = [
    'altitude_low_meters', 'altitude_high_meters',
    'aroma', 'flavor', 'aftertaste', 'acidity', 'body', 'balance',
    'uniformity', 'sweetness', 'clean_cup', 'cupper_points',
]
numeric_cols = list(
    coffee_train_df.select_dtypes(exclude='object').drop(columns=excluded_cols)
)

# The target is itself numeric, so it is still part of `numeric_cols`. Leave it out of
# the repeated x-axis so the grid has no uninformative target-vs-target panel.
feature_cols = [col for col in numeric_cols if col != 'total_cup_points']

# One small scatter plot per feature: feature on the x-axis, target on the y-axis,
# laid out three panels per row. Neither axis is forced to start at zero.
splom = (
    alt.Chart(coffee_train_df)
    .mark_point(opacity=0.5, size=0.5)
    .encode(
        x=alt.X(alt.repeat("repeat"), type="quantitative", scale=alt.Scale(zero=False)),
        y=alt.Y("total_cup_points", scale=alt.Scale(zero=False)),
    )
    .properties(height=80, width=80)
    .repeat(feature_cols, columns=3)
    .configure_axis(labelFontSize=11)
)

splom

In [19]:
# Create the output folder if it is not there yet, so the save below
# also works on a fresh checkout where the folder does not exist.
Path("coffee_quality_eda_images").mkdir(parents=True, exist_ok=True)
# Write the grid of feature-vs-target scatter plots to an SVG file
splom.save("coffee_quality_eda_images/numerical_features_corr.svg")

At this point of the exploratory data analysis project, we have explored how the target, `total_cup_points`, relates to the numerical features and to two categorical features (bean colour and country of origin). We plan to do further cleaning and preparation of the dataset to explore its relationships with the other categorical features as well.