# Data cleaning

In [1]:
import re

import pandas as pd

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./dataset.csv')

Inspect the dataset: its shape, the first rows, the column names, and summary statistics.

In [3]:
# Dimensions of the frame as (rows, columns).
df.shape

(2312, 10)

In [4]:
# Preview the first rows to see what each column looks like.
df.head()

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
0,2454.0,5150,U.S.A.,2019.0,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75
1,2458.0,5150,U.S.A.,2019.0,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.5
2,2454.0,5150,U.S.A.,2019.0,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25
3,797.0,A. Morin,France,2012.0,Peru,Peru,63%,"4- B,S,C,L","fruity, melon, roasty",3.75
4,797.0,A. Morin,France,2012.0,Bolivia,Bolivia,70%,"4- B,S,C,L","vegetal, nutty",3.5


In [5]:
# Column labels of the frame.
df.columns

Index(['REF', 'Company (Manufacturer)', 'Company Location', 'Review Date',
       'Country of Bean Origin', 'Specific Bean Origin or Bar Name',
       'Cocoa Percent', 'Ingredients', 'Most Memorable Characteristics',
       'Rating'],
      dtype='object')

In [6]:
# include='all' summarizes the non-numeric columns as well
# (count / unique / top / freq) alongside the numeric statistics.
df.describe(include='all')

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
count,2312.0,2312,2312,2312.0,2312,2312,2312,2224,2312,2312.0
unique,,541,67,,63,1436,46,21,2270,
top,,Soma,U.S.A.,,Venezuela,Madagascar,70%,"3- B,S,C","creamy, nutty, cocoa",
freq,,52,966,,245,55,936,885,4,
mean,1317.821367,,,2013.760813,,,,,,3.184083
std,695.42575,,,3.591105,,,,,,0.451961
min,5.0,,,2006.0,,,,,,1.0
25%,741.0,,,2011.0,,,,,,3.0
50%,1351.0,,,2014.0,,,,,,3.25
75%,1912.0,,,2016.0,,,,,,3.5


## Column transformations

Remove the % symbol from Cocoa Percent and convert the column to float.

In [7]:
# Strip the literal '%' and parse the remainder as a float.
# Casting to str first keeps this cell re-runnable: once the column is already
# float, the `.str` accessor would otherwise raise on a second execution.
# regex=False makes it explicit that '%' is a plain character, not a pattern.
df['Cocoa Percent'] = (
    df['Cocoa Percent']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype('float')
)

Encode ingredients.

From http://flavorsofcacao.com:

> "#" = represents the number of ingredients in the chocolate; B = Beans, S = Sugar, S* = Sweetener other than white cane or beet sugar, C = Cocoa Butter, V = Vanilla, L = Lecithin, Sa = Salt

For each of the above ingredients, let's create a column which indicates whether that ingredient is used or not.

In [8]:
# Indicator column -> regex that detects the ingredient code inside the
# 'Ingredients' string (e.g. "4- B,S,C,L").
# Raw strings are required so that `\*` reaches the regex engine as an escaped
# asterisk instead of being an invalid Python string escape.
ingredient_patterns = {
    'Beans': r'B',
    'Cocoa Butter': r'C',
    'Vanilla': r'V',
    'Lecithin': r'L',
    'Sugar': r'S(?!\*|a)',  # a bare "S": not followed by "*" (S*) or "a" (Sa)
    'Sweetener': r'S\*',
    'Salt': r'Sa',
}

for column, pattern in ingredient_patterns.items():
    # `+ 0` converts the boolean match result to 1/0.
    # NOTE(review): rows with a missing 'Ingredients' value are handled by
    # pandas' default NA behaviour of `str.contains`, exactly as before —
    # decide downstream whether those should be treated as 0 or dropped.
    df[column] = df['Ingredients'].str.contains(pattern) + 0

In [28]:
def split_words(s):
    """Split a characteristics string into a list of lower-cased terms.

    Terms are separated by commas or by the standalone word "and".
    The word boundary matters: a plain substring replacement of "and" would
    also cut words such as "sandy" or "candy" into fragments.

    Parameters
    ----------
    s : str
        Free-text description, e.g. "rich cocoa, fatty and bready".

    Returns
    -------
    list of str
        Lower-cased terms with surrounding whitespace removed. Empty terms
        (e.g. produced by ", and") are dropped.
    """
    parts = re.split(r',|\band\b', s.lower())
    stripped = (part.strip() for part in parts)
    return [term for term in stripped if term]

# Tokenize every review's description, flatten the per-row lists into one
# long Series, and keep each distinct term once.
characteristics = (
    df['Most Memorable Characteristics']
    .apply(split_words)
    .explode()
    .unique()
)

In [20]:
# Thirty most frequent manufacturers. The column is addressed by label rather
# than by position so the cell keeps working if the column order changes.
df['Company (Manufacturer)'].value_counts().head(30)

Soma                          52
Arete                         32
Fresco                        31
Bonnat                        29
Pralus                        26
A. Morin                      25
Domori                        23
Valrhona                      22
Guittard                      22
Zotter                        21
Coppeneur                     19
Hotel Chocolat (Coppeneur)    19
Dandelion                     18
Mast Brothers                 18
Dick Taylor                   17
Castronovo                    17
Scharffen Berger              17
Smooth Chocolator, The        17
Rogue                         16
Pierre Marcolini              16
Duffy's                       16
Artisan du Chocolat           16
Palette de Bine               15
Szanto Tibor                  15
Map Chocolate                 14
Bittersweet Origins           14
Tejas                         14
Pacari                        13
Amedei                        13
Michel Cluizel                13
Name: Comp

In [18]:
# Distinct manufacturer names. The column is addressed by label rather than
# by position so the cell keeps working if the column order changes.
df['Company (Manufacturer)'].unique()

array(['5150', 'A. Morin', 'Acalli', 'Adi aka Fijiana (Easy In Ltd)',
       'Aelan', 'Aequare (Gianduja)', 'Ah Cacao', "Akesson's (Pralus)",
       'Alain Ducasse', 'Alexandre', 'Altus aka Cao Artisan', 'Amano',
       'Amatller (Simon Coll)', 'Amazing Cacao', 'Amazona', 'Ambrosia',
       'Amedei', 'AMMA', 'Anahata', 'Animas', 'Ara', 'Arete', 'Argencove',
       'Artisan du Chocolat', 'Artisan du Chocolat (Casa Luker)',
       'Askinosie', 'Atypic', 'Auro', 'Bahen & Co.', 'Baiani', 'Bakau',
       'Bankston', 'Bar Au Chocolat', "Baravelli's", 'Batch', 'Bean',
       'Beau Cacao', 'Beehive', 'Belcolade', 'Bellflower', 'Belvie',
       'Belyzium', 'Benns', 'Benoit Nihant', 'Bernachon',
       'Beschle (Felchlin)', 'Bisou', 'Bitacora', 'Bittersweet Origins',
       'Bixby', 'Black Mountain', 'Black River (A. Morin)', 'Blanxart',
       'Blue Bandana', 'Boho', 'Bonaterra', 'Bonnat',
       'Bouga Cacao (Tulicorp)', 'Bowler Man', 'Box Chocolate',
       'Brasstown', "Brasstown aka It's Ch