In [1]:
# Render matplotlib figures inline (as static images) in the notebook output.
%matplotlib inline
# Import necessary libraries for initial analysis
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as color

from scipy import stats
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [10]:
# Load the validation split and discard the "id" and "diameter" columns in a
# single chained expression (no in-place mutation of the freshly read frame).
cookies_validate = (
    pd.read_csv("../data/cookies_validate.csv")
    .drop(columns=["id", "diameter"])
)

In [11]:
# Preview the first five rows of the loaded frame.
cookies_validate.head(n=5)

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,mixins,crunch factor,aesthetic appeal
0,0.62,19.25,400.0,41.0,172.0,1.0002,7.98,0.67,9.7,0,melted,16.6,"nuts, chocolate",1.85,3
1,0.35,1.0,520.0,35.0,146.0,0.993,8.45,0.44,10.0,0,melted,13.8,chocolate,1.43,3
2,0.39,10.4,440.0,20.0,142.0,0.9974,8.2,0.53,10.0,0,melted,17.0,chocolate,1.57,3
3,0.33,1.1,570.0,21.0,82.0,0.991,8.32,0.46,10.9,0,melted,12.4,"chocolate, oats",1.44,3
4,0.37,13.5,600.0,52.0,192.0,0.9975,8.0,0.44,9.1,0,melted,14.8,chocolate,1.51,3


In [13]:
# One "mixins" value is written as 'nuts,raisins' (no space after the comma);
# rewrite it as 'nuts, raisins' so it uses the same comma-space separator as
# the other multi-item values (e.g. 'nuts, chocolate').
# regex=False makes this a literal substring replacement on every pandas
# version (the default for `regex` changed from True to False in pandas 2.0).
cookies_validate["mixins"] = cookies_validate["mixins"].str.replace(
    "nuts,raisins", "nuts, raisins", regex=False
)

# Pre-processing categorical features

### One Hot Encoding

In [15]:
# One-hot encode "butter type": get_dummies(columns=...) removes the source
# column and appends one "butter type_<value>" indicator column per distinct
# value at the end of the frame.
# NOTE(review): pandas >= 2.0 emits boolean indicator columns by default; pass
# dtype=int here if 0/1 integers are required downstream.
cookies_validate = pd.get_dummies(
    cookies_validate,
    columns=["butter type"],
    prefix="butter type",
)

### Ordinal Encoding

In [17]:
# Replace each "mixins" string with its integer category code.
# The codes are derived from the distinct values present in THIS frame, so
# they are only comparable with another dataset if both contain the same set
# of values; missing values would be encoded as -1.
# NOTE(review): confirm the training data is encoded with the same mapping.
cookies_validate["mixins"] = (
    cookies_validate["mixins"].astype("category").cat.codes
)

In [21]:
# Show the rows whose "bake temp" is strictly greater than 1000.
cookies_validate.loc[lambda frame: frame["bake temp"] > 1000]

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,weight,mixins,crunch factor,aesthetic appeal,butter type_cubed,butter type_melted
190,0.54,1.4,1570.0,34.0,132.0,0.99449,8.11,0.53,9.0,0,14.4,9,1.3,3,0,1
193,0.52,1.3,1480.0,45.0,149.0,0.99468,8.08,0.56,8.7,0,14.2,4,1.42,3,0,1
303,0.37,10.0,1690.0,22.0,210.0,0.99776,8.02,0.64,9.5,0,15.4,4,1.16,3,0,1
330,0.38,13.1,1120.0,14.0,94.0,0.99792,8.02,0.48,9.2,0,13.8,9,1.8,3,0,1
349,1.0,1.1,1540.0,46.0,114.0,0.9931,7.95,0.43,9.2,0,14.4,0,1.62,3,0,1
469,0.2,5.7,1180.0,61.0,172.0,0.9946,8.24,0.43,9.5,0,12.8,9,1.59,3,0,1
523,0.38,2.8,1440.0,12.0,65.0,0.9908,7.95,0.64,11.4,0,12.2,0,1.16,3,0,1
530,0.36,6.7,1850.0,51.5,151.0,0.99528,8.17,0.42,9.3,0,13.0,9,1.01,3,0,1
555,0.27,10.7,1100.0,20.0,103.0,0.99672,8.08,0.41,9.0,0,13.2,4,1.74,3,0,1
595,0.23,8.9,1050.0,22.0,155.0,0.99692,8.01,0.58,9.5,0,14.4,9,1.93,3,0,1


In [19]:
# Display the frame after the categorical encodings (the notebook view
# truncates the middle rows of a long frame).
cookies_validate

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,weight,mixins,crunch factor,aesthetic appeal,butter type_cubed,butter type_melted
0,0.62,19.25,400.0,41.0,172.0,1.00020,7.98,0.67,9.7,0,16.6,4,1.85,3,0,1
1,0.35,1.00,520.0,35.0,146.0,0.99300,8.45,0.44,10.0,0,13.8,0,1.43,3,0,1
2,0.39,10.40,440.0,20.0,142.0,0.99740,8.20,0.53,10.0,0,17.0,0,1.57,3,0,1
3,0.33,1.10,570.0,21.0,82.0,0.99100,8.32,0.46,10.9,0,12.4,1,1.44,3,0,1
4,0.37,13.50,600.0,52.0,192.0,0.99750,8.00,0.44,9.1,0,14.8,0,1.51,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
774,0.02,1.90,780.0,18.0,30.0,0.99712,8.40,0.75,9.8,0,13.8,6,1.01,3,1,0
775,0.19,5.20,940.0,19.0,98.0,0.99713,8.16,0.52,9.6,0,14.8,5,1.68,3,1,0
776,0.00,2.10,600.0,6.0,13.0,0.99664,8.59,0.61,10.0,0,12.4,5,1.38,3,1,0
777,0.08,2.30,670.0,19.0,32.0,0.99648,8.52,0.57,11.0,0,13.4,8,1.63,3,1,0
