# DATA EXPLORATION AND ENGINEERING

## 1) Loading the wine data

In [5]:
import pandas as pd

# import numpy for numerical analysis
import numpy as np

# import libs for diagrams inline with the text
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# other utilities
from sklearn import datasets, preprocessing, metrics


### 1.1) Winequality red

In [None]:
# Location of the red-wine quality CSV in the course GitHub repository
url = "https://raw.githubusercontent.com/foxdocs/cph-bi-2024/main/Data/UCL-Wine/winequality-red.csv"

# Fields in this file are separated by ";" and the first line holds the column names
red_wine_df = pd.read_csv(
    url,
    sep=";",
    header=0,
)

In [17]:
# Preview the first rows to confirm the ";"-separated file was split into separate columns
red_wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


### 1.2) Winequality white

In [13]:
# Location of the white-wine quality CSV in the course GitHub repository
url_white = "https://raw.githubusercontent.com/foxdocs/cph-bi-2024/main/Data/UCL-Wine/winequality-white.csv"

# Fields in this file are separated by ";" and the first line holds the column names
white_wine_df = pd.read_csv(
    url_white,
    sep=";",
    header=0,
)

In [15]:
# Preview the first rows to confirm the ";"-separated file was split into separate columns
white_wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [24]:
# Inspect the dtype pandas inferred for each column (checks that every column was parsed as numeric)
white_wine_df.dtypes


fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

## 2) Cleaning the data

### 2.0) General exploration

#### 2.0.1) Red Wine

In [66]:
# Structural summary: row count, per-column non-null counts, dtypes and memory usage
red_wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [68]:
# Descriptive statistics per column: count, mean, std, min, quartiles and max
red_wine_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


### 2.1) Exploring the missing values

#### 2.1.1) Red Wine

In [33]:
# Count the missing values in each column
red_wine_df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

#### 2.1.2) White Wine

In [35]:
# Count the missing values in each column
white_wine_df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

### 2.2) Removing duplicates

#### 2.2.1) Red Wine

In [114]:
# Flag every row that exactly repeats an earlier row; with keep="first" the
# first occurrence of each repeated row is not flagged.
duplicate_mask_rw = red_wine_df.duplicated(keep="first")

# Summing the boolean mask gives the number of flagged rows.
# Note: this only counts duplicates; no rows are dropped from red_wine_df here.
duplicates_rw = duplicate_mask_rw.sum()
duplicates_rw

240

#### 2.2.2) White Wine

### 2.3) Checking for outliers

#### 2.3.1) Red Wine

##### 2.3.1.1) Fixed acidity

In [96]:
# Flag outliers in the red-wine "fixed acidity" column with the 1.5 * IQR rule:
# a value is an outlier when it lies more than 1.5 * IQR below Q1 or above Q3.
fixed_acidity = red_wine_df["fixed acidity"]

# 25th percentile: the value that 25 % of the data is below
Q1 = fixed_acidity.quantile(0.25)

# 75th percentile: the value that 75 % of the data is below (only 25 % is above)
Q3 = fixed_acidity.quantile(0.75)

# Interquartile range: the spread of the middle 50 % of the data
IQR = Q3 - Q1

Lower_Fence = Q1 - (1.5 * IQR)
Upper_Fence = Q3 + (1.5 * IQR)

# Boolean mask over the rows of red_wine_df: True where "fixed acidity" lies outside the fences.
# It is kept as a named variable because the outlier-removal cell further down reuses it.
outliers_condition = (fixed_acidity < Lower_Fence) | (fixed_acidity > Upper_Fence)

outliers = red_wine_df[outliers_condition]

number_of_outliers = len(outliers)
print(f"Number of outliers: {number_of_outliers}")
outliers

Number of outliers: 49


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
205,12.8,0.3,0.74,2.6,0.095,9.0,28.0,0.9994,3.2,0.77,10.8,7
206,12.8,0.3,0.74,2.6,0.095,9.0,28.0,0.9994,3.2,0.77,10.8,7
243,15.0,0.21,0.44,2.2,0.075,10.0,24.0,1.00005,3.07,0.84,9.2,7
244,15.0,0.21,0.44,2.2,0.075,10.0,24.0,1.00005,3.07,0.84,9.2,7
264,12.5,0.56,0.49,2.4,0.064,5.0,27.0,0.9999,3.08,0.87,10.9,5
294,13.3,0.34,0.52,3.2,0.094,17.0,53.0,1.0014,3.05,0.81,9.5,6
328,13.4,0.27,0.62,2.6,0.082,6.0,21.0,1.0002,3.16,0.67,9.7,6
338,12.4,0.49,0.58,3.0,0.103,28.0,99.0,1.0008,3.16,1.0,11.5,6
339,12.5,0.28,0.54,2.3,0.082,12.0,29.0,0.9997,3.11,1.36,9.8,7
347,13.8,0.49,0.67,3.0,0.093,6.0,15.0,0.9986,3.02,0.93,12.0,6


###### (Standard deviation - just my own exploration)

In [99]:
# Standard deviation of "fixed acidity"; a later cell uses it to build a mean - 3*std lower bound
std_fix_aci = red_wine_df["fixed acidity"].std()
print(std_fix_aci)

1.7410963181277006


In [101]:
# Mean of "fixed acidity"; a later cell uses it to build a mean - 3*std lower bound
mean_fix_aci = red_wine_df["fixed acidity"].mean()
print(mean_fix_aci)

8.31963727329581


In [103]:
# Alternative lower bound: three standard deviations below the mean,
# printed next to the IQR-based lower fence so the two can be compared.
Lower_Fence_from_std = mean_fix_aci - 3 * std_fix_aci
print(f"Lower Fence from std: {Lower_Fence_from_std}")
print(f"Lower Fence from quantiles: {Lower_Fence}")



Lower Fence from std: 3.0963483189127077
Lower Fence from quantiles: 3.95


###### Removing outliers 

In [76]:
# Keep only the rows NOT flagged by outliers_condition; red_wine_df itself is left unchanged
red_wine_df_f1 = red_wine_df.loc[~outliers_condition]

In [78]:
# Spot-check the filtering with a positional slice: the filtered frame keeps its original index
# labels, so dropped outlier rows show up as gaps (labels 243 and 244 are missing from this window)
red_wine_df_f1.iloc[240:250]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
242,7.7,0.58,0.1,1.8,0.102,28.0,109.0,0.99565,3.08,0.49,9.8,6
245,7.3,0.66,0.0,2.0,0.084,6.0,23.0,0.9983,3.61,0.96,9.9,6
246,7.1,0.68,0.07,1.9,0.075,16.0,51.0,0.99685,3.38,0.52,9.5,5
247,8.2,0.6,0.17,2.3,0.072,11.0,73.0,0.9963,3.2,0.45,9.3,5
248,7.7,0.53,0.06,1.7,0.074,9.0,39.0,0.99615,3.35,0.48,9.8,6
249,7.3,0.66,0.0,2.0,0.084,6.0,23.0,0.9983,3.61,0.96,9.9,6
250,10.8,0.32,0.44,1.6,0.063,16.0,37.0,0.9985,3.22,0.78,10.0,6
251,7.1,0.6,0.0,1.8,0.074,16.0,34.0,0.9972,3.47,0.7,9.9,6
252,11.1,0.35,0.48,3.1,0.09,5.0,21.0,0.9986,3.17,0.53,10.5,5
253,7.7,0.775,0.42,1.9,0.092,8.0,86.0,0.9959,3.23,0.59,9.5,5


In [82]:
# DataFrame.shape is (number of rows, number of columns); unpack both in one step
rows, columns = red_wine_df_f1.shape

print(f"Number of columns: {columns}")
print(f"Number of rows: {rows}")


Number of columns: 12
Number of rows: 1550


In [88]:
# Take the row at position 2 (positional lookup via iloc, not a label lookup) as a Series
row_2 = red_wine_df_f1.iloc[2]

In [90]:
# A row Series has a single dtype, so the integer "quality" value is shown as a float here
print(row_2)

fixed acidity            7.800
volatile acidity         0.760
citric acid              0.040
residual sugar           2.300
chlorides                0.092
free sulfur dioxide     15.000
total sulfur dioxide    54.000
density                  0.997
pH                       3.260
sulphates                0.650
alcohol                  9.800
quality                  5.000
Name: 2, dtype: float64
