# Chapter 9

## Understanding the problem space

# Setup

In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

### Create conda env

conda create -n ch_10 python=3.9 scikit-learn


### Launch Jupyter Notebook


Launch Jupyter Notebook from the conda command line or from Anaconda Navigator.

### Load in data

In [2]:
import pandas as pd

# Both files sit in the same UCI directory and use ';' as the field separator.
WINE_QUALITY_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality'

df_red = pd.read_csv(f'{WINE_QUALITY_URL}/winequality-red.csv', sep=';')
df_white = pd.read_csv(f'{WINE_QUALITY_URL}/winequality-white.csv', sep=';')

In [3]:
df_red.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
df_white.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [5]:
df_white.shape

(4898, 12)

In [6]:
# Tag every row with its source frame before the two are combined:
# 1 = red wine, 0 = white wine.
df_red["wine_type"] = 1
df_white["wine_type"] = 0

In [7]:
df_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


In [8]:
# Stack the red and white rows into a single frame; both share the same columns.
# NOTE(review): ignore_index is not passed, so each frame keeps its own row
# labels and the labels 0-1598 occur twice in df_raw. Use .iloc when a
# specific row position is needed.
df_raw = pd.concat([df_red,df_white])
df_raw

# Use concat rather than merge: merge joins frames on shared keys, whereas here we only want to append rows.

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [9]:
# Positional lookup: the first two rows (red) and the last two rows (white) of the combined frame.
df_raw.iloc[[0,1,-2,-1]]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0
4897,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6,0


In [10]:
# iloc returns rows in the order requested, so position -2 is listed before position -3.
df_raw.iloc[[0,1,-2,-3]]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0


In [11]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 4897
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  wine_type             6497 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 710.6 KB


In [12]:
# The labels only reach 4897 although there are 6497 rows: concat kept each frame's original row labels.
df_raw.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            4888, 4889, 4890, 4891, 4892, 4893, 4894, 4895, 4896, 4897],
           dtype='int64', length=6497)

In [13]:
df_raw[["fixed acidity","residual sugar","density",  "alcohol", "quality"]].describe()

Unnamed: 0,fixed acidity,residual sugar,density,alcohol,quality
count,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.215307,5.443235,0.994697,10.491801,5.818378
std,1.296434,4.757804,0.002999,1.192712,0.873255
min,3.8,0.6,0.98711,8.0,3.0
25%,6.4,1.8,0.99234,9.5,5.0
50%,7.0,3.0,0.99489,10.3,6.0
75%,7.7,8.1,0.99699,11.3,6.0
max,15.9,65.8,1.03898,14.9,9.0


### Basic shape

In [14]:
df_raw.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.215307,0.339666,0.318633,5.443235,0.056034,30.525319,115.744574,0.994697,3.218501,0.531268,10.491801,5.818378,0.246114
std,1.296434,0.164636,0.145318,4.757804,0.035034,17.7494,56.521855,0.002999,0.160787,0.148806,1.192712,0.873255,0.430779
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0,0.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0,0.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0,0.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0,0.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0,1.0


### Check for missing values

In [15]:
df_raw.isnull().sum()


fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
wine_type               0
dtype: int64

In [16]:
# Distribution of the target: how many wines received each quality score.
df_raw['quality'].value_counts()

6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64

In [None]:
import matplotlib.pyplot as plt

# One 100-bin histogram per column, drawn together on a 10x12 inch figure.
df_raw.hist(figsize=(10, 12), bins=100)
plt.show()

In [None]:
# Summary statistics restricted to the top-rated wines (quality above 7).
df_raw[df_raw['quality']>7].describe()

In [None]:
# Alcohol content of the wines rated 7 or higher, selected with a single .loc lookup.
is_high_quality = df_raw['quality'] >= 7
high_qaul_wine = df_raw.loc[is_high_quality, 'alcohol']
high_qaul_wine

In [None]:
# Alcohol content of the wines rated below 6, selected with a single .loc lookup.
is_low_quality = df_raw['quality'] < 6
low_qaul_wine = df_raw.loc[is_low_quality, 'alcohol']
low_qaul_wine

In [None]:
# Compare the alcohol content of high-quality (>= 7) and low-quality (< 6) wines.
high_qaul_wine = df_raw[df_raw['quality']>=7]['alcohol']
low_qaul_wine = df_raw[df_raw['quality']<6]['alcohol']

# alpha < 1 keeps both distributions visible where they overlap.
# Each legend label names the subset that is actually plotted.
plt.hist(high_qaul_wine, bins = 10,alpha=0.7, label='High Quality')
plt.hist(low_qaul_wine, bins = 10,alpha=0.7, label='Low Quality')
plt.xlabel('alcohol')
plt.ylabel('Number of wines')
plt.title('Alcohol content by wine quality')
plt.legend(loc='upper right')

plt.show()

## Check for redundant features

`DataFrame.corr()` simply returns another DataFrame, so we can take the result and manipulate it as needed.

In [None]:
# Pairwise correlation between all columns, rounded to two decimals for readability.
corr_matrix = df_raw.corr().round(2)

In [None]:
# Only the strength of a relationship matters for spotting redundancy, not its sign.
# DataFrame.abs() is the vectorised built-in for this and avoids the deprecated applymap.
corr_matrix = corr_matrix.abs()
# Mask everything at or below 0.4 (those cells become NaN) and shade the remaining strong correlations.
corr_matrix[corr_matrix > .4].style.background_gradient(cmap='Reds')

In [None]:
# Put the two sulfur dioxide columns side by side to compare their correlations with every other column.
corr_matrix[['total sulfur dioxide','free sulfur dioxide']]

From this we see that "free sulfur dioxide" has the higher correlation, so we should keep that and drop "total sulfur dioxide"

In [None]:
# Build the working frame without the redundant column; df_raw itself is left untouched.
df = df_raw.drop(columns=['total sulfur dioxide'])
df.head(1)

## Focusing on the most important features 

In [None]:
# Rank the columns by their correlation with the target.
# NOTE(review): corr_matrix holds absolute values if the cells above ran in order, so this ranks by strength only.
corr_matrix['quality'].sort_values(ascending=False)

## Preparing the data for training

### Scaling the data

### Split into train and test split

In [None]:
from sklearn.model_selection import train_test_split

# Target is the quality score; the features are every other column.
y = df['quality']
X = df.drop(columns=['quality'])

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)

In [None]:
# https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Learn the scaling parameters from the training split only.
min_max_scaler = MinMaxScaler().fit(X_train)
std_scaler = StandardScaler().fit(X_train)

# Apply each fitted scaler to the training features (returns NumPy arrays).
minmax_scaled_X_train = min_max_scaler.transform(X_train)
std_scaled_X_train = std_scaler.transform(X_train)

## MinMax normalization

In [None]:
# Wrap the scaled array back into a DataFrame so the feature names are kept.
minmax_scaled_X = pd.DataFrame(data=minmax_scaled_X_train, columns=X_train.columns)
minmax_scaled_X.head()

## Standard Normalization

In [None]:
# Wrap the standardized array back into a DataFrame so the feature names are kept.
df_std_scaled_X = pd.DataFrame(data=std_scaled_X_train, columns=X_train.columns)
df_std_scaled_X.head()

## Evaluating potential models

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# 3-fold cross-validation on the min-max scaled training features.
# The folds are stratified on the integer quality score, and the fixed seed keeps them reproducible.
kfold = StratifiedKFold(n_splits=3, random_state=33, shuffle=True)

# Score every model with the same metric, so the three results are comparable
# with each other and match the "MSE" labels used when they are printed.
scoring = 'neg_mean_squared_error'

lr_cve = cross_val_score(LinearRegression(), minmax_scaled_X, y_train, cv=kfold, scoring=scoring)
knn_cve = cross_val_score(KNeighborsRegressor(), minmax_scaled_X, y_train, cv=kfold, scoring=scoring)
svm_cve = cross_val_score(SVR(), minmax_scaled_X, y_train, cv=kfold, scoring=scoring)

In [None]:
# Mean score across the folds for each model. scikit-learn's "neg_*" scorers
# return negated errors, so values closer to 0 are better.
print(f'Linear Regression MSE score: {lr_cve.mean()}')
print(f'KNN MSE score:{knn_cve.mean()}')
print(f'SVM MSE score: {svm_cve.mean()}')

In [None]:
# Same 3-fold cross-validation, this time on the standardized training features.
kfold = StratifiedKFold(n_splits=3, random_state=33, shuffle=True)

# All three models must share one metric, otherwise their scores cannot be compared.
scoring = 'neg_mean_squared_error'

lr_cve = cross_val_score(LinearRegression(), df_std_scaled_X, y_train, cv=kfold, scoring=scoring)
knn_cve = cross_val_score(KNeighborsRegressor(), df_std_scaled_X, y_train, cv=kfold, scoring=scoring)
svm_cve = cross_val_score(SVR(), df_std_scaled_X, y_train, cv=kfold, scoring=scoring)

In [None]:
# Mean score across the folds for each model. scikit-learn's "neg_*" scorers
# return negated errors, so values closer to 0 are better.
print(f'Linear Regression MSE score: {lr_cve.mean()}')
print(f'KNN MSE score:{knn_cve.mean()}')
print(f'SVM MSE score: {svm_cve.mean()}')

# Training your models

## Training an SVM

In [None]:
# Fit a support-vector regressor with default hyperparameters on the standardized training features.
# NOTE(review): the later "Training an SVM" cell fits an identical model as svr_model.
svm_model = SVR().fit(df_std_scaled_X, y_train)

## Analyzing Regression model results with MSE and R2 score

## Training an SVM 

In [None]:
# Fit the support-vector regressor on the standardized training features.
svr_model = SVR().fit(df_std_scaled_X, y_train)

# Scale the test split with the scalers that were fitted on the training data:
# transform only, never re-fit on the test rows.
# X_test_scaled stays min-max scaled because the KNN cell below predicts from it.
X_test_scaled = pd.DataFrame(min_max_scaler.transform(X_test), columns=X_test.columns)
# The SVR was trained on standardized features, so it must predict from standardized test features.
X_test_std_scaled = pd.DataFrame(std_scaler.transform(X_test), columns=X_test.columns)

y_pred_svr = svr_model.predict(X_test_std_scaled)
y_pred_svr

## Training a KNN

In [None]:
# Fit a default k-nearest-neighbours regressor on the min-max scaled training features,
# then predict quality for the scaled test features.
knn_model = KNeighborsRegressor()
knn_model.fit(minmax_scaled_X, y_train)

y_pred_knn = knn_model.predict(X_test_scaled)

In [None]:
from sklearn.metrics import mean_squared_error 
# Mean squared error of the KNN predictions on the held-out test split.
MSE_knn = mean_squared_error(y_test,y_pred_knn)
print(MSE_knn)

## Linear Regression

In [None]:
y_train

In [None]:
# Ordinary least-squares fit on the unscaled training features.
lr = LinearRegression().fit(X_train, y_train)
print(lr)

In [None]:
# Predict quality for the held-out test features (unscaled, matching how lr was fitted).
y_pred = lr.predict(X_test)
y_pred

In [None]:
print(y_pred[5:20])

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Report each error metric under its proper name: MSE_lr is the mean squared error,
# which makes it directly comparable with MSE_knn, and MAE_lr is the mean absolute error.
MSE_lr = mean_squared_error(y_test,y_pred)
MAE_lr = mean_absolute_error(y_test,y_pred)
print(f'MSE: {MSE_lr:.2f}')
print(f'MAE: {MAE_lr:.2f}')

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the linear-regression predictions on the test split.
r2_wine = r2_score(y_true=y_test, y_pred=y_pred)
print(f'{r2_wine:.2f}')