# Red Wine Quality Project
***
***

# Goal
***
- Predict the quality of red wines based on some of their various physiochemical attributes

# Setup
***

In [1]:
# establishing environment
import sklearn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

import warnings
warnings.filterwarnings("ignore")

# Acquire
Acquiring the data for this project
***

In [2]:
# acquiring data from local csv
wines = pd.read_csv('winequality-red.csv')

wines.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# Prepare
Preparing the data for this project

## Changing spaces in column names to '_'

In [3]:
# replacing spaces with underscores in column names
wines.columns = wines.columns.str.replace(' ', '_')

## Splitting data

In [4]:
# splitting data
train_validate, test = train_test_split(wines, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

## Scaling data

In [14]:
# creating empty list
col_to_scale = []

# filling list with names of columns to scale
for col in wines:
    if (wines[col].min() < 0 or wines[col].max() > 1) and col != 'quality':
        col_to_scale.append(col)

col_to_scale

['fixed_acidity',
 'volatile_acidity',
 'residual_sugar',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [21]:
# creating list of DFs
df_list = ['train', 'validate', 'test']

# creating empty list to store scaled column names in
scaled_col_names = []

# creating cols that will hold scaled values
# filling empty list with scaled column names
for df in df_list:
    for col in col_to_scale:
        wine_d[df][col + '_s'] = 0
        scaled_col_names.append(col + '_s')

# reducing list of scaled column names to only unique values
scaled_col_names = list(set(scaled_col_names))

scaled_col_names

['residual_sugar_s',
 'alcohol_s',
 'density_s',
 'total_sulfur_dioxide_s',
 'pH_s',
 'sulphates_s',
 'fixed_acidity_s',
 'free_sulfur_dioxide_s',
 'volatile_acidity_s']

In [22]:
# creating scaler object
scaler = sklearn.preprocessing.MinMaxScaler()

# fitting scaler to train column and scaling after
train[scaled_col_names] = scaler.fit_transform(train[col_to_scale])

# scaling data in validate and test dataframes
validate[scaled_col_names] = scaler.transform(validate[col_to_scale])
test[scaled_col_names] = scaler.transform(test[col_to_scale])

# Explore
Exploring the data to find insights about what drives wine quality