In [2]:
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
data = load_wine()

In [3]:
df = pd.DataFrame(data['data'], columns=list(data.feature_names))

In [4]:
df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [9]:
df.shape

(178, 13)

## Preprocessing

Preprocessing the data as necessary to ensure that all values can be processed by a classification model.

- All of the data is numeric, so none needs to be converted from text to numbers
- Scaling IS necessary though! Use StandardScaler instead of MinMaxScaler because is much less affected by outliers
- No features appear to be categorical, so all can be run through the StandardScaler


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Create pipeline with standard scaler
num_pipeline = Pipeline([
  ('scaler', StandardScaler()),
])

In [6]:
df_std = num_pipeline.fit_transform(df)

## Correlations

Observing the correlation dataframe and graphs that have somewhat of a linear trend in the correlation matrix, these are the highest correlating features:

- total_phenols and flavanoids
- od280/od315_of_diluted_wines and flavanoids
- od280/od315_of_diluted_wines and total_phenols
- proanthocyanins and flavanoids

In [12]:
# Highest correlating attributes: attr1 and attr2 and the correlation between them, sorted by top correlating
def get_correlations(df):
  corrs = {'attr1': [], 'attr2': [], 'corr': []}

  for attr1 in df.columns:
    for attr2 in df.columns:
      if attr1 != attr2:
        corrs['attr1'].append(attr1)
        corrs['attr2'].append(attr2)
        corrs['corr'].append(df[attr1].corr(df[attr2]))

  return pd.DataFrame(corrs).sort_values('corr', ascending=False).head(7)

In [13]:
get_correlations(df)

Unnamed: 0,attr1,attr2,corr
65,total_phenols,flavanoids,0.864564
77,flavanoids,total_phenols,0.864564
138,od280/od315_of_diluted_wines,flavanoids,0.787194
82,flavanoids,od280/od315_of_diluted_wines,0.787194
137,od280/od315_of_diluted_wines,total_phenols,0.699949
70,total_phenols,od280/od315_of_diluted_wines,0.699949
102,proanthocyanins,flavanoids,0.652692
