In [1]:
#tabular data imports :
import pandas as pd
import numpy as np
from pydataset import data

# visualization imports:
import matplotlib.pyplot as plt
import seaborn as sns
# Custom palette with specified colors
custom_palette = sns.color_palette(["#33B8C6", "#C64133", "#4682B4"])
# Using a matplotlib colormap
custom_palette2 = sns.color_palette(["#33B8C6","#50C2CE","#6DCCD6","#8AD6DE","#A7E0E6","#C4EAEE","#E1F4F6"])

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

# success metrics from earlier in the week: mean squared error and r^2 explained variance
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

#stats
from scipy.stats import pearsonr, spearmanr
from scipy.stats import shapiro

import warnings
warnings.filterwarnings("ignore")
import wrangle as w
import explore as e
import os
directory = os.getcwd()
pd.set_option('display.max_columns', None)

In [2]:
# Calling the function that combines all the steps before and creates a .csv file from the df (to run faster and locally)
df = w.prepare_wine(w.red_wine_url, w.white_wine_url)
df.head(3)

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color,wine_quality
wine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
w-1,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,white,Average
w-2,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,white,Average
w-3,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,white,Average


# EXPLORATION:
## UNIVARIATE

In [3]:
e.summarize(df)

--------------------------------
--------------------------------
Information on DataFrame: 
Shape of Dataframe: (6497, 14)
--------------------------------
Basic DataFrame info:
<class 'pandas.core.frame.DataFrame'>
Index: 6497 entries, w-1 to r-1599
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         6497 non-null   float64
 1   volatile_acidity      6497 non-null   float64
 2   citric_acid           6497 non-null   float64
 3   residual_sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free_sulfur_dioxide   6497 non-null   float64
 6   total_sulfur_dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 

In [4]:
df.columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'color', 'wine_quality'],
      dtype='object')

In [5]:
cols = ['fixed_acidity','volatile_acidity','citric_acid','residual_sugar','chlorides',\
        'free_sulfur_dioxide','total_sulfur_dioxide','density','pH','sulphates',\
        'alcohol','quality','color','wine_quality']

# Determine the number of rows needed for subplots
n_rows = len(cols) // 2 + len(cols) % 2
fig, axes = plt.subplots(n_rows, 2, figsize=(16, n_rows*4))

for i, col in enumerate(cols):
    ax = axes.flatten()[i]
    sns.histplot(df[col], bins=50, ax=ax, palette='viridis')  # Using seaborn for a more refined look
    ax.set_title(col.upper(), fontsize=12)  # Set title for each subplot
    ax.tick_params(axis='x', rotation=30)   # Rotate x-axis labels
    ax.grid(False)  # Hide gridlines

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

In [6]:
# comparing the numbers of wine
pd.crosstab(df.color, df.wine_quality)

wine_quality,Average,High,Low
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
red,1518,18,63
white,4535,180,183


## BIVARIATE

In [7]:
# split data
train, validate, test = e.split_data(df)


    train -> (3898, 14)
    validate -> (1299, 14)
    test -> (1300, 14)


In [8]:
train.head(3)

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color,wine_quality
wine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
w-2549,6.3,0.26,0.42,7.1,0.045,62.0,209.0,0.99544,3.2,0.53,9.5,6,white,Average
w-4295,7.4,0.22,0.28,9.0,0.046,22.0,121.0,0.99468,3.1,0.55,10.8,5,white,Average
r-956,8.5,0.21,0.52,1.9,0.09,9.0,23.0,0.99648,3.36,0.67,10.4,5,red,Average


In [9]:
# Figuring out how many unique values each column has
for col in cols:
    unique_count = df[col].nunique()
    print(f'# of uniques in {col}: {unique_count}')

# of uniques in fixed_acidity: 106
# of uniques in volatile_acidity: 187
# of uniques in citric_acid: 89
# of uniques in residual_sugar: 316
# of uniques in chlorides: 214
# of uniques in free_sulfur_dioxide: 135
# of uniques in total_sulfur_dioxide: 276
# of uniques in density: 998
# of uniques in pH: 108
# of uniques in sulphates: 111
# of uniques in alcohol: 111
# of uniques in quality: 7
# of uniques in color: 2
# of uniques in wine_quality: 3


# pairplot with hue of quality_category
sns.pairplot(data=train, corner=True, hue='wine_quality', palette=custom_palette, plot_kws={'alpha': 1})
plt.show()

# pairplot with hue of color
sns.pairplot(data=train, corner=True, hue='color', palette=custom_palette)
plt.show()

# pairplot with hue of quality
sns.pairplot(data=train, corner=True, hue='quality', palette='viridis')
plt.show()

# Determine the number of rows needed for subplots
n_rows = len(cols) // 2 + len(cols) % 2
fig, axes = plt.subplots(n_rows, 2, figsize=(16, n_rows*4))

for i, col in enumerate(cols):
    ax = axes.flatten()[i]
    # Use seaborn for a more refined look with hue based on 'color'
    sns.histplot(df, x=col, hue='color', bins=50, ax=ax, palette=custom_palette)  # 'viridis' is an example palette, you can change it
    ax.set_title(col.upper(), fontsize=12)  # Set title for each subplot
    ax.tick_params(axis='x', rotation=30)   # Rotate x-axis labels
    ax.grid(False)  # Hide gridlines

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

In [10]:
df.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color,wine_quality
wine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
w-1,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,white,Average
w-2,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,white,Average
w-3,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,white,Average
w-4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white,Average
w-5,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white,Average


In [11]:
train_scaled, validate_scaled, test_scaled = e.preprocess_wine(df)


    train -> (3898, 15)
    validate -> (1299, 15)
    test -> (1300, 15)


In [12]:
train_scaled.head(3)

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,white,High,Low
wine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
w-2549,0.165217,0.144,0.42,0.099693,0.055092,0.419244,0.563107,0.160594,0.362205,0.169492,0.217391,0.5,1.0,0.0,0.0
w-4295,0.26087,0.112,0.28,0.128834,0.056761,0.14433,0.319001,0.145942,0.283465,0.180791,0.405797,0.333333,1.0,0.0,0.0
r-956,0.356522,0.104,0.52,0.019939,0.130217,0.054983,0.047157,0.180644,0.488189,0.248588,0.347826,0.333333,0.0,0.0,0.0


In [13]:
X1 =train_scaled[['density', 'pH']]

In [14]:
e.create_cluster(train, X1, 2)

(         fixed_acidity  volatile_acidity  citric_acid  residual_sugar  \
 wine_id                                                                 
 w-2549             6.3              0.26         0.42             7.1   
 w-4295             7.4              0.22         0.28             9.0   
 r-956              8.5              0.21         0.52             1.9   
 w-1800             6.4              0.26         0.21             7.1   
 w-3442             7.1              0.25         0.28             1.2   
 ...                ...               ...          ...             ...   
 w-2565             6.6              0.41         0.16             1.4   
 w-1760             6.6              0.62         0.20             8.7   
 w-4362             6.5              0.20         0.33             1.5   
 r-612             13.2              0.38         0.55             2.7   
 r-1240             6.5              0.67         0.00             4.3   
 
          chlorides  free_sulfur_dio