In [None]:
#Imports all the necessary tools to conduct our analysis. 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import seaborn as sns
import io
import pandas as pd
import ipywidgets as widgets
import voila
# Apply seaborn's "whitegrid" style globally so every plot below shares the same look
sns.set(style="whitegrid")


## A quick preview of the data table we'll be working with. 

In [None]:
# Location of the raw wine-quality CSV (hosted on GitHub)
winedata_csv = 'https://raw.githubusercontent.com/SpencerRW117/WineAnalysis/main/winequalityN.csv'
# Read the CSV into `df`, the primary dataframe used by every later cell
df = pd.read_csv(filepath_or_buffer=winedata_csv)
# Preview the first five rows
df.head(n=5)

# We begin our analysis with a descriptive overview of our data

## A tableview of descriptive statistics (DESCRIPTIVE VISUALIZATION 1)

In [None]:
# Drop every row that contains a null value BEFORE deriving anything from the frame,
# so that all later cells (statistics, plots, model) work from the same set of rows.
# Rebinding `df` instead of using inplace=True keeps this cell safe to re-run.
df = df.dropna()
# Define quality_score from the cleaned frame. Taking it before the drop would leave
# it holding the rows that were removed, so the histogram would disagree with the
# statistics and the model.
quality_score = df['quality']
# Generate a table of descriptive statistics for each numeric variable
df.describe()

## Quality score distribution (DESCRIPTIVE VISUALIZATION 2)

In [None]:
# Display a histogram showing the distribution of quality scores throughout the dataset
# NOTE(review): if quality scores are whole numbers, 5 equal-width bins may merge
# neighbouring scores unevenly — consider one bin per score; confirm against the data.
plt.hist(quality_score, bins = 5)
# Label the figure so it can be read on its own
plt.xlabel("Quality score")
plt.ylabel("Number of wines")
plt.title("Distribution of quality scores")
# Render the figure and suppress the raw (counts, edges, patches) tuple output
plt.show()

## Heatmap of correlation values between chemical attributes (DESCRIPTIVE VISUALIZATION 3)

In [None]:
# Create a heatmap display showing the correlation coefficients for each variable
plt.figure(figsize = (12, 12))
# Correlation is only defined for numeric columns. Selecting them explicitly keeps this
# cell working on pandas >= 2.0, where DataFrame.corr() raises if the frame contains a
# non-numeric column (older versions silently skipped such columns).
train_corr = df.select_dtypes(include="number").corr()
# annot/fmt print each coefficient to one decimal place inside its cell
sns.heatmap(train_corr, cmap = "coolwarm", annot=True, fmt='.1f', linewidths = 0.05)
plt.title("Correlation between numeric variables")
# Render the figure and suppress the Axes repr output
plt.show()

## A scatterplot of quality score vs alcohol content (DESCRIPTIVE VISUALIZATION 4)

In [None]:
# Alcohol content has the strongest correlation to quality, so let's plot
# quality against alcohol together with a fitted regression line
sns.lmplot(data=df, x="alcohol", y="quality")

# With our new descriptive understanding, we continue by creating a predictive model for our data

In [None]:
# Model inputs: the eleven chemical measurements. Model output: the quality score.
x = df.loc[:, [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]]
y = df.loc[:, 'quality']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Build a 200-tree random forest regressor (seeded so results are repeatable)
# and fit it on the training split.
model = RandomForestRegressor(
    random_state=30,
    n_estimators=200,
)
model.fit(X=x_train, y=y_train)

## Use the sliders below to input the results of a chemical analysis, then press "Predict Quality" to receive a quality prediction.

In [35]:
# Create the 11 widget sliders for the chemical analysis parameters.
# Each slider's starting value must lie inside its own [min, max] range.
fixed_acidity = widgets.FloatSlider(value=7.2, min=3.8, max=10.0, step=0.01, description = "Fixed Acidity")
volatile_acidity = widgets.FloatSlider(value=0.33, min=0.08, max=1.0, step=0.01, description = "Volatile Acidity")
citric_acid = widgets.FloatSlider(value=0.33, min=0.0, max=1.0, step=0.01, description = "Citric Acid")
residual_sugar = widgets.FloatSlider(value=5.4, min=1, max=20, step=0.01, description = "Residual Sugar")
chlorides = widgets.FloatSlider(value=0.05, min=0.01, max=0.1, step=0.001, description = "Chlorides")
free_SO2 = widgets.IntSlider(value=30, min=1, max=110, step=1, description = "Free SO2")
total_SO2 = widgets.IntSlider(value=115, min=50, max=225, step=1, description = "Total SO2")
density = widgets.FloatSlider(value=0.99, min=0.95, max=1.0, step=0.0001, description = "Density")
pH = widgets.FloatSlider(value=3.2, min=2.75, max=3.80, step=0.01, description = "pH")
# The starting value was 0.0, which is below this slider's minimum of 0.35.
# Start at the midpoint of the allowed range instead.
sulfates = widgets.FloatSlider(value=0.55, min=0.35, max=0.75, step=0.001, description = "Sulfates")
alcohol = widgets.FloatSlider(value=10.5, min=8, max=15.0, step=0.01, description = "Alcohol")
# Create the container for the widgets. The slider order here matches the order
# in which their values are passed to the model by the prediction handler.
widget_box = widgets.VBox([fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_SO2, total_SO2,
                          density, pH, sulfates, alcohol])
# Wrap the slider column in a single titled tab
children = [widget_box]
tab = widgets.Tab()
tab.children = children
tab.set_title(0, "Chemical Properties")
# Display the container with the 11 sliders
tab


Tab(children=(VBox(children=(FloatSlider(value=7.2, description='Fixed Acidity', max=10.0, min=3.8, step=0.01)…

# To run additional predictions, adjust the sliders and click the "Predict Quality" button again. Re-running the cell below is not required: the button reads the current slider values on every click.

In [36]:
# Define an on_click handler for the predict button, display the button, and output our prediction.
def on_calculate_click(_):
    """Predict a quality score from the current slider values and print it.

    Called by ipywidgets when the button is clicked. The single positional
    argument (the clicked button) is not used.

    Reads the eleven slider values at click time, asks the trained ``model``
    for a prediction, and writes the prediction plus the model's mean absolute
    error on the held-out test split into the ``out`` output widget, replacing
    whatever was printed there before.
    """
    # Build a one-row frame carrying the same column names the model was fitted with.
    # Passing a bare list would make scikit-learn warn about missing feature names and
    # would rely purely on positional order. The slider order below matches x_train's
    # column order.
    features = pd.DataFrame(
        [[fixed_acidity.value, volatile_acidity.value, citric_acid.value, residual_sugar.value,
          chlorides.value, free_SO2.value, total_SO2.value,
          density.value, pH.value, sulfates.value, alcohol.value]],
        columns=x_train.columns,
    )
    prediction = model.predict(features)
    with out:
        # Remove the previous result so only the latest prediction is shown
        out.clear_output()
        print("##### PREDICTION #####")
        print("For a wine with the selected chemical properties, our model predicts a quality score of:", round(prediction[0], 2))
        print("##### ACCURACY #####")
        print("Mean Absolute Error: " , round(test_mae, 2))


# The test-set error depends only on the fitted model and the test split, so compute it
# once here rather than re-predicting the whole test set on every button click.
test_mae = mean_absolute_error(y_test, model.predict(x_test))

button = widgets.Button(description="Predict Quality")
# Output area that the handler prints into
out = widgets.Output()

button.on_click(on_calculate_click)
# Show the button with its output area directly underneath
widgets.VBox([button, out])

VBox(children=(Button(description='Predict Quality', style=ButtonStyle()), Output()))