# Final Project

Paul Parks, Alden Caterio, Mayank Bhatt

In [None]:
# imports
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.metrics import mean_absolute_error
from tabulate import tabulate

## Datasets

In [None]:
# Load the white-wine CSV (semicolon-separated) and display summary statistics.
wine_white = pd.read_csv('../Dataset/wine+quality/winequality-white.csv', sep=';')
wine_white.describe()

In [None]:
# Load the red-wine CSV (semicolon-separated) and display summary statistics.
wine_red = pd.read_csv('../Dataset/wine+quality/winequality-red.csv', sep=';')
wine_red.describe()

In [None]:
# Column names that the helper functions in this notebook iterate over:
# eleven measurement columns followed by the 'quality' score.
columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality'
]

## Boxplot all data to view outliers

In [None]:
def do_boxplot(data):
    """Draw one box plot per entry of the module-level ``columns`` list.

    The plots are arranged on a 3x4 grid of subplots and the figure is shown
    immediately.

    Parameters
    ----------
    data : DataFrame-like
        Must support ``data[name]`` for every name in ``columns``.
    """
    # 6/19/23 ACaterio: figsize lowered from (15,10) to (10,8) so the figure
    # fits into the screenshot used for the EDA section of the report.
    fig, grid = plt.subplots(nrows=3, ncols=4, figsize=(10, 8))
    panels = grid.ravel()
    for position in range(len(columns)):
        column = columns[position]
        panel = panels[position]
        panel.boxplot(data[column])
        panel.set_title(f'Boxplot of {column}')
    plt.tight_layout()
    plt.show()

In [None]:
# Box plots of every column in the red-wine data.
print('BoxPlots Red Wine')
do_boxplot(wine_red)

In [None]:
# Box plots of every column in the white-wine data.
print('BoxPlots White Wine')
do_boxplot(wine_white)

In [None]:
# Row counts of each dataset; passed to countOutliers as its `n` argument.
wine_red_n = len(wine_red)
wine_white_n = len(wine_white)

def countOutliers(df, df_str, n):
    """Print a table of IQR-rule outlier counts for every column of ``df``.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of its own column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; quartiles are computed column by column.
    df_str : str
        Human-readable dataset name, printed as a heading above the table.
    n : int
        Number of rows used as the denominator of the percentage column.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    outlier_counts = ((df < (q1 - 1.5 * iqr)) | (df > (q3 + 1.5 * iqr))).sum()

    rows = []
    # Take each label from the Series itself so that a count can never be
    # paired with the wrong attribute name.
    for name, count in outlier_counts.items():
        count = int(count)
        # Use the caller-supplied row count (not a fixed dataset) so the
        # percentage is correct for whichever frame was passed in.
        percentage = round(count / n * 100, 2) if n else 0.0
        rows.append([name, count, percentage])

    print(f'Outliers: {df_str}')
    print(tabulate(rows, headers=['Attribute', 'Total Outliers', 'Percentage'], tablefmt="fancy_grid"))

In [None]:
# Outlier table for the red-wine data.
countOutliers(wine_red, "Red Wine", wine_red_n)

In [None]:
# Outlier table for the white-wine data.
countOutliers(wine_white, "White Wine", wine_white_n)

## Correlation and Variation

In [None]:
def createCorrMatr(df, df_str, color):
    """Plot an annotated correlation heatmap for ``df``.

    Columns are ordered by their correlation with ``'quality'``, largest
    first, keeping at most ``len(columns)`` of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'quality'`` column.
    df_str : str
        Dataset name appended to the figure title.
    color : str
        Colour-map name forwarded to ``sns.heatmap`` as ``cmap``.
    """
    quality_corr = df.corr().nlargest(len(columns), 'quality')['quality']
    ordered_cols = quality_corr.index
    corr_matrix = df[ordered_cols].corr()

    plt.figure(figsize=(10, 8))
    plt.title(f"Correlation Matrix: {df_str}")
    sns.heatmap(corr_matrix, annot=True, cmap=color)

# One heatmap per dataset, each drawn with its own colour map.
createCorrMatr(wine_red, 'Red Wine', 'plasma')
createCorrMatr(wine_white, 'White Wine', 'GnBu')

## Probability of Scores

In [None]:
def get_probability(df):
    """Plot a normal PDF fitted to the ``'quality'`` column of ``df``.

    The mean and standard deviation of the quality scores parameterise a
    normal distribution, whose density is evaluated at every (sorted) score
    and drawn on the current matplotlib axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'quality'`` column. It is not modified.
    """
    # Sort a copy of the column rather than the caller's frame: an in-place
    # sort would silently reorder the shared DataFrame for every later cell.
    quality = df["quality"].sort_values()
    quality_mean = np.mean(quality)
    quality_std = np.std(quality)
    pdf = stats.norm.pdf(quality, quality_mean, quality_std)

    plt.xlabel('Quality')
    plt.ylabel('Probability')
    plt.title('PDF of Quality')
    plt.plot(quality, pdf)

In [None]:
# Fitted normal PDF of the red-wine quality scores.
get_probability(wine_red)

In [None]:
# Fitted normal PDF of the white-wine quality scores.
get_probability(wine_white)

## Remove all outliers

In [None]:
def remove_all_outliers(data_source):
    """Return a copy of ``data_source`` with IQR-rule outliers removed.

    The module-level ``columns`` list is processed in order. For each column
    the quartiles are computed on the rows that survived the previous columns,
    and only rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are kept.

    Parameters
    ----------
    data_source : pandas.DataFrame
        Input data; it is copied first and left unmodified.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping their original index labels.
    """
    filtered = data_source.copy()
    for name in columns:
        lower_quartile = filtered[name].quantile(0.25)
        upper_quartile = filtered[name].quantile(0.75)
        spread = upper_quartile - lower_quartile
        low_fence = lower_quartile - 1.5*spread
        high_fence = upper_quartile + 1.5*spread
        # between() is inclusive on both ends by default.
        within_fences = filtered[name].between(low_fence, high_fence)
        filtered = filtered[within_fences]
    return filtered

In [None]:
# Outlier-filtered versions of both datasets, stored under new names.
wine_red_cleaned = remove_all_outliers(wine_red)
wine_white_cleaned = remove_all_outliers(wine_white)

## Generalized Linear Model Regression

In [None]:
def create_glm_fitted_model(df):
    """Fit a statsmodels GLM that predicts ``'quality'`` from all other columns.

    An intercept column is added to the predictors before fitting. No family
    is passed to ``sm.GLM``, so statsmodels' default family applies
    (NOTE(review): Gaussian according to the statsmodels docs; confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a ``'quality'`` column; every other column is a predictor.

    Returns
    -------
    The fitted results object returned by ``sm.GLM(...).fit()``.
    """
    target = df['quality']
    predictors = sm.add_constant(df.drop('quality', axis=1))
    return sm.GLM(target, predictors).fit()

In [None]:
# Fit on the unfiltered red-wine data and print the regression summary.
wine_red_results = create_glm_fitted_model(wine_red)
print(wine_red_results.summary())

```
The variables 'volatile acidity', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates', and 'alcohol' are statistically significant predictors of wine quality because their p-values are less than 0.05.
```

In [None]:
# Fit on the unfiltered white-wine data and print the regression summary.
wine_white_results = create_glm_fitted_model(wine_white)
print(wine_white_results.summary())

```
The variables 'volatile acidity', 'residual sugar', 'free sulfur dioxide', 'density', 'pH', 'sulphates', and 'alcohol' are statistically significant predictors of wine quality because their p-values are less than 0.05.
```

In [None]:
# Same model, fitted on the outlier-filtered red-wine data.
wine_red_cleaned_results = create_glm_fitted_model(wine_red_cleaned)
print(wine_red_cleaned_results.summary())

In [None]:
# Same model, fitted on the outlier-filtered white-wine data.
wine_white_cleaned_results = create_glm_fitted_model(wine_white_cleaned)
print(wine_white_cleaned_results.summary())

## Predictions - Red and White Wine

In [None]:
def quality_histogram(X, y, results):
    """Overlay kernel-density estimates of predicted and actual quality scores.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix (including the constant column) to predict from.
    y : pandas.Series
        Actual quality scores, aligned row by row with ``X``.
    results : fitted statsmodels results
        Object whose ``predict`` method produces the predicted scores.
    """
    # Predict every row in one call (as get_mse_predictions does) instead of
    # one call per row. Going through NumPy also takes the values by position,
    # whether predict returns an ndarray or a label-indexed pandas Series.
    predicted_scores = np.asarray(results.predict(X)).ravel().tolist()
    actual_scores = np.asarray(y).ravel().tolist()

    sns.kdeplot(predicted_scores, label='Predicted Score')
    sns.kdeplot(actual_scores, label="Actual Score")

    plt.legend()
    plt.show()

def predict_wine_using_df(df_source, results):
    """Report model error, spot-check one random wine, and plot score densities.

    Steps: print the mean absolute error over the whole frame, pick one row at
    random and print its predicted, rounded and actual quality, then draw the
    predicted-vs-actual density plot.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Data with a ``'quality'`` column; it is copied and left unmodified.
    results : fitted statsmodels results
        Model used for the predictions.

    Raises
    ------
    ValueError
        If ``df_source`` has no rows (there is nothing to sample).
    """
    df = df_source.copy()

    get_mse_predictions(df, results)

    X = df.drop('quality', axis=1)
    X = sm.add_constant(X)
    y = df['quality']
    # random.randint includes its upper bound, so randint(0, len(df)) could
    # return len(df) and make iloc raise IndexError. randrange excludes it.
    index = random.randrange(len(df))
    row = X.iloc[index]
    # Take the single prediction by position via NumPy; predict may return a
    # label-indexed Series, for which `[0]` is not a reliable positional lookup.
    predicted_quality = np.asarray(results.predict(row)).ravel()[0]
    print('Predicted wine quality:', predicted_quality)
    print('Predicted wine quality rounded:', round(predicted_quality))
    print('Actual wine quality:', y.iloc[index])

    quality_histogram(X, y, results)

In [None]:
def get_mse_predictions(df, results):
    """Print the mean absolute error of ``results`` on ``df``.

    Note: despite the ``mse`` in its name, this function computes and prints
    the mean *absolute* error (``sklearn.metrics.mean_absolute_error``).

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a ``'quality'`` column; every other column is a predictor.
    results : fitted statsmodels results
        Model whose ``predict`` output is compared against ``df['quality']``.
    """
    actual = df['quality']
    design = sm.add_constant(df.drop('quality', axis=1))
    mae = mean_absolute_error(actual, results.predict(design))
    print(f'Mean Absolute Error: {mae}')


In [None]:
def predict_simulated_best_wine(data_source, results):
    """Predict the score of the best wine after pushing key attributes further.

    Takes the first row with the highest ``'quality'`` in ``data_source``,
    overwrites five of its attributes with hand-picked values, and prints the
    modified row together with the model's rounded prediction for it.

    Parameters
    ----------
    data_source : pandas.DataFrame
        Data with a ``'quality'`` column; it is copied and left unmodified.
    results : fitted statsmodels results
        Model used to score the modified wine.
    """
    print('Take the best scoring wine in the dataset and make it even better.')
    data = data_source.copy()
    # get the best scoring wine in the real dataset
    X = sm.add_constant(data)
    max_quality_index = X['quality'].idxmax()
    max_quality_row = X.loc[max_quality_index]
    actual_score = max_quality_row['quality']
    print(f'Actual quality: {actual_score}')
    max_quality_row = max_quality_row.drop('quality')

    # Statistically significant values for both red and white wines
    # tldr how to get a 11/10 wine
    max_quality_row['alcohol'] = 15 #high alcohol
    max_quality_row['sulphates'] = 2 #high sulphates
    max_quality_row['volatile acidity'] = 0.1 #low volatile acidity
    max_quality_row['total sulfur dioxide'] = 30 # low total sulfur dioxide
    max_quality_row['pH'] = 2 # low pH

    print(max_quality_row)
    # Take the single prediction by position via NumPy; predict may return a
    # label-indexed Series, for which `[0]` is not a reliable positional lookup.
    predicted_quality = np.asarray(results.predict(max_quality_row)).ravel()[0]
    print(f'\nPredicted wine quality: {round(predicted_quality)}\n')


In [None]:
# Unfiltered red-wine data scored with the model fitted on that same data.
print('\nRed Wine prediction: \n')
predict_wine_using_df(wine_red, wine_red_results)

In [None]:
# Unfiltered white-wine data scored with the model fitted on that same data.
print('\nWhite Wine prediction: \n')
predict_wine_using_df(wine_white, wine_white_results)

In [None]:
# Outlier-filtered red-wine data scored with the model fitted on it.
print('\nRed Wine Cleaned prediction: \n')
predict_wine_using_df(wine_red_cleaned, wine_red_cleaned_results)

In [None]:
# Outlier-filtered white-wine data scored with the model fitted on it.
print('\nWhite Wine Cleaned prediction: \n')
predict_wine_using_df(wine_white_cleaned, wine_white_cleaned_results)

In [None]:
# NOTE(review): the best wine is taken from the unfiltered red-wine frame but
# scored with the model fitted on the outlier-filtered data. Confirm this
# pairing is intended.
print('\nRed Wine prediction: \n')
predict_simulated_best_wine(wine_red, wine_red_cleaned_results)

In [None]:
# NOTE(review): the best wine is taken from the unfiltered white-wine frame but
# scored with the model fitted on the outlier-filtered data. Confirm this
# pairing is intended.
print('\nWhite Wine prediction: \n')
predict_simulated_best_wine(wine_white, wine_white_cleaned_results)

In [None]:
def predict_simulated_best_wine_only_modify_pH_and_alcohol(data_source, results):
    """Compare predicted scores before and after adjusting alcohol and pH.

    Every wine is scored twice: once as-is, and once with ``'alcohol'`` raised
    by 1.5 and ``'pH'`` lowered by 1.5. Both score distributions are plotted
    as kernel-density estimates, and the mean per-wine score change is printed.

    Parameters
    ----------
    data_source : pandas.DataFrame
        Data with ``'quality'``, ``'alcohol'`` and ``'pH'`` columns; it is
        copied and left unmodified.
    results : fitted statsmodels results
        Model used for both sets of predictions.
    """
    data = data_source.copy()
    # Add the constant first and drop the target afterwards, so the design
    # matrix has the same column order as before.
    X = sm.add_constant(data).drop('quality', axis=1)

    X_modified = X.copy()
    X_modified['alcohol'] = X_modified['alcohol'] + 1.5
    X_modified['pH'] = X_modified['pH'] - 1.5

    # Score all rows at once. This removes the label-based `.loc[i]` lookup,
    # which only works when the index happens to be 0..n-1.
    predicted_scores_original = np.asarray(results.predict(X)).ravel()
    predicted_scores_with_modifications = np.asarray(results.predict(X_modified)).ravel()

    # Keep one difference per wine so the printed value is a true average,
    # not just the difference for the last row processed.
    score_diff = predicted_scores_with_modifications - predicted_scores_original

    sns.kdeplot(predicted_scores_original.tolist(), label='Predicted Score (Original)')
    sns.kdeplot(predicted_scores_with_modifications.tolist(), label="Predicted Score (+1.5% abv -1.5%pH)")

    plt.legend()
    plt.show()
    print(f'Average Score difference (Score point out of 10): {np.mean(score_diff)}')

In [None]:
# pH and alcohol are both easily adjustable post fermentation.
# What would happen to our wine scores if we increased alcohol and decreased pH?
# NOTE(review): the unfiltered red-wine frame is scored with the model fitted
# on the outlier-filtered data. Confirm this pairing is intended.

predict_simulated_best_wine_only_modify_pH_and_alcohol(wine_red, wine_red_cleaned_results)

In [None]:
# Same experiment for white wine.
# NOTE(review): the unfiltered frame is scored with the model fitted on the
# outlier-filtered data. Confirm this pairing is intended.
predict_simulated_best_wine_only_modify_pH_and_alcohol(wine_white, wine_white_cleaned_results)

`
Increasing alcohol by 1.5 percentage points (ABV) and lowering pH by 1.5 units raises the predicted quality by about one whole score point on average.
`