# Final Project

Paul Parks, Alden Caterio, Mayank Bhatt

In [131]:
# imports
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import random

## Datasets

In [132]:
# Load the white-wine dataset; the file is semicolon-delimited, hence sep=';'.
wine_white = pd.read_csv('../Dataset/wine+quality/winequality-white.csv', sep=';')
# Last expression of the cell: per-column summary statistics rendered as the cell output.
wine_white.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [133]:
# Load the red-wine dataset; the file is semicolon-delimited, hence sep=';'.
wine_red = pd.read_csv('../Dataset/wine+quality/winequality-red.csv', sep=';')
# Last expression of the cell: per-column summary statistics rendered as the cell output.
wine_red.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


## Generalized Linear Model Regression

In [134]:
def create_glm_fitted_model(df):
    """Fit a statsmodels GLM that regresses ``quality`` on all other columns.

    Args:
        df: DataFrame holding a 'quality' column (the response) alongside the
            predictor columns.

    Returns:
        The fitted results object produced by ``sm.GLM(...).fit()``.
    """
    target = df['quality']

    # Everything except the response is a predictor; add_constant supplies
    # the intercept column ('const') that GLM does not add on its own.
    predictors = sm.add_constant(df.drop(columns='quality'))

    # No family is passed, so statsmodels' default family is used.
    return sm.GLM(target, predictors).fit()

In [135]:
# Fit the GLM on the red-wine data and print the statsmodels summary table
# (coefficients, standard errors and p-values for each predictor).
wine_red_results = create_glm_fitted_model(wine_red)
print(wine_red_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                quality   No. Observations:                 1599
Model:                            GLM   Df Residuals:                     1587
Model Family:                Gaussian   Df Model:                           11
Link Function:               Identity   Scale:                         0.41992
Method:                          IRLS   Log-Likelihood:                -1569.1
Date:                Fri, 16 Jun 2023   Deviance:                       666.41
Time:                        15:16:49   Pearson chi2:                     666.
No. Iterations:                     3   Pseudo R-squ. (CS):             0.4286
Covariance Type:            nonrobust                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   21.9652 

```
For the red wine model, the variables 'volatile acidity', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates', and 'alcohol' are statistically significant predictors of wine quality at the 5% significance level, because their p-values are less than 0.05.
```

In [136]:
# Fit the GLM on the white-wine data and print the statsmodels summary table
# (coefficients, standard errors and p-values for each predictor).
wine_white_results = create_glm_fitted_model(wine_white)
print(wine_white_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                quality   No. Observations:                 4898
Model:                            GLM   Df Residuals:                     4886
Model Family:                Gaussian   Df Model:                           11
Link Function:               Identity   Scale:                         0.56454
Method:                          IRLS   Log-Likelihood:                -5543.7
Date:                Fri, 16 Jun 2023   Deviance:                       2758.3
Time:                        15:16:49   Pearson chi2:                 2.76e+03
No. Iterations:                     3   Pseudo R-squ. (CS):             0.3240
Covariance Type:            nonrobust                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  150.1928 

```
For the white wine model, the variables 'volatile acidity', 'residual sugar', 'free sulfur dioxide', 'density', 'pH', 'sulphates', and 'alcohol' are statistically significant predictors of wine quality at the 5% significance level, because their p-values are less than 0.05.
```

## Predictions - Red and White Wine

In [137]:
def predict_wine_using_df(df, results):
    """Predict the quality of one randomly chosen row of ``df`` and print it
    next to the actual quality of that row.

    Args:
        df: DataFrame holding a 'quality' column plus the predictor columns
            the model was fitted on.
        results: Fitted results object exposing ``predict``.

    Raises:
        ValueError: If ``df`` has no rows to sample from.
    """
    if len(df) == 0:
        raise ValueError('df must contain at least one row')

    X = df.drop('quality', axis=1)
    X = sm.add_constant(X)
    y = df['quality']

    # randrange excludes its upper bound. randint(0, len(df)) is inclusive and
    # could return len(df), which is one past the last valid position.
    index = random.randrange(len(df))

    # Select with a list so the row stays a one-row DataFrame; predict then
    # receives 2-D input with the same columns as the fitted design matrix.
    row = X.iloc[[index]]
    predicted_quality = results.predict(row)

    # Positional access: the prediction is labelled with the sampled row's
    # index label, which is generally not 0.
    print('Predicted wine quality:', predicted_quality.iloc[0])
    print('Actual wine quality:', y.iloc[index])

In [138]:
def predict_wine_using_fake_data(results):
    """Print the predicted quality of a single hard-coded, made-up wine sample.

    Args:
        results: Fitted results object exposing ``predict``; it is handed a
            one-row DataFrame containing a 'const' column followed by the
            eleven feature columns.
    """
    # One record; key order defines the column order passed to the model
    # (intercept column first, then the features).
    sample = {
        'const': 1,
        'fixed acidity': 8.5,
        'volatile acidity': 0.8,
        'citric acid': 0.56,
        'residual sugar': 1.8,
        'chlorides': 0.077,
        'free sulfur dioxide': 10.0,
        'total sulfur dioxide': 37.0,
        'density': 0.9968,
        'pH': 3.2,
        'sulphates': 0.68,
        'alcohol': 9.8,
    }
    # A list holding one dict yields a single-row frame with index 0.
    sample_frame = pd.DataFrame([sample])
    prediction = results.predict(sample_frame)
    print(f'Predicted wine quality: {prediction[0]}')

In [139]:
# Spot-check each fitted model on one row drawn at random from its own dataset.
# No random seed is set in the visible cells, so the sampled row (and the
# numbers printed below) can differ from run to run.
print('\nRed Wine prediction: \n')
predict_wine_using_df(wine_red, wine_red_results)

print('\nWhite Wine prediction: \n')
predict_wine_using_df(wine_white, wine_white_results)


Red Wine prediction: 

Predicted wine quality: 6.140502660855608
Actual wine quality: 7

White Wine prediction: 

Predicted wine quality: 6.2917592070826505
Actual wine quality: 6


In [140]:
# Feed the same hard-coded sample to both fitted models to compare their predictions.
print('\nRed Wine prediction: \n')
predict_wine_using_fake_data(wine_red_results)

print('\nWhite Wine prediction: \n')
predict_wine_using_fake_data(wine_white_results)


Red Wine prediction: 

Predicted wine quality: 5.198635646186875

White Wine prediction: 

Predicted wine quality: 4.144469325543219
