# **Wine Quality Prediction Project: A Predictive Analytics Framework**

## **Objective**
- To predict the quality of wines based on their chemical properties.
- The goal is to build a predictive model that can classify wine quality into categories (e.g., low, medium, high) using various machine learning techniques.



## **Data Collection**

In [None]:
import pandas as pd

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
wine_data = pd.read_csv(url, delimiter=';')

wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## **Data Exploration and Cleaning**

In [None]:
# Data overview
wine_data.info()
wine_data.describe()

# Check for missing values
wine_data.isnull().sum()

print(wine_data.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total

##  **Data Preparation**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Bin the quality scores into three categories: low, medium, high
bins = [2, 5, 7, 9]  # Bins for low (3-4), medium (5-6), and high (7-8)
labels = ['low', 'medium', 'high']
wine_data['quality_label'] = pd.cut(wine_data['quality'], bins=bins, labels=labels, include_lowest=True)

# Feature selection (Remove original quality column from features)
X = wine_data.drop(columns=['quality', 'quality_label'])

# Target variable is now the binned quality labels
y = wine_data['quality_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## **Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Train the Logistic Regression model
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))  # Added zero_division=0


Confusion Matrix:
[[  0   0   6]
 [  0 165  58]
 [  0  69 182]]

Classification Report:
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         6
         low       0.71      0.74      0.72       223
      medium       0.74      0.73      0.73       251

    accuracy                           0.72       480
   macro avg       0.48      0.49      0.48       480
weighted avg       0.71      0.72      0.72       480



In [None]:
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Get the predicted probabilities for each class
y_pred_prob = log_reg.predict_proba(X_test)

## **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train the Random Forest model
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Confusion Matrix:
[[  0   0   6]
 [  0 171  52]
 [  0  50 201]]

Classification Report:
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         6
         low       0.77      0.77      0.77       223
      medium       0.78      0.80      0.79       251

    accuracy                           0.78       480
   macro avg       0.52      0.52      0.52       480
weighted avg       0.77      0.78      0.77       480



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Predict the probabilities on the test set
y_pred_prob = rf_model.predict_proba(X_test)

# Convert y_test to one-hot encoding if it's not already
y_test_one_hot = pd.get_dummies(y_test).values


## **SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Train the SVM model
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Predict the probabilities on the test set
y_pred_prob = svm_model.predict_proba(X_test)

# Convert y_test to one-hot encoding if it's not already
y_test_one_hot = pd.get_dummies(y_test).values


Confusion Matrix:
[[  0   0   6]
 [  0 170  53]
 [  0  78 173]]

Classification Report:
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         6
         low       0.69      0.76      0.72       223
      medium       0.75      0.69      0.72       251

    accuracy                           0.71       480
   macro avg       0.48      0.48      0.48       480
weighted avg       0.71      0.71      0.71       480



In [None]:
#!pip install dash



# Dashboard

In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.figure_factory as ff
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc

# Initialize Dash app
app = dash.Dash(__name__)

# Example models and metrics (you'll need to define your own models and metrics)
models = {
    "Logistic Regression": log_reg,
    "Random Forest": rf_model,
    "Support Vector Machine": svm_model
}

# Calculate metrics for each model
model_metrics = []
confusion_matrices = {}
roc_curves = {}
for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    model_metrics.append({'Model': name, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1})

    # Confusion Matrix and ROC Curve
    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices[name] = cm

    y_pred_prob = model.predict_proba(X_test)
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(y_pred_prob.shape[1]):
        fpr[i], tpr[i], _ = roc_curve(y_test.apply(lambda x: 1 if x == y_test.unique()[i] else 0), y_pred_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    roc_curves[name] = (fpr, tpr, roc_auc)

# Define layout with enhanced features
app.layout = html.Div([
    html.H1("Model Evaluation Dashboard", style={
        'textAlign': 'center',
        'fontWeight': 'bold',
        'fontSize': '24px',
        'color': 'grey',
        'backgroundColor': '#f8f9fa',
        'padding': '15px',
        'fontFamily': 'Aptos, Arial, sans-serif'  # Add custom font here
    }),

    html.H2("A Comparative Analysis of Different Models", style={
        'textAlign': 'center',
        'fontSize': '16px',
        'color': 'darkgrey',
        'marginBottom': '20px',
        'fontFamily': 'Aptos, Arial, sans-serif'  # Add custom font here
    }),

    dcc.Tabs([
        dcc.Tab(label='Model Summary', children=[
            dash_table.DataTable(
                data=model_metrics,
                columns=[{'name': col, 'id': col} for col in ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']],
                style_table={'margin': '20px auto', 'width': '80%'},
                style_cell={'fontFamily': 'Aptos, Arial, sans-serif', 'textAlign': 'center', 'padding': '10px'},
                style_header={'backgroundColor': '#f8f9fa', 'fontWeight': 'bold'}
            )
        ]),
        dcc.Tab(label='Confusion Matrix & ROC Curve', children=[
            dcc.Dropdown(
                id='model-dropdown',
                options=[{'label': name, 'value': name} for name in models.keys()],
                value='Logistic Regression',
                clearable=False,
                style={'padding': '2px', 'fontSize': '14px', 'width': '40%', 'margin': 'auto', 'fontFamily': 'Aptos, Arial, sans-serif'}
            ),

            dcc.RadioItems(
                id='plot-type',
                options=[
                    {'label': 'Confusion Matrix', 'value': 'confusion_matrix'},
                    {'label': 'ROC Curve', 'value': 'roc_curve'},
                ],
                value='confusion_matrix',
                labelStyle={'display': 'inline-block', 'fontSize': '14px', 'fontFamily': 'Aptos, Arial, sans-serif'},
                style={'textAlign': 'center', 'margin': '20px'}
            ),

            dcc.Graph(id='model-graph', style={'padding': '20px', 'border': '1px solid lightgrey'})
        ])
    ], style={'fontFamily': 'Aptos, Arial, sans-serif', 'marginBottom': '20px'}),

    html.Button("Download Current Plot", id="download-btn", n_clicks=0, style={
        'display': 'block', 'margin': 'auto', 'padding': '10px 20px', 'fontSize': '14px',
        'fontFamily': 'Aptos, Arial, sans-serif', 'backgroundColor': '#007bff', 'color': 'white', 'border': 'none'
    })
], style={'backgroundColor': '#f8f9fa', 'padding': '20px'})

# Callback to update the graph based on the selected model and plot type
@app.callback(
    Output('model-graph', 'figure'),
    [Input('model-dropdown', 'value'),
     Input('plot-type', 'value')]
)
def update_graph(selected_model, plot_type):
    if plot_type == 'confusion_matrix':
        cm = confusion_matrices[selected_model]
        fig = ff.create_annotated_heatmap(cm, x=['Predicted ' + str(cls) for cls in y_test.unique()],
                                          y=['Actual ' + str(cls) for cls in y_test.unique()],
                                          colorscale='Blues')
        fig.update_layout(title=f'{selected_model} - Confusion Matrix', title_x=0.5,
                          title_font=dict(size=12, color='grey', family='Aptos, Arial, sans-serif'),
                          paper_bgcolor='white')
    else:
        fpr, tpr, roc_auc = roc_curves[selected_model]
        fig = go.Figure()
        for i in range(len(y_test.unique())):
            fig.add_trace(go.Scatter(x=fpr[i], y=tpr[i], mode='lines',
                                     name=f'Class {y_test.unique()[i]} ROC curve (area = {roc_auc[i]:.2f})'))
        fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash'),
                                 showlegend=False, name='Random Guess'))
        fig.update_layout(title=f'{selected_model} - ROC Curve', title_x=0.5,
                          title_font=dict(size=12, color='grey', family='Aptos, Arial, sans-serif'),
                          xaxis=dict(title='False Positive Rate', title_font=dict(size=11, family='Aptos, Arial, sans-serif')),
                          yaxis=dict(title='True Positive Rate', title_font=dict(size=11, family='Aptos, Arial, sans-serif')),
                          paper_bgcolor='white')
    return fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


<IPython.core.display.Javascript object>