In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings("ignore")

# Exploratory data Analysis

In [2]:
df = pd.read_csv(r"C:\Users\DELL\Desktop\Backend\BACKEND-AI\white_wine\winequality-white.csv",sep=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [4]:
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,4898.0,6.854788,0.843868,3.8,6.3,6.8,7.3,14.2
volatile acidity,4898.0,0.278241,0.100795,0.08,0.21,0.26,0.32,1.1
citric acid,4898.0,0.334192,0.12102,0.0,0.27,0.32,0.39,1.66
residual sugar,4898.0,6.391415,5.072058,0.6,1.7,5.2,9.9,65.8
chlorides,4898.0,0.045772,0.021848,0.009,0.036,0.043,0.05,0.346
free sulfur dioxide,4898.0,35.308085,17.007137,2.0,23.0,34.0,46.0,289.0
total sulfur dioxide,4898.0,138.360657,42.498065,9.0,108.0,134.0,167.0,440.0
density,4898.0,0.994027,0.002991,0.98711,0.991723,0.99374,0.9961,1.03898
pH,4898.0,3.188267,0.151001,2.72,3.09,3.18,3.28,3.82
sulphates,4898.0,0.489847,0.114126,0.22,0.41,0.47,0.55,1.08


#### Findings
From the table above we derived a number of observation
1. Our sample contains 4898 entries and 12 columns
2. The Columns are all contained with numericl features
3. Comparing the mean and the maximum values of columns like, free sulfur dioxide,residual sugar and total sulfur dioxide, we can infer that there is prescence of at least an outlier.

#### Recommendation
From our findings, we deduced ou data is clean and has no missing values and as well, well-encoded. However, wih ouliers being noticed.We recommend the use of ordinal encoder and standard scaler for preprocesing.

## Data Preprocessing

In [6]:
df["quality"].value_counts()

quality
6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: count, dtype: int64

In [7]:
# Encoding quality features
df["quality"] = df["quality"].map({9:"Excellent", 8:"Very good", 7:"Very good",6:"good",5:"good",4:"bad",3:"bad"})
df["quality"]

0            good
1            good
2            good
3            good
4            good
          ...    
4893         good
4894         good
4895         good
4896    Very good
4897         good
Name: quality, Length: 4898, dtype: object

In [8]:
#nScale nummerical features using sklearn standard scaler
num_cols = [x for x in df.columns if df[x].dtype == "float64"]
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

### Preprocessing Details
1. Scaled the data using standard scaler.

## Modelling

In [9]:
X, y= df.drop("quality", axis=1), df["quality"]

X_train, X_test, y_train, y_test = train_test_split(X,y)

models = {"Logistic Regression": LogisticRegression(),
          "Decision Tree classifier": DecisionTreeClassifier(),
          "Random Forest Classifier ": RandomForestClassifier(),
          "Naive Bayes classsifier": GaussianNB(),
        #   "Xgboost Classifier": XGBClassifier()
          }
for key , value in models.items():
    print(f"Training {key}")
    value.fit(X_train, y_train)
    pred = value.predict(X_test)
    print(f"{key} Evaluation on metrics")
    print(f"Accuracy: {accuracy_score(y_test,pred)}")
    # print(f"F1 Score: {f1_score(y_test, pred)}")
    # print(f"Recall: {recall_score(y_test, pred)}")
    # print(f"Precision: {precision_score(y_test, pred)}")
    print(f"Classification Report: \n {classification_report(y_test, pred)}")
    # confusion_matrix = confusion_matrix(y_test, pred)
    # ConfusionMatrixDisplay

    # f"{value}_classifier" = value.fit{X_train, y_train}

    if key == "Random Forest Classifier ":
        joblib.dump(value, "BestModel.pkl")
joblib.dump(scaler, "encoder.pkl")

Training Logistic Regression
Logistic Regression Evaluation on metrics
Accuracy: 0.7624489795918368
Classification Report: 
               precision    recall  f1-score   support

   Excellent       0.00      0.00      0.00         2
   Very good       0.63      0.28      0.39       284
         bad       0.33      0.03      0.05        40
        good       0.78      0.95      0.86       899

    accuracy                           0.76      1225
   macro avg       0.44      0.31      0.32      1225
weighted avg       0.73      0.76      0.72      1225

Training Decision Tree classifier
Decision Tree classifier Evaluation on metrics
Accuracy: 0.7640816326530612
Classification Report: 
               precision    recall  f1-score   support

   Excellent       0.00      0.00      0.00         2
   Very good       0.58      0.58      0.58       284
         bad       0.31      0.38      0.34        40
        good       0.85      0.84      0.84       899

    accuracy                     

['encoder.pkl']