In [12]:
import pandas as pd


DATA_PATH = 'winequality-red-selected-missing.csv'

# Load the wine-quality CSV; this copy of the data contains missing values
# that are handled further down.
df = pd.read_csv(DATA_PATH)

# Display the first five rows as a quick check of the parsed columns.
df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [41]:
# Data Cleaning and Preprocessing

In [14]:
# Check data structure, types, and null values.
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
df.info()
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1388 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1405 non-null   float64
 8   pH                    1389 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None
fixed acidity             0
volatile acidity          0
citric acid             211
residual sugar            0
chlorides

In [15]:
# Replace every missing entry with the mean of its own column.
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split further down, so test rows contribute to the imputed values.
# Consider fitting the imputation on the training split only.
column_means = df.mean()
df = df.fillna(column_means)

In [16]:
# Feature Engineering and Label Creation
# Binary label: 1 when the quality score is 7 or higher, 0 otherwise.
GOOD_QUALITY_THRESHOLD = 7
df['target'] = df['quality'].ge(GOOD_QUALITY_THRESHOLD).astype('int64')

In [None]:
# Remove the raw score now that `target` has been derived from it.
# Rebinding instead of mutating in place, together with errors='ignore', keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='quality', errors='ignore')

In [19]:
# Separate the feature matrix from the label vector.
# 'quality' is excluded explicitly as well: `target` is computed from it, so if the
# separate drop cell was skipped it would leak the label into the features.
# errors='ignore' covers the normal case where 'quality' has already been removed.
X = df.drop(columns=['target', 'quality'], errors='ignore')
y = df['target']

In [None]:
# Train-Test Split

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [43]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits.
# NOTE(review): X_train / X_test are rebound to the scaled output, so re-running
# this cell by itself re-fits the scaler on already-scaled data. Re-run from the
# train/test split cell instead.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [24]:
# Model Training
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default hyperparameters; the seed fixes the result.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Leave the fitted estimator as the cell's value so its parameters are displayed.
model


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
# Model Evaluation
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out split, then print the confusion matrix followed by the
# per-class precision / recall / F1 report.
y_pred = model.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)


[[274   3]
 [ 17  26]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       277
           1       0.90      0.60      0.72        43

    accuracy                           0.94       320
   macro avg       0.92      0.80      0.84       320
weighted avg       0.94      0.94      0.93       320



In [27]:
# Confidence Score
# predict_proba returns one row per test sample with one probability per class.
y_proba = model.predict_proba(X_test)
first_sample_proba = y_proba[0]
print("Confidence score for the first test sample:", first_sample_proba)


Confidence score for the first test sample: [0.88 0.12]


In [42]:
import joblib 
# Persist the fitted estimator and the fitted scaler for the Streamlit app.
model_artifact = joblib.dump(model, 'wine_model.pkl')
scaler_artifact = joblib.dump(scaler, 'scaler.pkl')
# Keep the list of written scaler file names as the cell's displayed value.
scaler_artifact

['scaler.pkl']

In [29]:
# Rebind `scaler` to the object deserialized from 'scaler.pkl', the file written
# by the save cell above.
# NOTE(review): presumably a check that the saved artifact loads; confirm the intent.
scaler = joblib.load('scaler.pkl')

In [35]:
from sklearn.metrics import classification_report, confusion_matrix

# NOTE(review): this repeats the "Model Evaluation" cell above. `model` here is
# still the in-memory estimator, not one reloaded from 'wine_model.pkl'.
y_pred = model.predict(X_test)
rf_confusion = confusion_matrix(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
print(rf_confusion)
print(rf_report)


[[274   3]
 [ 17  26]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       277
           1       0.90      0.60      0.72        43

    accuracy                           0.94       320
   macro avg       0.92      0.80      0.84       320
weighted avg       0.94      0.94      0.93       320



In [38]:
# XGBoost Model Evaluation
from sklearn.metrics import classification_report, confusion_matrix

# NOTE(review): `xgb_model` is not created in any cell visible in this notebook.
# On a fresh kernel (Restart & Run All) this line raises NameError unless the
# training cell for it is restored above.
y_pred = xgb_model.predict(X_test)

xgb_confusion = confusion_matrix(y_test, y_pred)
xgb_report = classification_report(y_test, y_pred)
print("Confusion Matrix:\n", xgb_confusion)
print("\nClassification Report:\n", xgb_report)


Confusion Matrix:
 [[271   6]
 [ 14  29]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96       277
           1       0.83      0.67      0.74        43

    accuracy                           0.94       320
   macro avg       0.89      0.83      0.85       320
weighted avg       0.93      0.94      0.93       320



In [40]:
# Persist `xgb_model` to disk.
# NOTE(review): this writes to the same path ('wine_model.pkl') that the earlier
# save cell used for the RandomForest `model`, so whichever cell runs last decides
# what is on disk. Confirm which estimator the Streamlit app is meant to load.
joblib.dump(xgb_model, 'wine_model.pkl')

['wine_model.pkl']