In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the two source tables (white wines first, then red wines) and stack
# them row-wise into a single frame with a fresh, continuous index.
# Duplicate rows are kept exactly as loaded; no de-duplication happens here.
# NOTE(review): the combined data contains identical rows (see the preview
# further down) -- confirm that keeping them is intended, since copies can
# end up on both sides of a train/test split.
white_wine, red_wine = (
    pd.read_csv(csv_name) for csv_name in ('whiteWine.csv', 'redWine.csv')
)
wine = pd.concat(objs=[white_wine, red_wine], axis=0, ignore_index=True)

In [3]:
# Preview the combined frame without its final five rows
# (equivalent to head() with a negative count: "all but the last n").
wine.iloc[:-5]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quantity sold
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,280.0
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,280.0
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,280.0
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,281.0
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,281.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6487,6.6,0.725,0.20,7.8,0.073,29.0,79.0,0.99770,3.29,0.54,9.2,5,235.0
6488,6.3,0.550,0.15,1.8,0.077,26.0,35.0,0.99314,3.32,0.82,11.6,6,281.0
6489,5.4,0.740,0.09,1.7,0.089,16.0,26.0,0.99402,3.67,0.56,11.6,6,282.0
6490,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,281.0


In [4]:
# Turn the raw quality score into a binary label, overwriting the column:
#   0 -> score of 5 or lower, 1 -> everything else.
# Because the column is overwritten, a naive re-run of this cell would be
# destructive (every 0/1 label is <= 5 and would collapse to 0), so only
# binarize while the column still holds raw scores.
if not wine['quality'].isin([0, 1]).all():
    # `~(score <= 5)` rather than `score > 5` keeps the original "else 1"
    # rule for values that fail the comparison (e.g. NaN).
    wine['quality'] = (~wine['quality'].le(5)).astype(int)
wine.head(15)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quantity sold
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1,280.0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,1,280.0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,1,280.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1,281.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1,281.0
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,1,280.0
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,1,280.0
7,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1,280.0
8,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,1,280.0
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0,1,280.0


In [5]:
# Feature matrix: every column except the label and "quantity sold".
X = wine.drop(["quality", "quantity sold"], axis="columns")
# Target vector: the quality label column.
y = wine.loc[:, "quality"]

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
# Split BEFORE scaling so the hold-out rows never influence preprocessing:
# fitting StandardScaler on the full matrix would leak the test rows' means
# and variances into the features the model is trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both partitions. `X` itself is left untouched, so
# this cell can be re-run without compounding the scaling, and `scaler`
# stays available for transforming new samples later on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Build a 100-tree random forest with a fixed seed and fit it on the
# training partition in one step (fit() returns the estimator itself).
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
# Leave the fitted estimator as the cell's value so it is displayed.
model

In [10]:
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# Score the fitted model on the held-out partition.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

# Report the overall accuracy, followed by the per-class
# precision / recall / f1 table.
print("Accuracy: ", accuracy)
print(classification_report(y_true=y_test, y_pred=predictions))

Accuracy:  0.8476923076923077
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       468
           1       0.87      0.89      0.88       832

    accuracy                           0.85      1300
   macro avg       0.84      0.83      0.83      1300
weighted avg       0.85      0.85      0.85      1300



In [12]:
# Classify a single wine sample.
# The sample is a one-row frame whose columns are the same eleven feature
# names, in the same order, as the training features.
new_wine = pd.DataFrame([{
    "fixed acidity": 6.6,
    "volatile acidity": 0.725,
    "citric acid": 0.20,
    "residual sugar": 7.8,
    "chlorides": 0.073,
    "free sulfur dioxide": 29.0,
    "total sulfur dioxide": 79.0,
    "density": 0.99770,
    "pH": 3.29,
    "sulphates": 0.54,
    "alcohol": 9.2,
}])

# Reuse the already-fitted scaler (transform only, no refit) so the sample
# is scaled the same way as the data the model was trained on.
new_wine_scaled = scaler.transform(new_wine)
predicted_quality = model.predict(new_wine_scaled)

# Label 1 is reported as "Good", anything else as "Bad".
print(
    "This wine is predicted to be Good quality."
    if predicted_quality[0] == 1
    else "This wine is predicted to be Bad quality."
)

This wine is predicted to be Bad quality.
