In [None]:
import pandas as pd
###
### original dataset at: https://archive.ics.uci.edu/dataset/186/wine+quality
###

# URL of the white wine dataset (HTTPS so the download is protected in transit)
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# Exclusive bounds on the quality score: only rows with
# QUALITY_ABOVE < quality < QUALITY_BELOW are kept.
QUALITY_ABOVE = 4
QUALITY_BELOW = 8

# Load the raw dataset from the URL; the file is semicolon-delimited.
# Kept under its own name so the cleaning chain below can be re-run without re-reading.
white_raw_df = pd.read_csv(URL, sep=";")

# Clean in a single chain so each step's input is explicit:
#   1. drop duplicate records, keeping the first occurrence
#   2. keep only the rows whose quality lies strictly between the two bounds
#   3. reset the index and discard the old one
white_df = (
    white_raw_df
    .drop_duplicates(keep='first')
    .loc[lambda df: (df['quality'] > QUALITY_ABOVE) & (df['quality'] < QUALITY_BELOW)]
    .reset_index(drop=True)
)

In [None]:
# Summary statistics for the columns of the cleaned frame
white_df.describe()

In [None]:
# Column names, dtypes and non-null counts of the cleaned frame
white_df.info()

In [None]:
# Preview the first rows of the cleaned frame
white_df.head()

In [None]:
# Count the records for each quality value to see how the target classes are distributed
white_df['quality'].value_counts()

In [None]:
# Train an SGD classifier on the data using quality as the target
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Prepare the data: every column except the target is a feature
X = white_df.drop(columns='quality')
y = white_df['quality']

# Split the data into training and testing sets.
# stratify=y keeps the proportion of each quality class the same in both splits,
# so an under-represented class is not accidentally missing from the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train the classifier.
# SGD is sensitive to feature scale, so the features are standardized first.
# Putting the scaler in a pipeline means it is fit on the training split only and
# applied automatically at predict time, so raw feature frames can be passed in.
# The underlying SGDClassifier is available as sgd_classifier[-1].
sgd_classifier = make_pipeline(StandardScaler(), SGDClassifier(random_state=42))
sgd_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = sgd_classifier.predict(X_test)

# Evaluate the model (example using accuracy)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Generate and plot the confusion matrix for the test-set predictions
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
import matplotlib.pyplot as plt

# y_test and y_pred are expected to be defined by the training cell

# Sorted set of class labels that occur in either vector. Passing it explicitly
# fixes the row/column order of the matrix and lets the plot show real class values.
class_labels = unique_labels(y_test, y_pred)

# Generate the confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot the confusion matrix using seaborn; tick labels are the actual quality
# classes rather than positional indices 0..k-1
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()