In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Read the wine-quality CSV into a DataFrame
file_path = 'WineQT.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

 # Step 1: Data Exploration and Cleaning

In [None]:

# Report how many missing values each column contains
print("Missing values:\n", data.isnull().sum())

# Drop 'Id' column since it's not useful for prediction.
# errors='ignore' keeps this cell idempotent: re-running it after 'Id' has
# already been removed no longer raises a KeyError.
data = data.drop(columns=['Id'], errors='ignore')

# Step 2: Feature Selection


In [None]:
# Correlation matrix to understand relationships between features and target (quality)
import seaborn as sns

# Pairwise correlation between every pair of columns in `data`
correlation_matrix = data.corr()

# Draw on an explicit Axes instead of relying on pyplot's implicit current figure
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


# Step 3: Preparing the data for model training

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns
y = data['quality']  # Target (quality)
X = data.drop(columns=['quality'])  # Features

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardizing the data (important for models like SVM)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both the training and the test features
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

In [None]:
# Model 1: Random Forest (trained on the unscaled features)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Model 2: Support Vector Classifier (SVC), trained on the standardized features
svc_model = SVC(random_state=42).fit(X_train_scaled, y_train)
y_pred_svc = svc_model.predict(X_test_scaled)

# Model 3: Stochastic Gradient Descent (SGD), trained on the standardized features
sgd_model = SGDClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_sgd = sgd_model.predict(X_test_scaled)



# Step 5: Model Evaluation

In [None]:
# Evaluation metrics shared by all three models
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Score the Random Forest predictions against the held-out labels
print("\nRandom Forest Results:")

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", rf_accuracy)

rf_confusion = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:\n", rf_confusion)

rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

In [None]:
# SVC Evaluation: score the SVC predictions against the held-out labels
print("\nSupport Vector Classifier Results:")

svc_accuracy = accuracy_score(y_test, y_pred_svc)
print("Accuracy:", svc_accuracy)

svc_confusion = confusion_matrix(y_test, y_pred_svc)
print("Confusion Matrix:\n", svc_confusion)

svc_report = classification_report(y_test, y_pred_svc)
print(svc_report)

In [None]:
# SGD Evaluation: score the SGD predictions against the held-out labels
print("\nStochastic Gradient Descent Results:")

sgd_accuracy = accuracy_score(y_test, y_pred_sgd)
print("Accuracy:", sgd_accuracy)

sgd_confusion = confusion_matrix(y_test, y_pred_sgd)
print("Confusion Matrix:\n", sgd_confusion)

sgd_report = classification_report(y_test, y_pred_sgd)
print(sgd_report)

# Step 6: Visualizations


In [None]:
# Feature importance (Random Forest)
# One importance score per feature column, taken from the fitted forest
importances = rf_model.feature_importances_
feature_names = X.columns

# Horizontal bar chart on an explicit Axes, with labelled axes so the
# figure can be read on its own
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names, ax=ax)
ax.set_title('Feature Importances from Random Forest')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()