<a href="https://colab.research.google.com/github/Sriva29/Wine-Quality-Analysis/blob/main/Wine_Quality_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
# Load the CSV file into a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/Sriva29/Wine-Quality-Analysis/refs/heads/main/winequality-red.csv')
df['quality'] = df['quality'].apply(lambda x: 0 if x <= 5 else 1)
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [3]:
# Display summary statistics
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,0.534709
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.49895
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,0.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,0.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,1.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,1.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,1.0


In [5]:
# Display information about the DataFrame
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [6]:
# Check for missing values
df.isnull().sum()


Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


#SGD Classifier

## Initialization, training, and evaluation

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score

X = df.drop(['quality'], axis=1)
y = df['quality']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=37)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

#Initializing, fitting, and making preds
sgd_clf = SGDClassifier(loss= 'hinge', penalty = 'l2', max_iter = 1000, random_state=37)
sgd_clf.fit(X_train_scaled, y_train)
y_pred = sgd_clf.predict(X_test_scaled)

#Evaluation
print(f'Accuracy of SGDClf is {accuracy_score(y_test, y_pred):0.2f}')
print('Classification Report: \n', classification_report(y_test, y_pred))


Accuracy of SGDClf is 0.67
Classification Report: 
               precision    recall  f1-score   support

           0       0.64      0.65      0.64       150
           1       0.69      0.68      0.68       170

    accuracy                           0.67       320
   macro avg       0.66      0.66      0.66       320
weighted avg       0.67      0.67      0.67       320



## Tuning using GridSearchCV

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

# Define the parameter grid with corrected loss and valid eta0 values
param_grid = {
    'loss': ['hinge', 'log_loss'],        # 'log_loss' is the new name for logistic regression
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01],      # Regularization strength
    'learning_rate': ['optimal', 'invscaling', 'constant'],
    'eta0': [0.01, 0.1],                 # Initial learning rate (eta0) must be > 0
    'max_iter': [1000, 2000],            # Maximum number of iterations
}

# Initialize the SGDClassifier
sgd = SGDClassifier(random_state=42)

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(sgd, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best parameters found:", grid_search.best_params_)

# Step 3: Evaluate the best model on the test set
best_sgd = grid_search.best_estimator_
y_pred = best_sgd.predict(X_test_scaled)

# Print the accuracy and classification report
from sklearn.metrics import classification_report, accuracy_score

print(f"Accuracy of best model: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Best parameters found: {'alpha': 0.001, 'eta0': 0.1, 'learning_rate': 'invscaling', 'loss': 'log_loss', 'max_iter': 1000, 'penalty': 'l1'}
Accuracy of best model: 0.77
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.73      0.75       150
           1       0.77      0.80      0.79       170

    accuracy                           0.77       320
   macro avg       0.77      0.77      0.77       320
weighted avg       0.77      0.77      0.77       320

