In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load dataset
wine_data = pd.read_csv('/content/drive/MyDrive/Concept and Technologies of AI/WineQT.csv')

# Encode target variable
wine_data['quality'] = LabelEncoder().fit_transform(wine_data['quality'])

# Split data into training and testing sets
X_wine = wine_data.drop('quality', axis=1)
y_wine = wine_data['quality']
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(X_wine, y_wine, test_size=0.2, random_state=42)

# Standardize data
scaler = StandardScaler()
X_train_wine = scaler.fit_transform(X_train_wine)
X_test_wine = scaler.transform(X_test_wine)

# Build Two Models for Classification
# Model 1: Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_wine, y_train_wine)
y_pred_log = log_reg.predict(X_test_wine)
print("Logistic Regression Classification Report:")
print(classification_report(y_test_wine, y_pred_log, zero_division=1))

# Model 2: Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train_wine, y_train_wine)
y_pred_rf = rf_clf.predict(X_test_wine)
print("Random Forest Classification Report:")
print(classification_report(y_test_wine, y_pred_rf, zero_division=1))

# Determine Best Model
best_model = rf_clf if rf_clf.score(X_test_wine, y_test_wine) > log_reg.score(X_test_wine, y_test_wine) else log_reg
print("Best Performing Model:", type(best_model).__name__)

# Perform Hyperparameter Optimization
if isinstance(best_model, RandomForestClassifier):
    param_grid_rf = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    grid_rf = GridSearchCV(best_model, param_grid_rf, cv=5, scoring='accuracy')
    grid_rf.fit(X_train_wine, y_train_wine)
    best_model = grid_rf.best_estimator_
    print("Best Hyperparameters for Random Forest:", grid_rf.best_params_)

# Feature Selection (Important Features)
feature_importances = pd.DataFrame({'Feature': X_wine.columns, 'Importance': best_model.feature_importances_})
selected_features = feature_importances.sort_values(by='Importance', ascending=False).head(5)['Feature'].tolist()
print("Top Selected Features:", selected_features)

# Rebuild Model with Best Hyperparameters and Best Features
X_train_selected = X_train_wine[:, :5]
X_test_selected = X_test_wine[:, :5]
best_model.fit(X_train_selected, y_train_wine)
y_pred_final = best_model.predict(X_test_selected)
print("Final Model Performance:")
print(classification_report(y_test_wine, y_pred_final, zero_division=1))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         6
           2       0.68      0.73      0.70        96
           3       0.63      0.64      0.63        99
           4       0.52      0.50      0.51        26
           5       1.00      0.00      0.00         2

    accuracy                           0.64       229
   macro avg       0.57      0.37      0.37       229
weighted avg       0.63      0.64      0.63       229

Random Forest Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.00      0.00         6
           2       0.73      0.75      0.74        96
           3       0.64      0.71      0.67        99
           4       0.76      0.62      0.68        26
           5       1.00      0.00      0.00         2

    accuracy                           0.69       229
   macro avg       0.83      0.41      0.42       2

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
