In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from pandas_profiling import ProfileReport
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

# Set Matplotlib backend to avoid display issues
matplotlib.use('Agg')

# Load datasets
red_wine = pd.read_csv(r"D:\Documents\Sem_3\IS733\HW2\red_wine.csv")
white_wine = pd.read_csv(r"D:\Documents\Sem_3\IS733\HW2\white_wine.csv")

# Generate profiling report
profile = ProfileReport(red_wine, explorative=True)
profile.to_file("red_wine_profile.html")

# Convert categorical target to binary values
def encode_target(df):
    df['type'] = df['type'].map({'low': 0, 'high': 1})
    return df

red_wine = encode_target(red_wine)
white_wine = encode_target(white_wine)

# Define features and target
X_red = red_wine.drop(columns=['type'])
y_red = red_wine['type']
X_white = white_wine.drop(columns=['type'])
y_white = white_wine['type']

# Split data for ROC curve visualization
X_train, X_test, y_train, y_test = train_test_split(X_red, y_red, test_size=0.2, random_state=42, stratify=y_red)

# Define models
models = {
    "Baseline": DummyClassifier(strategy='most_frequent'),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM-Linear": SVC(kernel='linear', probability=True),
    "SVM-RBF": SVC(kernel='rbf', probability=True),
    "Random Forest": RandomForestClassifier()
}

# SubTask 2: Perform 10-fold cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
performance_metrics = {}

for name, model in models.items():
    auc_scores = cross_val_score(model, X_red, y_red, cv=cv, scoring='roc_auc')
    acc_scores = cross_val_score(model, X_red, y_red, cv=cv, scoring='accuracy')
    performance_metrics[name] = {
        "AUC": np.mean(auc_scores),
        "Accuracy": np.mean(acc_scores)
    }

# Convert to DataFrame for easy viewing
performance_df = pd.DataFrame(performance_metrics).T
print(performance_df)

# SubTask 3: Train and plot ROC curve for Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob_rf)
plt.figure()
plt.plot(fpr, tpr, label="Random Forest (AUC = %.2f)" % roc_auc_score(y_test, y_prob_rf))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.savefig("roc_curve.png")

# SubTask 4: Find best model based on AUC and test on white wine
best_model_name = max(performance_metrics, key=lambda k: performance_metrics[k]["AUC"])
best_model = models[best_model_name]
best_model.fit(X_red, y_red)
y_white_pred = best_model.predict_proba(X_white)[:, 1]
auc_white = roc_auc_score(y_white, y_white_pred)
print(f"Best model: {best_model_name}, AUC on white wine: {auc_white}")

# SubTask 5: Choosing an interpretable model
interpretable_models = ["Logistic Regression", "Decision Tree"]
print(f"Preferred models for interpretability: {interpretable_models}")




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

                          AUC  Accuracy
Baseline             0.500000  0.528887
Logistic Regression  0.875819  0.784392
Naive Bayes          0.893291  0.824773
Decision Tree        0.816579  0.808923
SVM-Linear           0.875446  0.793134
SVM-RBF              0.854900  0.535844
Random Forest        0.921983  0.847399
Best model: Random Forest, AUC on white wine: 0.9734811957569914
Preferred models for interpretability: ['Logistic Regression', 'Decision Tree']


  plt.show()
