In [2]:
import pandas as pd
from ydata_profiling import ProfileReport

In [3]:
df = pd.read_csv("red_wine.csv")
df

Unnamed: 0,citric acid,sulphates,alcohol,type
0,0.49,0.63,8.0,low
1,0.66,0.57,8.3,low
2,0.23,0.44,8.5,high
3,0.44,0.84,8.6,low
4,0.08,0.50,8.7,low
...,...,...,...,...
566,0.40,0.70,12.7,low
567,0.28,0.60,12.8,high
568,0.31,0.60,12.8,high
569,0.34,0.72,12.8,low


In [4]:
red_wine = ProfileReport(df, title="Red Wine Data Analysis", explorative=True)
red_wine.to_file("red_wine_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

In [6]:
X = df.drop(columns=['type'])  
y = df['type']

In [7]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM-Linear": SVC(kernel='linear', probability=True),
    "SVM-RBF": SVC(kernel='rbf', probability=True),
    "Random Forest": RandomForestClassifier()
}

In [8]:
results = {}

for name, model in models.items():
    y_pred = cross_val_predict(model, X, y, cv=10, method="predict_proba")[:,1]
    auc = roc_auc_score(y, y_pred)
    acc = cross_val_score(model, X, y, cv=10, scoring='accuracy').mean()
    results[name] = {"AUC": auc, "Accuracy": acc}

# Convert results to DataFrame
results_df = pd.DataFrame(results).T
print(results_df)

                          AUC  Accuracy
Logistic Regression  0.870332  0.784785
Naive Bayes          0.882481  0.821627
Decision Tree        0.754838  0.749728
SVM-Linear           0.868498  0.791863
SVM-RBF              0.845454  0.535844
Random Forest        0.864374  0.791924


In [9]:
majority_class = y.value_counts().idxmax()
baseline_acc = y.value_counts().max() / len(y)
baseline_auc = 0.5

results_df.loc["Baseline"] = {"AUC": baseline_auc, "Accuracy": baseline_acc}
print(results_df)

                          AUC  Accuracy
Logistic Regression  0.870332  0.784785
Naive Bayes          0.882481  0.821627
Decision Tree        0.754838  0.749728
SVM-Linear           0.868498  0.791863
SVM-RBF              0.845454  0.535844
Random Forest        0.864374  0.791924
Baseline             0.500000  0.528897


In [10]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

rf = RandomForestClassifier()
rf.fit(X, y)
y_probs = rf.predict_proba(X)[:, 1]


In [11]:
from sklearn.preprocessing import LabelEncoder

# Convert 'high' and 'low' into numerical values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # 'low' -> 0, 'high' -> 1


In [None]:
%matplotlib inline
fpr, tpr, _ = roc_curve(y, y_probs)


plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="Random Forest (AUC = {:.2f})".format(roc_auc_score(y, y_probs)))
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Random Forest")
plt.legend()
#plt.show()


TypeError: 'numpy.float64' object is not callable

In [20]:
df2 = pd.read_csv("white_wine.csv")
df2

Unnamed: 0,citric acid,sulphates,alcohol,type
0,0.24,0.52,9.4,low
1,0.49,0.56,9.4,low
2,0.66,0.73,10.0,low
3,0.32,0.77,10.0,low
4,0.38,0.82,10.0,low
...,...,...,...,...
73,0.12,0.36,13.8,high
74,0.24,0.74,13.8,low
75,0.15,0.77,13.8,high
76,0.27,0.27,13.9,high


In [28]:
# Assume 'quality' is the target column that you want to predict (same as red wine dataset)
X_white = df2.drop('type', axis=1)  # Features (exclude target)
y_white = df2['type'] 


In [29]:
X_white.shape

(78, 3)

In [30]:
y_white.shape

(78,)

In [31]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_white = label_encoder.fit_transform(y_white)

In [32]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Initialize Naive Bayes model
nb_model = GaussianNB()

# Evaluate using 10-fold cross-validation
auc_scores = cross_val_score(nb_model, X_white, y_white, cv=10, scoring='roc_auc')
print(f"AUC scores for Naive Bayes on white wine: {auc_scores}")
print(f"Mean AUC: {auc_scores.mean()}")

AUC scores for Naive Bayes on white wine: [1.         1.         1.         1.         1.         1.
 1.         1.         0.83333333 0.66666667]
Mean AUC: 0.95
