In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Settings a reader is likely to tune, gathered in one place.
TEST_SIZE = 0.2      # fraction of rows held out for the final test split
RANDOM_STATE = 42    # fixed seed so the train/test split is reproducible
MAX_CV_FOLDS = 5     # upper bound on the number of cross-validation folds

# Step 1: Load the dataset
file_path = 'sample_data.csv'  # Use your actual CSV file path
data = pd.read_csv(file_path)

# Step 2: Prepare features and labels
# The layout is positional: the last column is the target and every other
# column is a feature.
X = data.iloc[:, :-1]  # Features (all columns except the last)
y = data.iloc[:, -1]   # Target (last column)

# Step 3: Split dataset into training and testing sets
# stratify=y keeps the class proportions the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Step 4: Create Naive Bayes model
model = GaussianNB()

# Step 5: Train the model
model.fit(X_train, y_train)

# Step 6: Make predictions
y_pred = model.predict(X_test)

# Step 7: Evaluate the model
# zero_division=0 reports 0 (instead of warning) for a class that was never
# predicted. On a very small file the hold-out set is only a few rows, so
# treat these numbers as a rough indication only.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Optional: Cross-validation for more robust evaluation (especially for small datasets)
# With an integer `cv` and a classifier, scikit-learn uses stratified folds, and
# asking for more folds than the rarest class has samples triggers a warning
# or an error. Cap the fold count at the smallest class size (never below 2,
# the minimum scikit-learn accepts) so this cell also runs on tiny datasets.
smallest_class_size = int(y.value_counts().min())
n_folds = max(2, min(MAX_CV_FOLDS, smallest_class_size))

print(f"\nPerforming {n_folds}-Fold Cross Validation...")
# cross_val_score fits a fresh clone of `model` on each fold; the estimator
# fitted in Step 5 is left unchanged.
cv_scores = cross_val_score(model, X, y, cv=n_folds)
print("Cross-validated scores:", cv_scores)
print("Mean Cross-validation Accuracy:", cv_scores.mean())


Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

          No       0.50      1.00      0.67         1
         Yes       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2


Performing 5-Fold Cross Validation...
Cross-validated scores: [1.  1.  1.  1.  0.5]
Mean Cross-validation Accuracy: 0.9
