In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train and evaluate a Multinomial Naive Bayes classifier on the e-mail dataset.
#
# Pipeline: load CSV -> select features/target -> drop incomplete rows ->
# encode the target as int -> 80/20 train/test split -> fit -> report metrics.
df = pd.read_csv("/content/emails.csv")

# Features are every column except the first and the last; the target is the
# last column. NOTE(review): the first column is skipped, presumably because
# it is a non-numeric identifier -- confirm against the dataset.
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

# Drop incomplete rows from features and target *together*, and do it before
# encoding the target. Dropping NaN from y alone leaves X and y with different
# lengths, and the later index-aligned concat re-creates the NaN, which
# upcasts an integer target to float (labels then show up as 0.0 / 1.0).
combined_df = pd.concat([X, y], axis=1)
initial_rows = combined_df.shape[0]
combined_df = combined_df.dropna()
rows_after_dropna = combined_df.shape[0]

X = combined_df.iloc[:, :-1]
y = combined_df.iloc[:, -1]

# Encode a string target as 1 for 'spam' and 0 for anything else. Rows with a
# missing label were already removed above, so a NaN label can no longer be
# silently mapped to class 0.
if y.dtype == 'object':
    print("Converting target column from object to int...")
    y = (y == 'spam').astype(int)

# The target is NaN-free here, so this cast is safe and yields integer class
# labels in the classification report.
y = y.astype(int)

# Shapes are reported after cleaning, so X and y always have the same length.
print(f"\nShape of X (features): {X.shape}")
print(f"Shape of y (target): {y.shape}")
print(f"Data type of y (target): {y.dtype}")

if initial_rows != rows_after_dropna:
    print(f"Dropped {initial_rows - rows_after_dropna} rows due to missing values.")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MultinomialNB expects non-negative feature values.
# NOTE(review): assumes the feature columns are word counts -- confirm.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)



Shape of X (features): (5346, 3000)
Shape of y (target): (5345,)
Data type of y (target): int64
Dropped 1 rows due to missing values.

Model Accuracy: 0.9401

Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.94      0.96       761
         1.0       0.86      0.94      0.90       308

    accuracy                           0.94      1069
   macro avg       0.92      0.94      0.93      1069
weighted avg       0.94      0.94      0.94      1069

