In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [6]:
# Load the dataset with a specific encoding
file_path = 'spam.csv'  # Replace with the path to your dataset
df = pd.read_csv(file_path, encoding='ISO-8859-1')  # Change encoding as needed

In [16]:
# Rename columns for easier access (adjust as necessary)
df.columns = ['label', 'email_text', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

In [18]:
# Drop any extra columns
df = df[['email_text', 'label']]

In [20]:
# Explore the data
print(df.head())
print(df['label'].value_counts())

                                          email_text label
0  Go until jurong point, crazy.. Available only ...   ham
1                      Ok lar... Joking wif u oni...   ham
2  Free entry in 2 a wkly comp to win FA Cup fina...  spam
3  U dun say so early hor... U c already then say...   ham
4  Nah I don't think he goes to usf, he lives aro...   ham
label
ham     4825
spam     747
Name: count, dtype: int64


In [22]:
# Preprocess the data
X = df['email_text']
y = df['label']

In [24]:

# Map labels to binary values if necessary (e.g., 'ham' = 0, 'spam' = 1)
y = y.map({'ham': 0, 'spam': 1})

In [26]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
# Convert text data to numerical features
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [30]:
# Train a Naive Bayes model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)


In [32]:
# Predict on the test set
y_pred = model.predict(X_test_tfidf)


In [34]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [36]:
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)


Accuracy: 0.97
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [38]:
# Save the model if needed
import joblib
joblib.dump(model, 'spam_detector_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']