In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm


In [3]:
data = pd.read_csv('spam.csv', encoding='latin-1')


In [4]:
data.drop_duplicates(inplace=True)
data['label'] = data['v1'].map({'ham': 'ham', 'spam': 'spam'})
X = data['v2']
y = data['label']

In [5]:
# Split the data into two sets: Training Set and Testing Set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

In [7]:
# Fit the vectorizer to the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)


In [17]:
# Initialize a Naive Bayes classifier
classifier = MultinomialNB()

In [18]:
# Train the classifier
classifier.fit(X_train_tfidf, y_train)

In [19]:
# Transform the test data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [20]:
# Make predictions
y_pred = classifier.predict(X_test_tfidf)

In [21]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)


In [22]:
# Display classification report with labels 'ham' and 'spam'
report = classification_report(y_test, y_pred, target_names=['Legitimate SMS', 'Spam SMS'])

In [23]:
# Create a progress bar
progress_bar = tqdm(total=100, position=0, leave=True)


  0%|          | 0/100 [00:00<?, ?it/s]

In [24]:
# Simulate progress updates
for i in range(10, 101, 10):
    progress_bar.update(10)
    progress_bar.set_description(f'Progress: {i}%')

Progress: 100%: 100%|██████████| 100/100 [00:01<00:00,  6.76it/s]

In [25]:

# Close the progress bar
progress_bar.close()

Progress: 100%: 100%|██████████| 100/100 [00:02<00:00, 39.67it/s]


In [26]:
# Display the results on the interface where the code was initiated from.
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)


Accuracy: 0.96
Classification Report:
                precision    recall  f1-score   support

Legitimate SMS       0.95      1.00      0.97       889
      Spam SMS       1.00      0.68      0.81       145

      accuracy                           0.96      1034
     macro avg       0.98      0.84      0.89      1034
  weighted avg       0.96      0.96      0.95      1034

