In [4]:
import sys
!{sys.executable} -m pip install pandas scikit-learn nltk gensim matplotlib

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl.metadata (8.6 kB)
Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
    --------------------------------------- 0.5/24.4 MB 589.0 kB/s eta 0:00:41
    --------------------------------------- 0.5/24.4 MB 589.0 kB/s eta 0:00:41
   - -------------------------------------- 0.8/24.4 MB 613.7 kB/s eta 0:00:39
   - -------------------------------------- 1.0/24.4 MB 642.3 kB/s eta 0:00:37
   - -------------------------------------- 1.0/24.4 MB 642.3 kB/s eta 0:00:37
   -- ---------------------

In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [8]:
# 1. Load Dataset (SMS Spam Collection)
# The file is tab-separated and has no header row, so the two column names are supplied here.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'text'],
)



In [9]:
# 2. Text Cleaning Process
def clean_text(text):
    """Normalize one message into lowercase words separated by single spaces.

    Every run of non-alphanumeric characters (punctuation, symbols, whitespace
    and underscores) is collapsed into a single space, and leading/trailing
    spaces are removed.

    Args:
        text: The raw message string.

    Returns:
        The lowercased, cleaned string; empty if nothing alphanumeric remains.
    """
    text = text.lower()  # Convert text to lower case
    # `_` counts as a word character, so \W alone would leave it in place; it is
    # listed explicitly. The `+` collapses each run into one space, which also
    # covers the extra-space cleanup in the same pass.
    text = re.sub(r'[\W_]+', ' ', text)
    return text.strip()

# Store the cleaned version of every message in a new column, one string per row
df['clean_text'] = [clean_text(message) for message in df['text']]

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [11]:
# 3. Vectorization: TF-IDF features, with stop_words='english' filtering common English words
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),  # fit on the training text only, then transform it
    vectorizer.transform(X_test),       # reuse the already-fitted vectorizer on the test text
)

In [15]:
# 4. Train & Evaluate Models
# Each classifier is fitted on the same training matrix and scored on the same test split.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(kernel='linear'),
}

for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction can be chained
    y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)

    # Compute all three metrics first, then print them as one report per model
    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"========== {name} ==========")
    print(f"Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:\n", matrix)
    print("Classification Report:\n", report, "\n")


    

# Bonus: Top Important Words for Naive Bayes
nb_model = models["Naive Bayes"]
feature_names = vectorizer.get_feature_names_out()

# feature_log_prob_ has one row per class, ordered like classes_. Look the rows up by
# label instead of assuming that ham is row 0 and spam is row 1.
class_rows = {label: row for row, label in enumerate(nb_model.classes_.tolist())}

# argsort() is ascending, so reverse it and keep the 10 features with the highest log-probability
ham_top_words = feature_names[nb_model.feature_log_prob_[class_rows['ham']].argsort()[::-1][:10]]
spam_top_words = feature_names[nb_model.feature_log_prob_[class_rows['spam']].argsort()[::-1][:10]]

print("========== Bonus: Top Important Words ==========")
print("Top Ham words:", ", ".join(ham_top_words))
print("Top Spam words:", ", ".join(spam_top_words))

Accuracy: 0.9785
Confusion Matrix:
 [[966   0]
 [ 24 125]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115
 

Accuracy: 0.9695
Confusion Matrix:
 [[966   0]
 [ 34 115]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115
 

Accuracy: 0.9919
Confusion Matrix:
 [[965   1]
 [  8 141]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      1.00     