In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv('SMSSpamCollection.csv', sep='\t', header=None, names=['label', 'message'])


In [4]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])  # ham: 0, spam: 1
df['message_length'] = df['message'].apply(len)
df['word_count'] = df['message'].apply(lambda x: len(x.split()))

In [6]:
df.head()

Unnamed: 0,label,message,message_length,word_count
0,0,"Go until jurong point, crazy.. Available only ...",111,20
1,0,Ok lar... Joking wif u oni...,29,6
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,28
3,0,U dun say so early hor... U c already then say...,49,11
4,0,"Nah I don't think he goes to usf, he lives aro...",61,13


In [7]:
x = df[['message_length', 'word_count', 'message']]
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [8]:
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train['message'])
x_test_vectorized = vectorizer.transform(x_test['message'])

In [9]:
nb_model = MultinomialNB()
lr_model = LogisticRegression(max_iter=1000)

In [10]:
nb_model.fit(x_train_vectorized, y_train)
y_pred_nb = nb_model.predict(x_test_vectorized)
print("Naive Bayes Performance:\n", classification_report(y_test, y_pred_nb))

Naive Bayes Performance:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1207
           1       0.98      0.94      0.96       186

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [11]:
lr_model.fit(x_train_vectorized, y_train)
y_pred_lr = lr_model.predict(x_test_vectorized)
print("Logistic Regression Performance:\n", classification_report(y_test, y_pred_lr))

Logistic Regression Performance:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1207
           1       1.00      0.89      0.94       186

    accuracy                           0.98      1393
   macro avg       0.99      0.94      0.97      1393
weighted avg       0.99      0.98      0.98      1393



In [12]:
nb_cv_scores = cross_val_score(nb_model, vectorizer.transform(x['message']), y, cv=5)
lr_cv_scores = cross_val_score(lr_model, vectorizer.transform(x['message']), y, cv=5)
print("Naive Bayes CV Mean Score:", nb_cv_scores.mean())
print("Logistic Regression CV Mean Score:", lr_cv_scores.mean())

Naive Bayes CV Mean Score: 0.9802579481688417
Logistic Regression CV Mean Score: 0.9818734250589722


In [13]:
param_grid = {'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=5)
grid_search.fit(x_train_vectorized, y_train)
best_lr_model = grid_search.best_estimator_
y_pred_best_lr = best_lr_model.predict(x_test_vectorized)
print("Tuned Logistic Regression Performance:\n", classification_report(y_test, y_pred_best_lr))
print("Best Hyperparameters:", grid_search.best_params_)

Tuned Logistic Regression Performance:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1207
           1       1.00      0.90      0.95       186

    accuracy                           0.99      1393
   macro avg       0.99      0.95      0.97      1393
weighted avg       0.99      0.99      0.99      1393

Best Hyperparameters: {'C': 10, 'penalty': 'l2'}
