In [45]:
import pandas as pd

# Two copies of SMS spam data are read: one straight from the Hugging Face hub
# and one from a CSV sitting next to the notebook.
HF_SPAM_CSV = "hf://datasets/codesignal/sms-spam-collection/sms-spam-collection.csv"
LOCAL_TRAIN_CSV = "train.csv"

df = pd.read_csv(HF_SPAM_CSV)
df1 = pd.read_csv(LOCAL_TRAIN_CSV)

In [46]:
# Peek at the first five rows of the local CSV (columns: sms, label).
df1.head(n=5)

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [47]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: once
# the column has been encoded, a second pass leaves it unchanged instead of
# mapping every value to NaN.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_labels = df['label'].map(label_codes)

# Series.map() yields NaN for any value missing from the mapping; report the
# offending values explicitly rather than failing inside the integer cast.
unmapped = encoded_labels.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected label values: {df.loc[unmapped, 'label'].unique().tolist()}"
    )

df['label'] = encoded_labels.astype(int)
df.head()


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [48]:
# Give the hub dataset the same text column name ('sms') that df1 already uses,
# so both frames share a (label, sms) schema.
df['sms'] = df['message']

# Keep only the two shared columns from each source
df_clean = df[['label', 'sms']]
df1_clean = df1[['label', 'sms']]

# Stack the two sources into one frame with a fresh 0..n-1 index.
# NOTE(review): the previews of df and df1 show identical leading rows, so the
# two sources may largely be the same corpus. If so, duplicated messages can
# land on both sides of a later train/test split and inflate test scores --
# consider de-duplicating on 'sms' (after stripping whitespace). TODO confirm.
df2 = pd.concat([df_clean, df1_clean], ignore_index=True)



### df2 is the combination of the two initial datasets.

In [52]:
# First five rows of the combined (label, sms) frame.
df2.head(n=5)


Unnamed: 0,label,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [53]:
# (rows, columns) of the combined frame
df2.shape

(11146, 2)

In [54]:
from transformers import BertTokenizer, BertModel
import torch

In [55]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [56]:
def get_bert_embedding(text, max_length=256):
    """Return BERT's pooled output for ``text`` as a NumPy array.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Input passed directly to ``tokenizer``.
        max_length: Maximum number of tokens; longer inputs are truncated.
            Defaults to 256, the value this function previously hard-coded.

    Returns:
        ``outputs.pooler_output`` with all size-1 dimensions squeezed out,
        converted to a NumPy array.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    # Note: pooler_output is NOT the raw [CLS] hidden state. Per the Hugging
    # Face BertModel documentation it is the first token's final hidden state
    # after an additional dense layer and tanh activation. The raw [CLS]
    # vector would be outputs.last_hidden_state[:, 0].
    return outputs.pooler_output.squeeze().numpy()


In [57]:
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Embed every message one at a time (with a progress bar), then attach the
# resulting arrays to the frame as a new column.
sms_embeddings = df2['sms'].progress_apply(get_bert_embedding)
df2['bert_embedding'] = sms_embeddings


100%|████████████████████████████████████████████████████████████████████████████| 11146/11146 [07:45<00:00, 23.94it/s]


In [58]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import torch

In [59]:
# First five rows, now including the bert_embedding column.
df2.head(n=5)

Unnamed: 0,label,sms,bert_embedding
0,0,"Go until jurong point, crazy.. Available only ...","[-0.69134617, -0.38095, -0.95244503, 0.7509417..."
1,0,Ok lar... Joking wif u oni...,"[-0.8942947, -0.6027674, -0.9841888, 0.9353531..."
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[-0.7034378, -0.50361043, -0.9644544, 0.636864..."
3,0,U dun say so early hor... U c already then say...,"[-0.7182437, -0.3971464, -0.7705447, 0.7624593..."
4,0,"Nah I don't think he goes to usf, he lives aro...","[-0.78104824, -0.57640296, -0.9893936, 0.81220..."


In [60]:
# Feature matrix: the per-row embedding arrays stacked into one 2-D array.
X = np.vstack(df2['bert_embedding'].to_numpy())
# Target vector: the integer labels.
y = df2['label'].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [61]:
# Same arguments and seed as the split in the previous cell, so this
# reproduces the identical train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# SVM model

In [62]:
# Linear-kernel support vector classifier trained on the training split.
svm_model = SVC(kernel='linear')
svm_model.fit(X=X_train, y=y_train)

In [63]:
from sklearn.metrics import classification_report, accuracy_score
# Score the SVM on the held-out split.
y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
svm_report = classification_report(y_test, y_pred)

print("Accuracy:", svm_accuracy)
print("\nClassification Report:\n", svm_report)

Accuracy: 0.9941704035874439

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      1935
           1       1.00      0.96      0.98       295

    accuracy                           0.99      2230
   macro avg       1.00      0.98      0.99      2230
weighted avg       0.99      0.99      0.99      2230



# Improving the Naive Bayes classifier

### Create a balanced dataset out of the combined dataset df2

In [64]:
# Number of rows drawn from each class to build the balanced set.
N_PER_CLASS = 1464

# Draw the same number of spam (label 1) and ham (label 0) rows, each with a
# fixed seed so the selection is reproducible.
spam_df = df2.loc[df2['label'].eq(1)].sample(n=N_PER_CLASS, random_state=42)
ham_df = df2.loc[df2['label'].eq(0)].sample(n=N_PER_CLASS, random_state=42)

# Stack the two halves, shuffle all rows, and renumber the index from zero.
# Note: this rebinds df2 -- from here on it is the balanced subset, not the
# full combined frame.
balanced = pd.concat([spam_df, ham_df])
shuffled = balanced.sample(frac=1, random_state=42)
df2 = shuffled.reset_index(drop=True)



In [65]:
# (rows, columns) of the balanced frame
df2.shape

(2928, 3)

In [74]:
# Rebuild features and labels from the balanced frame (df2 was rebound above).
X = np.vstack(df2['bert_embedding'].to_numpy())
y = df2['label'].to_numpy()

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Gaussian Naive Bayes classifier

In [75]:
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Naive Bayes classifier
model = GaussianNB()

# Fit the model to your data
model.fit(X, y)

# Make predictions
y_pred = model.predict(X)

In [78]:
# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:\n", classification_report(y, y_pred))

Accuracy: 0.7240437158469946
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.48      0.63      1464
           1       0.65      0.97      0.78      1464

    accuracy                           0.72      2928
   macro avg       0.79      0.72      0.71      2928
weighted avg       0.79      0.72      0.71      2928



## DecisionTreeClassifier

In [79]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [80]:
# Decision tree with a fixed seed, trained on the training split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X=X_train, y=y_train)



In [81]:
# Predict on the held-out split and score the decision tree.
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
dt_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1, printed under its heading.
print("Classification Report:", dt_report, sep="\n")

Accuracy: 0.9641638225255973
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.93      0.96       268
           1       0.95      0.99      0.97       318

    accuracy                           0.96       586
   macro avg       0.97      0.96      0.96       586
weighted avg       0.97      0.96      0.96       586



## RandomForestClassifier

In [82]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
# Random forest of 100 trees with a fixed seed, trained on the training split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X=X_train, y=y_train)

In [84]:
# Predict on the held-out split and score the random forest.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1, printed under its heading.
print("Classification Report:", rf_report, sep="\n")

Accuracy: 0.9795221843003413
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       268
           1       0.98      0.98      0.98       318

    accuracy                           0.98       586
   macro avg       0.98      0.98      0.98       586
weighted avg       0.98      0.98      0.98       586

