<a href="https://colab.research.google.com/github/Sowmyad15/SMS_Spam/blob/main/Downsampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
from sklearn.metrics import silhouette_score


In [None]:
# Read the raw SMS dataset (decoded as latin-1) and display it.
df = pd.read_csv(
    '/content/spam.csv',
    encoding='latin-1',
)
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
# Discard the three unnamed filler columns, give the two remaining columns
# readable names, and derive a numeric target (ham -> 0, spam -> 1).
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
df.columns = ["label", "message"]
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:

# Separate the two classes and report how many rows each one has.
df_spam = df.loc[df['label'] == 'spam']
df_ham = df.loc[df['label'] == 'ham']

print("Ham Dataset Shape:", df_ham.shape)
print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 3)
Spam Dataset Shape: (747, 3)


In [None]:
# Randomly keep as many ham rows as there are spam rows so that both classes
# are equally represented. The fixed random_state makes the draw (and every
# result computed from it further down) reproducible across runs.
df_ham_downsampled = df_ham.sample(n=len(df_spam), random_state=42)
df_ham_downsampled.shape

(747, 3)

In [None]:
# Stack the spam rows on top of the downsampled ham rows. The row labels of
# the original frame are retained (no ignore_index), so the index is not a
# contiguous 0..n-1 range after this step.
df= pd.concat([df_spam , df_ham_downsampled])

In [None]:
# Count rows per numeric label to confirm the two classes are balanced.
df['label_num'].value_counts()

1    747
0    747
Name: label_num, dtype: int64

In [None]:
import spacy

# Load spaCy's small English pipeline; `preprocess` runs every message through it.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` lemmatized, with stop words and punctuation removed.

    The text is run through the module-level spaCy pipeline `nlp`; the lemmas
    of the surviving tokens are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)
# Run `preprocess` on every message and keep the result in a new column.
df['preprocessed_txt'] = df['message'].apply(preprocess)
df.head()

Unnamed: 0,label,message,label_num,preprocessed_txt
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s..."


In [None]:
import torch
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder, and move the
# encoder to the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def get_bert_embedding(text):
    """Embed text with BERT by averaging the final-layer token vectors.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array with one row per input text (a single string yields a
        one-row array) and one column per hidden dimension of the model.

    Uses the module-level `tokenizer`, `model` and `device`. Positions that
    the tokenizer marks as padding are excluded from the average, so texts of
    different lengths can be embedded together without the padding skewing
    the shorter ones. For a single string there is no padding and the result
    is the plain mean over all tokens.
    """
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        output = model(**tokens)
    hidden = output.last_hidden_state
    # 1 for real tokens, 0 for padding; add a trailing axis so it broadcasts
    # across the hidden dimension.
    mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero should a row contain no real tokens.
    counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (summed / counts).cpu().numpy()
    return embeddings


In [None]:
# Embed every preprocessed message, one model call per row. Each cell of the
# new column holds a (1, 768) array.
df['bert_embedding'] = df['preprocessed_txt'].apply(get_bert_embedding)


In [None]:
# Stack the per-message (1, 768) embeddings row-wise into one 2-D matrix.
X = np.vstack(df['bert_embedding'].tolist())

In [None]:
# Peek at the stacked embedding matrix.
X

array([[ 6.9602951e-02, -2.2386853e-02,  6.7938733e-01, ...,
        -5.7595111e-02, -6.4683400e-02,  2.8856483e-01],
       [ 1.5675500e-01,  7.5454071e-02,  7.1723789e-01, ...,
        -2.4080218e-01,  3.2237180e-02,  1.9361259e-01],
       [ 8.1511050e-02,  3.4037884e-02,  3.8506815e-01, ...,
        -2.8555986e-01, -9.7711086e-02,  2.2255979e-01],
       ...,
       [-2.3314089e-01, -2.6617995e-01,  3.0464122e-01, ...,
         1.3567924e-01, -1.2334920e-01,  3.4661299e-01],
       [ 2.7611202e-01,  1.2190273e-01, -2.6327783e-01, ...,
         8.8233314e-02, -4.1775116e-01, -1.7188340e-02],
       [ 3.6202389e-01, -8.4947050e-04,  6.6100419e-01, ...,
        -2.3742676e-01,  2.1207321e-01, -6.4399838e-04]], dtype=float32)

In [None]:
# Number of features per embedding (columns of X).
X.shape[1]

768

In [None]:
# Number of embedded messages (rows of X).
X.shape[0]

1494

In [None]:
# Project the 768-dimensional embeddings onto their first two principal
# components. For a matrix this large and so few components scikit-learn's
# default solver choice is the randomized SVD, so random_state is fixed to
# make the projection (and the clustering built on it) reproducible.
pca = PCA(n_components=2, random_state=42)
reduced_embeddings = pca.fit_transform(X)

In [None]:
# Peek at the 2-D projection.
reduced_embeddings

array([[-2.4025276 , -0.12598366],
       [-1.2582215 ,  1.7298565 ],
       [-2.3668518 , -1.3878269 ],
       ...,
       [ 4.545595  , -3.224718  ],
       [ 5.602863  , -3.8759453 ],
       [ 1.2585615 ,  1.6135795 ]], dtype=float32)

In [None]:
# Partition the 2-D points into two clusters. n_init is set explicitly because
# its default differs between scikit-learn releases; pinning it keeps the
# result stable across versions. random_state fixes the centroid seeding.
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=42)
kmeans.fit(reduced_embeddings)



In [None]:
# Cluster id that KMeans assigned to each row of reduced_embeddings.
cluster_labels = kmeans.labels_

In [None]:
# Inspect the cluster assignments.
cluster_labels

array([1, 1, 1, ..., 0, 0, 0], dtype=int32)

In [None]:
# Silhouette score of the 2-D points under the KMeans assignment.
silhouette_avg = silhouette_score(reduced_embeddings, cluster_labels)

In [None]:
# Report the clustering quality.
print("Silhouette Score: {}".format(silhouette_avg))

Silhouette Score: 0.5284448862075806


In [None]:
# Store each message's cluster id on the frame. The labels are assigned by
# position, in the same row order the embeddings were stacked in.
df['cluster']=kmeans.labels_

In [None]:
# Show the frame with the new cluster column.
df

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...,"[[0.06960295, -0.022386853, 0.67938733, -0.242...",1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...,"[[0.156755, 0.07545407, 0.7172379, -0.19708195...",1
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...,"[[0.08151105, 0.034037884, 0.38506815, -0.1198...",1
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...,"[[0.14264826, -0.00024623596, 0.6654207, -0.07...",1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s...","[[0.23117736, -0.15237245, 0.77831644, -0.1032...",1
...,...,...,...,...,...,...
3308,ham,Okie Ì_ wan meet at bishan? Cos me at bishan n...,0,okie Ì wan meet bishan cos bishan drive today,"[[0.12656777, 0.060213927, 0.7551852, -0.33069...",0
993,ham,The Xmas story is peace.. The Xmas msg is love...,0,Xmas story peace Xmas msg love Xmas miracle je...,"[[0.073731326, 0.236083, 1.2705761, -0.0836968...",1
758,ham,U should have made an appointment,0,u appointment,"[[-0.23314089, -0.26617995, 0.30464122, 0.0021...",0
4880,ham,When/where do I pick you up,0,pick,"[[0.27611202, 0.12190273, -0.26327783, -0.0452...",0


In [None]:
# Sanity check: one 2-D point per message.
reduced_embeddings.shape

(1494, 2)

In [None]:
# Sanity check: one cluster id per message.
df['cluster'].shape

(1494,)

In [None]:
import matplotlib.pyplot as plt

# `reduced_embeddings` is a plain NumPy array and must be indexed by position,
# whereas `df.index` still carries the row labels of the original CSV (values
# far beyond the number of rows here). Build positional boolean selectors from
# the cluster column instead of using `.index`, which raised an IndexError.
cluster_ids = df['cluster'].to_numpy()
ham_indices = cluster_ids == 0
spam_indices = cluster_ids == 1

# NOTE(review): the 'Ham'/'Spam' legend entries name clusters 0 and 1; KMeans
# numbers clusters arbitrarily, so confirm the mapping against the true labels.
# `cmap` is not passed because it has no effect without per-point colour values.
plt.scatter(reduced_embeddings[ham_indices, 0], reduced_embeddings[ham_indices, 1], label='Ham', s=5)
plt.scatter(reduced_embeddings[spam_indices, 0], reduced_embeddings[spam_indices, 1], label='Spam', s=5)

plt.title('Visualization of SMS Clusters')
plt.legend()
plt.xlabel('PCA D1')
plt.ylabel('PCA  D2')
plt.show()

IndexError: ignored

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# KMeans numbers its clusters arbitrarily, so cluster 1 is not guaranteed to
# be the spam cluster. With two clusters the only alternative is the swapped
# numbering, so flip the ids when that agrees better with the true labels.
# Without this, an unlucky numbering reports the complement of the real score.
y_true = df['label_num'].to_numpy()
cluster_pred = df['cluster'].to_numpy()
if accuracy_score(y_true, cluster_pred) < 0.5:
    cluster_pred = 1 - cluster_pred

accuracy=accuracy_score(y_true, cluster_pred)
report=classification_report(y_true, cluster_pred)

print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')


Accuracy: 0.9564926372155288
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       747
           1       0.95      0.96      0.96       747

    accuracy                           0.96      1494
   macro avg       0.96      0.96      0.96      1494
weighted avg       0.96      0.96      0.96      1494



In [None]:
# Re-display the first rows before building the supervised models.
df.head()

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...,"[[0.06960295, -0.022386853, 0.67938733, -0.242...",1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...,"[[0.156755, 0.07545407, 0.7172379, -0.19708195...",1
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...,"[[0.08151105, 0.034037884, 0.38506815, -0.1198...",1
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...,"[[0.14264826, -0.00024623596, 0.6654207, -0.07...",1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s...","[[0.23117736, -0.15237245, 0.77831644, -0.1032...",1


In [None]:
# Rebuild the 2-D embedding matrix and compress it to 50 principal components
# as features for the classifiers.
X = np.vstack(df['bert_embedding'].to_numpy())

# random_state pins the randomized SVD that scikit-learn selects by default
# for this matrix size, so the features are identical on every run.
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X)

In [None]:
# Peek at the 50-component features.
X_pca

array([[-2.4025187e+00, -1.2599185e-01, -3.9257410e-01, ...,
         4.7478836e-02,  5.3428609e-02,  3.6483070e-01],
       [-1.2582239e+00,  1.7298551e+00,  2.0069559e-01, ...,
         1.2771236e-02, -2.7928191e-01, -1.9169475e-01],
       [-2.3668487e+00, -1.3878279e+00, -1.9765352e-01, ...,
         1.1642674e-01, -6.8505101e-02,  5.3321067e-03],
       ...,
       [ 4.5455995e+00, -3.2247121e+00, -5.9597653e-01, ...,
         2.0585996e-01, -3.2079163e-01, -4.8392165e-01],
       [ 5.6028666e+00, -3.8759377e+00,  3.0271924e-01, ...,
        -4.5827311e-01, -8.2221985e-01,  5.4086566e-02],
       [ 1.2585585e+00,  1.6135794e+00,  4.3737224e-01, ...,
         1.3911632e-01, -1.9158071e-01,  7.8277123e-01]], dtype=float32)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing, then 20% of the remainder for
# validation; the target is the true ham/spam label.
# NOTE(review): X_pca was fit on all rows before this split, so the PCA
# projection has already seen the validation and test rows.
X_train, X_test, y_train, y_test = train_test_split(X_pca, df['label_num'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
# Fit a default-configured logistic regression on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
# Predict labels for the validation and test splits.
y_val_pred = logreg.predict(X_val)
y_test_pred = logreg.predict(X_test)


In [None]:
# Predict labels for the training split as well.
y_train_pred=logreg.predict(X_train)

In [None]:
# Accuracy of the fitted model on the validation and test splits.
accuracy_val = logreg.score(X_val, y_val)
accuracy_test = logreg.score(X_test, y_test)


In [None]:
# Accuracy on the training split, for comparison with the held-out scores.
accuracy_train=logreg.score(X_train, y_train)

In [None]:
# Report the held-out accuracies.
print("Validation Accuracy:", accuracy_val)
print("Test Accuracy:", accuracy_test)


Validation Accuracy: 0.9790794979079498
Test Accuracy: 0.959866220735786


In [None]:
# Report the training accuracy.
print("Train Accuracy:", accuracy_train)

Train Accuracy: 0.9853556485355649


In [None]:
from sklearn.metrics import accuracy_score,classification_report

# Per-class precision/recall/F1 on the validation split. classification_report
# expects (y_true, y_pred); passing them the other way round swaps precision
# with recall and reports the predicted class counts as support.
report_val=classification_report(y_val, y_val_pred)
print(report_val)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       122
           1       0.97      0.99      0.98       117

    accuracy                           0.98       239
   macro avg       0.98      0.98      0.98       239
weighted avg       0.98      0.98      0.98       239



In [None]:
# Per-class report for the test split (the variable is named report_val_c,
# but it is computed from y_test and y_test_pred).
report_val_c=classification_report(y_test,y_test_pred)
print(report_val_c)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       154
           1       0.97      0.95      0.96       145

    accuracy                           0.96       299
   macro avg       0.96      0.96      0.96       299
weighted avg       0.96      0.96      0.96       299



In [None]:
from sklearn.model_selection import train_test_split
# Same 80/20 then 80/20 split as before, but the target is now the KMeans
# cluster id rather than the true label.
# NOTE(review): the cluster ids were themselves derived from a PCA projection
# of these embeddings, so a classifier here learns to reproduce the
# clustering; its scores do not measure spam-detection quality.
X_train, X_test, y_train, y_test = train_test_split(X_pca, df['cluster'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Refit a default logistic regression, this time against the cluster ids.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
# Predict cluster ids for the validation and test splits.
y_val_pred = logreg.predict(X_val)
y_test_pred = logreg.predict(X_test)

In [None]:
# Accuracy against the cluster ids on the validation and test splits.
accuracy_val = logreg.score(X_val, y_val)
accuracy_test = logreg.score(X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report
# Validation accuracy and per-class report against the cluster ids.
# classification_report expects (y_true, y_pred); the arguments were
# previously reversed, which swaps precision with recall.
print(accuracy_val)
report_val=classification_report(y_val, y_val_pred)
print(report_val)

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       1.00      1.00      1.00       126

    accuracy                           1.00       239
   macro avg       1.00      1.00      1.00       239
weighted avg       1.00      1.00      1.00       239



In [None]:
# Test accuracy and per-class report against the cluster ids (the variable is
# named report_val_c, but it is computed from the test split).
report_val_c=classification_report(y_test,y_test_pred)
print(accuracy_test)
print(report_val_c)

0.9933110367892977
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       150
           1       1.00      0.99      0.99       149

    accuracy                           0.99       299
   macro avg       0.99      0.99      0.99       299
weighted avg       0.99      0.99      0.99       299



In [None]:

# Collect the per-message embeddings into one array. Each entry is (1, 768),
# so the result is 3-D and is flattened after the split.
X = np.array(df['bert_embedding'].tolist())

In [None]:
# Target: the KMeans cluster id of each message.
y = np.array(df['cluster'])

In [None]:
# Keep 70% for training; split the remaining 30% evenly into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [None]:
# Inspect the training array's shape before flattening.
X_train.shape

(1045, 1, 768)

In [None]:
# Flatten everything after the first axis so each sample is one feature vector.
X_train_reshaped = X_train.reshape(len(X_train), -1)
X_val_reshaped = X_val.reshape(len(X_val), -1)
X_test_reshaped = X_test.reshape(len(X_test), -1)

In [None]:
# Fit the 2-component PCA on the training split only, then apply that same
# projection to the validation and test splits. random_state pins the
# randomized SVD scikit-learn selects by default for this matrix size, so the
# projected features are reproducible.
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train_reshaped)
X_val_pca = pca.transform(X_val_reshaped)
X_test_pca = pca.transform(X_test_reshaped)


In [None]:

# Fit a default logistic regression on the 2-D training features; the target
# is the cluster id.
logistic_model = LogisticRegression()
logistic_model.fit(X_train_pca, y_train)



In [None]:
# Evaluate on the validation split: overall accuracy (printed to two
# decimals) followed by the per-class report.
y_val_pred = logistic_model.predict(X_val_pca)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.2f}')

report = classification_report(y_val, y_val_pred)
print('Classification Report:\n', report)

Validation Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       119
           1       1.00      1.00      1.00       105

    accuracy                           1.00       224
   macro avg       1.00      1.00      1.00       224
weighted avg       1.00      1.00      1.00       224



In [None]:
# Evaluate on the test split: overall accuracy (printed to two decimals)
# followed by the per-class report.
y_test_pred = logistic_model.predict(X_test_pca)

accuracy_test = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {accuracy_test:.2f}')

report_test = classification_report(y_test, y_test_pred)
print('Test Classification Report:\n', report_test)

Test Accuracy: 1.00
Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       111
           1       1.00      1.00      1.00       114

    accuracy                           1.00       225
   macro avg       1.00      1.00      1.00       225
weighted avg       1.00      1.00      1.00       225

