<a href="https://colab.research.google.com/github/Sowmyad15/SMS_Spam/blob/main/Downsampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
from sklearn.metrics import silhouette_score


In [2]:
df=pd.read_csv('/content/spam.csv',encoding='latin-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
df = df.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
df.columns = ["label", "message"]
df['label_num']=df['label'].map({
    'ham':0,
    'spam':1
})
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:

df_spam = df[df['label']=='spam']

df_ham = df[df['label']=='ham']

print("Ham Dataset Shape:", df_ham.shape)

print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 3)
Spam Dataset Shape: (747, 3)


In [5]:
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 3)

In [6]:
df= pd.concat([df_spam , df_ham_downsampled])

In [None]:
df['label_num'].value_counts()

1    747
0    747
Name: label_num, dtype: int64

In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)
df['preprocessed_txt'] = df['message'].apply(preprocess)
df.head()

Unnamed: 0,label,message,label_num,preprocessed_txt
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s..."


In [8]:
import torch
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
def get_bert_embedding(text):
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        output = model(**tokens.to(device))
    embeddings = output.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings


In [11]:
df['bert_embedding'] = df['preprocessed_txt'].apply(get_bert_embedding)


In [12]:
X = np.vstack(df['bert_embedding'].to_numpy())

In [13]:
X

array([[ 0.06960295, -0.02238685,  0.67938733, ..., -0.05759511,
        -0.0646834 ,  0.28856483],
       [ 0.156755  ,  0.07545407,  0.7172379 , ..., -0.24080218,
         0.03223718,  0.19361259],
       [ 0.08151105,  0.03403788,  0.38506815, ..., -0.28555986,
        -0.09771109,  0.2225598 ],
       ...,
       [ 0.06568925, -0.22949146, -0.14958945, ...,  0.07863455,
        -0.17543575, -0.07671051],
       [-0.09462243,  0.06525365,  0.6332099 , ..., -0.3998135 ,
        -0.11058348,  0.07225692],
       [-0.55583876, -0.42556778,  0.10936137, ..., -0.35835573,
        -0.14071417, -0.22445336]], dtype=float32)

In [14]:
len(X[0])

768

In [15]:
len(X)

1494

In [16]:
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(X)

In [17]:
reduced_embeddings

array([[-2.4480577 , -0.2086208 ],
       [-1.3041773 ,  1.5595833 ],
       [-2.3973844 , -1.4002914 ],
       ...,
       [ 5.777977  , -4.169127  ],
       [ 0.14309916,  2.0304022 ],
       [ 1.2191033 ,  1.3167881 ]], dtype=float32)

In [18]:
kmeans = KMeans(n_clusters=2, init='k-means++', random_state=42)
kmeans.fit(reduced_embeddings)



In [19]:
cluster_labels = kmeans.labels_

In [20]:
cluster_labels

array([1, 1, 1, ..., 0, 0, 0], dtype=int32)

In [21]:
silhouette_avg = silhouette_score(reduced_embeddings, cluster_labels)

In [22]:
print(f"Silhouette Score: {silhouette_avg}")

Silhouette Score: 0.5371034145355225


In [23]:
df['cluster']=kmeans.labels_

In [24]:
df

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...,"[[0.06960295, -0.022386853, 0.67938733, -0.242...",1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...,"[[0.156755, 0.07545407, 0.7172379, -0.19708195...",1
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...,"[[0.08151105, 0.034037884, 0.38506815, -0.1198...",1
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...,"[[0.14264826, -0.00024623596, 0.6654207, -0.07...",1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s...","[[0.23117736, -0.15237245, 0.77831644, -0.1032...",1
...,...,...,...,...,...,...
5052,ham,Lmao you know me so well...,0,Lmao know,"[[0.20578831, -0.01056812, 0.18737462, -0.1811...",0
637,ham,When Ì_ login dat time... Dad fetching Ì_ home...,0,Ì login dat time Dad fetch Ì home,"[[-0.24634726, 0.22145817, 0.66078526, -0.2109...",0
2695,ham,And whenever you and i see we can still hook u...,0,hook,"[[0.06568925, -0.22949146, -0.14958945, 0.0381...",0
2487,ham,K ill drink.pa then what doing. I need srs mod...,0,K ill drink.pa need srs model pls send mail d pa,"[[-0.094622426, 0.06525365, 0.6332099, -0.0224...",0


In [25]:
reduced_embeddings.shape

(1494, 2)

In [26]:
df['cluster'].shape

(1494,)

In [27]:
ham_indices = df[df['cluster'] == 0].index
spam_indices = df[df['cluster'] == 1].index
import matplotlib.pyplot as plt


plt.scatter(reduced_embeddings[ham_indices, 0], reduced_embeddings[ham_indices, 1], label='Ham', cmap='viridis', s=5)
plt.scatter(reduced_embeddings[spam_indices, 0], reduced_embeddings[spam_indices, 1], label='Spam', cmap='viridis', s=5)

plt.title('Visualization of SMS Clusters')
plt.legend()
plt.xlabel('PCA D1')
plt.ylabel('PCA  D2')
plt.show()

IndexError: ignored

In [29]:
from sklearn.metrics import accuracy_score,classification_report

accuracy=accuracy_score(df['label_num'],df['cluster'])
report=classification_report(df['label_num'], df['cluster'])

print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')


Accuracy: 0.9558232931726908
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       747
           1       0.95      0.96      0.96       747

    accuracy                           0.96      1494
   macro avg       0.96      0.96      0.96      1494
weighted avg       0.96      0.96      0.96      1494



In [30]:
df.head()

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...,"[[0.06960295, -0.022386853, 0.67938733, -0.242...",1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...,"[[0.156755, 0.07545407, 0.7172379, -0.19708195...",1
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...,"[[0.08151105, 0.034037884, 0.38506815, -0.1198...",1
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...,"[[0.14264826, -0.00024623596, 0.6654207, -0.07...",1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s...","[[0.23117736, -0.15237245, 0.77831644, -0.1032...",1


In [32]:
X = np.vstack(df['bert_embedding'].to_numpy())

pca = PCA(n_components=50)
X_pca = pca.fit_transform(X)

In [33]:
X_pca

array([[-2.4480643 , -0.20863794, -0.47632927, ..., -0.14038081,
        -0.0529055 ,  0.02110654],
       [-1.3041755 ,  1.5595844 , -0.21714509, ...,  0.04302064,
        -0.16252367,  0.12360511],
       [-2.3973858 , -1.400291  , -0.24944851, ..., -0.0134986 ,
        -0.1651088 ,  0.37378725],
       ...,
       [ 5.777971  , -4.169131  , -0.19857144, ...,  0.22261116,
        -0.23268846, -0.6001386 ],
       [ 0.14310142,  2.0304    , -0.54217005, ..., -0.4525913 ,
        -0.24479671,  0.06042355],
       [ 1.219105  ,  1.316788  ,  0.9064409 , ..., -0.45945746,
        -0.18347073,  0.18204355]], dtype=float32)

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, df['label_num'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [50]:
from sklearn.linear_model import LogisticRegression

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
y_val_pred = logreg.predict(X_val)
y_test_pred = logreg.predict(X_test)


In [None]:
y_train_pred=logreg.predict(X_train)

In [None]:
accuracy_val = logreg.score(X_val, y_val)
accuracy_test = logreg.score(X_test, y_test)


In [None]:
accuracy_train=logreg.score(X_train, y_train)

In [None]:
print("Validation Accuracy:", accuracy_val)
print("Test Accuracy:", accuracy_test)


Validation Accuracy: 0.9790794979079498
Test Accuracy: 0.959866220735786


In [None]:
print("Train Accuracy:", accuracy_train)

Train Accuracy: 0.9853556485355649


In [None]:
from sklearn.metrics import accuracy_score,classification_report

report_val=classification_report(y_val_pred,y_val)
print(report_val)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       122
           1       0.97      0.99      0.98       117

    accuracy                           0.98       239
   macro avg       0.98      0.98      0.98       239
weighted avg       0.98      0.98      0.98       239



In [None]:
report_val_c=classification_report(y_test,y_test_pred)
print(report_val_c)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       154
           1       0.97      0.95      0.96       145

    accuracy                           0.96       299
   macro avg       0.96      0.96      0.96       299
weighted avg       0.96      0.96      0.96       299



In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, df['cluster'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
y_val_pred = logreg.predict(X_val)
y_test_pred = logreg.predict(X_test)

In [None]:
accuracy_val = logreg.score(X_val, y_val)
accuracy_test = logreg.score(X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_val)
report_val=classification_report(y_val_pred,y_val)
print(report_val)

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       1.00      1.00      1.00       126

    accuracy                           1.00       239
   macro avg       1.00      1.00      1.00       239
weighted avg       1.00      1.00      1.00       239



In [None]:
report_val_c=classification_report(y_test,y_test_pred)
print(accuracy_test)
print(report_val_c)

0.9933110367892977
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       150
           1       1.00      0.99      0.99       149

    accuracy                           0.99       299
   macro avg       0.99      0.99      0.99       299
weighted avg       0.99      0.99      0.99       299



In [None]:

X = np.array(df['bert_embedding'].tolist())

In [None]:
y = np.array(df['cluster'])

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [None]:
X_train.shape

(1045, 1, 768)

In [None]:
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
X_val_reshaped = X_val.reshape(X_val.shape[0], -1)
X_test_reshaped = X_test.reshape(X_test.shape[0], -1)

In [None]:
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_reshaped)
X_val_pca = pca.transform(X_val_reshaped)
X_test_pca = pca.transform(X_test_reshaped)


In [None]:

logistic_model = LogisticRegression()
logistic_model.fit(X_train_pca, y_train)



In [None]:
y_val_pred = logistic_model.predict(X_val_pca)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.2f}')

report = classification_report(y_val, y_val_pred)
print('Classification Report:\n', report)

Validation Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       119
           1       1.00      1.00      1.00       105

    accuracy                           1.00       224
   macro avg       1.00      1.00      1.00       224
weighted avg       1.00      1.00      1.00       224



In [None]:
y_test_pred = logistic_model.predict(X_test_pca)

accuracy_test = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {accuracy_test:.2f}')

report_test = classification_report(y_test, y_test_pred)
print('Test Classification Report:\n', report_test)

Test Accuracy: 1.00
Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       111
           1       1.00      1.00      1.00       114

    accuracy                           1.00       225
   macro avg       1.00      1.00      1.00       225
weighted avg       1.00      1.00      1.00       225



In [60]:
from sklearn.model_selection import train_test_split
traintest_data, val_data = train_test_split(df,test_size=0.2)

In [61]:
df_u=traintest_data

In [63]:
df_u.head()

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
5012,ham,"Uncle G, just checking up on you. Do have a re...",0,Uncle G check rewarding month,"[[-0.0066569746, -0.3955102, 0.40341333, -0.12...",0
3734,ham,"ItÛ÷s å£6 to get in, is that ok?",0,itû÷s å£6 ok,"[[-0.15679595, 0.13057539, 0.52779335, -0.4074...",1
830,spam,U have a secret admirer. REVEAL who thinks U R...,1,u secret admirer REVEAL think U R special 0906...,"[[0.105451256, -0.15613061, 0.7577801, -0.2316...",1
3441,spam,Save money on wedding lingerie at www.bridal.p...,1,save money wedding lingerie www.bridal.pettico...,"[[0.32006028, -0.29678413, 0.83792084, 0.10138...",1
3300,spam,RCT' THNQ Adrian for U text. Rgds Vatian,1,RCT thnq Adrian u text Rgds Vatian,"[[0.0054607256, -0.10947789, 0.5937615, -0.149...",0


In [64]:
val_data.head()

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
2848,spam,YOUR CHANCE TO BE ON A REALITY FANTASY SHOW ca...,1,chance reality fantasy = 08707509020 20p min N...,"[[0.20324457, -0.21100028, 0.47878477, -0.3558...",1
3860,spam,Free Msg: Ringtone!From: http://tms. widelive....,1,Free Msg ringtone!from http://tms widelive.com...,"[[0.2358737, -0.13406031, 0.48625103, -0.22509...",1
2487,ham,K ill drink.pa then what doing. I need srs mod...,0,K ill drink.pa need srs model pls send mail d pa,"[[-0.094622426, 0.06525365, 0.6332099, -0.0224...",0
4310,ham,It so happens that there r 2waxsto do wat you ...,0,happen r 2waxsto wat want come ill medical ins...,"[[0.22366089, 0.062169246, 1.1797861, -0.18018...",0
5200,spam,Call Germany for only 1 pence per minute! Call...,1,Germany 1 penny minute fix line access number ...,"[[0.20101507, -0.09836525, 0.14895153, 0.12973...",1


In [68]:
X = np.vstack(df_u['bert_embedding'].to_numpy())

pca = PCA(n_components=50)
X_pca = pca.fit_transform(X)

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, df_u['label_num'], test_size=0.2, random_state=42)

In [70]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [73]:
pred=logreg.predict(X_test)

accuracy=accuracy_score(y_test,pred)
report=classification_report(y_test,pred)

print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

conf_matrix = confusion_matrix(y_test, pred)

print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.9456066945606695
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       124
           1       0.96      0.93      0.94       115

    accuracy                           0.95       239
   macro avg       0.95      0.95      0.95       239
weighted avg       0.95      0.95      0.95       239

Confusion Matrix:
[[119   5]
 [  8 107]]


In [84]:
y_train.value_counts()

1    479
0    477
Name: label_num, dtype: int64

In [85]:
y_test.value_counts()

0    124
1    115
Name: label_num, dtype: int64

Validation_Data-Predict

In [83]:
val_data['label_num'].value_counts()

1    153
0    146
Name: label_num, dtype: int64

In [78]:
X_valid = np.vstack(val_data['bert_embedding'].to_numpy())

pca = PCA(n_components=50)
X_pca_valid = pca.fit_transform(X_valid)

In [79]:
pred_v=logreg.predict(X_pca_valid)

In [81]:
pred_v

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0])

In [82]:
val_data['label_num']

2848    1
3860    1
2487    0
4310    0
5200    1
       ..
2250    0
2807    0
5237    1
5493    0
162     0
Name: label_num, Length: 299, dtype: int64

In [80]:
accuracy=accuracy_score(val_data['label_num'],pred_v)
report=classification_report(val_data['label_num'],pred_v)

print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

conf_matrix = confusion_matrix(val_data['label_num'],pred_v)

print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.9498327759197325
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       146
           1       0.95      0.95      0.95       153

    accuracy                           0.95       299
   macro avg       0.95      0.95      0.95       299
weighted avg       0.95      0.95      0.95       299

Confusion Matrix:
[[138   8]
 [  7 146]]
