<a href="https://colab.research.google.com/github/Sowmyad15/SMS_Spam/blob/main/Downsampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BALANCING DATA

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
from sklearn.metrics import silhouette_score


In [None]:
# Read the raw SMS dataset (decoded as latin-1) and show it.
raw_csv_path = '/content/spam.csv'
df = pd.read_csv(raw_csv_path, encoding='latin-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
# Drop the three unnamed filler columns and give the remaining two readable names.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
df.columns = ["label", "message"]

# Numeric target: ham -> 0, spam -> 1.
label_to_num = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_to_num)
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:

# Separate the two classes so the majority class can be downsampled.
spam_rows = df['label'] == 'spam'
ham_rows = df['label'] == 'ham'
df_spam = df[spam_rows]
df_ham = df[ham_rows]

print("Ham Dataset Shape:", df_ham.shape)
print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 3)
Spam Dataset Shape: (747, 3)


In [None]:
# Undersample ham to the spam row count so both classes are the same size.
# random_state pins the draw: without it every "Restart & Run All" picks a
# different ham subset and every downstream metric changes.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_ham_downsampled.shape

(747, 3)

In [None]:
# Balanced dataset: all spam rows followed by the sampled ham rows (original index kept).
balanced_parts = [df_spam, df_ham_downsampled]
df = pd.concat(balanced_parts)

In [None]:
df['label_num'].value_counts()

1    747
0    747
Name: label_num, dtype: int64

In [None]:
import spacy

# spaCy pipeline used by preprocess() below for tokenization, stop-word/punctuation flags and lemmas.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is parsed with the notebook-level spaCy pipeline `nlp`; the lemma
    of every surviving token is kept and the lemmas are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)
# Clean every message once and keep the result next to the raw text.
raw_messages = df['message']
df['preprocessed_txt'] = raw_messages.apply(preprocess)
df.head()

Unnamed: 0,label,message,label_num,preprocessed_txt
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s..."


In [None]:
import torch

# Run BERT on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [None]:
# Tokenizer and encoder come from the same pretrained checkpoint; the model is moved to `device`.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def get_bert_embedding(text):
    """Embed one text with BERT by mean-pooling the last hidden layer.

    `text` is tokenized (truncated to the tokenizer's limit), run through the
    notebook-level `model` without gradient tracking, and `last_hidden_state`
    is averaged over dim=1. The result is returned as a NumPy array on the CPU;
    for a single string it keeps a leading batch axis of size 1.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    encoded = encoded.to(device)
    with torch.no_grad():
        bert_output = model(**encoded)
    pooled = bert_output.last_hidden_state.mean(dim=1)
    return pooled.cpu().numpy()


In [None]:
# One BERT forward pass per cleaned message; each cell of the new column holds that message's embedding array.
cleaned_texts = df['preprocessed_txt']
df['bert_embedding'] = cleaned_texts.apply(get_bert_embedding)


In [None]:
# Join the per-row embedding arrays along the first axis into one 2-D feature matrix.
per_row_embeddings = list(df['bert_embedding'])
X = np.concatenate(per_row_embeddings, axis=0)

In [None]:
X

array([[ 0.06960295, -0.02238685,  0.67938733, ..., -0.05759511,
        -0.0646834 ,  0.28856483],
       [ 0.156755  ,  0.07545407,  0.7172379 , ..., -0.24080218,
         0.03223718,  0.19361259],
       [ 0.08151105,  0.03403788,  0.38506815, ..., -0.28555986,
        -0.09771109,  0.2225598 ],
       ...,
       [-0.01613099,  0.08003004,  0.43793458, ..., -0.14571533,
         0.19558555, -0.02857584],
       [-0.16056214,  0.2330256 ,  0.78816724, ..., -0.39043212,
        -0.09360802,  0.29595158],
       [-0.02864446,  0.06673483,  0.02676702, ..., -0.3282245 ,
        -0.25967473, -0.26254985]], dtype=float32)

In [None]:
len(X[0])

768

In [None]:
len(X)

1494

In [None]:
# Project the embeddings onto their first two principal components for clustering.
# For a matrix this large with so few components sklearn's default solver choice
# is the randomized SVD, so random_state is set to make the projection reproducible.
pca = PCA(n_components=2, random_state=42)
reduced_embeddings = pca.fit_transform(X)

In [None]:
reduced_embeddings

array([[-2.4167936 , -0.09610669],
       [-1.2334609 ,  1.8077601 ],
       [-2.4063478 , -1.3836381 ],
       ...,
       [ 3.2084515 , -1.2657431 ],
       [ 1.0770724 ,  2.9945905 ],
       [ 4.3153563 , -2.4667308 ]], dtype=float32)

In [None]:
# Two clusters, intended to line up with ham/spam. n_init is set explicitly because
# its default differs between scikit-learn releases; pinning it keeps the clustering
# the same regardless of the installed version.
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=42)
kmeans.fit(reduced_embeddings)



In [None]:
# Per-row cluster id (0 or 1) assigned by the fitted K-Means model.
cluster_labels = kmeans.labels_

In [None]:
cluster_labels

array([1, 1, 1, ..., 0, 0, 0], dtype=int32)

In [None]:
# Mean silhouette over all rows: how well separated the two clusters are in the 2-D PCA space.
silhouette_avg = silhouette_score(X=reduced_embeddings, labels=cluster_labels)

In [None]:
print(f"Silhouette Score: {silhouette_avg}")

Silhouette Score: 0.5377438068389893


In [None]:
# Attach the K-Means assignment to each message. The array is positional: it follows
# the row order of df that was used to build X, not the DataFrame index values.
df['cluster']=kmeans.labels_

In [None]:
df

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...,"[[0.06960295, -0.022386853, 0.67938733, -0.242...",1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...,"[[0.156755, 0.07545407, 0.7172379, -0.19708195...",1
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...,"[[0.08151105, 0.034037884, 0.38506815, -0.1198...",1
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...,"[[0.14264826, -0.00024623596, 0.6654207, -0.07...",1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s...","[[0.23117736, -0.15237245, 0.77831644, -0.1032...",1
...,...,...,...,...,...,...
2244,ham,No management puzzeles.,0,management puzzele,"[[0.004228411, -0.51557523, 0.39472747, -0.119...",0
1667,ham,So now my dad is gonna call after he gets out ...,0,dad go to get work ask crazy question,"[[0.122247435, -0.22601874, 0.3324001, -0.1873...",0
2515,ham,Bognor it is! Should be splendid at this time ...,0,bognor splendid time year,"[[-0.016130988, 0.08003004, 0.43793458, -0.287...",0
2336,ham,\Aww you must be nearly dead!Well Jez isComing...,0,\aww nearly dead!well Jez isComing toDo workan...,"[[-0.16056214, 0.2330256, 0.78816724, -0.30544...",0


In [None]:
reduced_embeddings.shape

(1494, 2)

In [None]:
df['cluster'].shape

(1494,)

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# K-Means cluster ids are arbitrary: nothing guarantees that cluster 1 is "spam".
# Map every cluster to the majority true label among its members before scoring;
# comparing the raw ids would report roughly (1 - accuracy) whenever the ids come
# out flipped. df['cluster'] itself is left untouched.
cluster_to_label = df.groupby('cluster')['label_num'].agg(lambda labels: labels.mode().iloc[0])
cluster_as_label = df['cluster'].map(cluster_to_label)

accuracy=accuracy_score(df['label_num'],cluster_as_label)
report=classification_report(df['label_num'], cluster_as_label)

print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')


Accuracy: 0.963186077643909
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       747
           1       0.96      0.96      0.96       747

    accuracy                           0.96      1494
   macro avg       0.96      0.96      0.96      1494
weighted avg       0.96      0.96      0.96      1494



# BERT EMBEDDING + SUPERVISED LEARNING - Train, test, validate


In [None]:
df.head()

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...,"[[0.06960295, -0.022386853, 0.67938733, -0.242...",1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...,"[[0.156755, 0.07545407, 0.7172379, -0.19708195...",1
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...,"[[0.08151105, 0.034037884, 0.38506815, -0.1198...",1
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...,"[[0.14264826, -0.00024623596, 0.6654207, -0.07...",1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s...","[[0.23117736, -0.15237245, 0.77831644, -0.1032...",1


In [None]:
X = np.vstack(df['bert_embedding'].to_numpy())

# Reduce the embeddings to 50 components for the classifier. random_state makes the
# randomized SVD (sklearn's default choice for this size) reproducible.
# NOTE(review): this PCA is fitted on every row, including the rows that are later
# held out as validation/test, so the held-out scores are slightly optimistic.
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X)

In [None]:
X_pca

array([[-2.41679859e+00, -9.61243212e-02, -4.43759799e-01, ...,
        -5.34525104e-02, -1.33263497e-02, -1.23988286e-01],
       [-1.23346174e+00,  1.80776072e+00,  1.08721308e-01, ...,
         1.37850091e-01,  1.02005333e-01,  2.61431336e-01],
       [-2.40634775e+00, -1.38363898e+00, -3.59995186e-01, ...,
        -9.73643437e-02, -3.60548198e-01, -1.03819661e-01],
       ...,
       [ 3.20845366e+00, -1.26574552e+00, -9.62587357e-01, ...,
         6.96999133e-01, -7.03771830e-01,  5.96493036e-02],
       [ 1.07707179e+00,  2.99458647e+00, -2.23072219e+00, ...,
        -4.51216102e-01, -2.17613786e-01, -7.07954228e-01],
       [ 4.31535482e+00, -2.46672988e+00, -3.26296641e-03, ...,
        -4.41891998e-01, -6.01976156e-01, -7.38484502e-01]], dtype=float32)

In [None]:
from sklearn.model_selection import train_test_split

# First hold out 20% as the test set, then carve 20% of the remainder off as validation.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_pca, df['label_num'], test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.2, random_state=42
)


In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the training partition.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# Predict both held-out partitions with the fitted model.
y_val_pred, y_test_pred = (logreg.predict(part) for part in (X_val, X_test))


In [None]:
# Predictions on the same rows the model was fitted on.
y_train_pred=logreg.predict(X_train)

In [None]:
# Accuracy on the validation and test partitions.
accuracy_val, accuracy_test = (
    logreg.score(features, target)
    for features, target in ((X_val, y_val), (X_test, y_test))
)


In [None]:
# Accuracy on the training rows, for comparison with the held-out scores.
accuracy_train=logreg.score(X_train, y_train)

In [None]:
print("Validation Accuracy:", accuracy_val)
print("Test Accuracy:", accuracy_test)


Validation Accuracy: 0.9832635983263598
Test Accuracy: 0.9665551839464883


In [None]:
print("Train Accuracy:", accuracy_train)

Train Accuracy: 0.9822175732217573


In [None]:
from sklearn.metrics import accuracy_score,classification_report

# classification_report expects (y_true, y_pred). With the arguments the other way
# round, per-class precision and recall trade places and "support" counts the
# predictions instead of the true labels.
report_val=classification_report(y_val,y_val_pred)
print(report_val)

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       121
           1       0.97      0.99      0.98       118

    accuracy                           0.98       239
   macro avg       0.98      0.98      0.98       239
weighted avg       0.98      0.98      0.98       239



In [None]:
# Per-class metrics on the held-out test partition (true labels first, predictions second).
report_val_c = classification_report(y_true=y_test, y_pred=y_test_pred)
print(report_val_c)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       154
           1       0.98      0.95      0.97       145

    accuracy                           0.97       299
   macro avg       0.97      0.97      0.97       299
weighted avg       0.97      0.97      0.97       299



In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 then 80/20 split as before, but the target is now the K-Means
# cluster id rather than the ham/spam label.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_pca, df['cluster'], test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.2, random_state=42
)

In [None]:
# Refit a fresh default logistic regression, this time against the cluster-id target.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# Predict both held-out partitions with the cluster-target model.
y_val_pred, y_test_pred = (logreg.predict(part) for part in (X_val, X_test))

In [None]:
# Accuracy of the cluster-target model on the validation and test partitions.
accuracy_val, accuracy_test = (
    logreg.score(features, target)
    for features, target in ((X_val, y_val), (X_test, y_test))
)

In [None]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_val)
# classification_report expects (y_true, y_pred); passing them reversed swaps
# per-class precision and recall and reports support for the predictions.
report_val=classification_report(y_val,y_val_pred)
print(report_val)

0.9916317991631799
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       122
           1       0.99      0.99      0.99       117

    accuracy                           0.99       239
   macro avg       0.99      0.99      0.99       239
weighted avg       0.99      0.99      0.99       239



In [None]:
# Test-partition metrics for the cluster-target model.
report_val_c = classification_report(y_true=y_test, y_pred=y_test_pred)
print(accuracy_test)
print(report_val_c)

0.9966555183946488
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       157
           1       0.99      1.00      1.00       142

    accuracy                           1.00       299
   macro avg       1.00      1.00      1.00       299
weighted avg       1.00      1.00      1.00       299



In [None]:

# Stack the per-row embedding arrays along a new leading axis (each row keeps its own shape).
X = np.stack(df['bert_embedding'].tolist(), axis=0)

In [None]:
# Target for this experiment is the K-Means cluster id, as an independent NumPy array.
y = df['cluster'].to_numpy(copy=True)

In [None]:
# 70% train; the remaining 30% is halved into validation and test (15% each).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


In [None]:
X_train.shape

(1045, 1, 768)

In [None]:
# Flatten everything after the sample axis so each row becomes a single feature vector.
X_train_reshaped = X_train.reshape(len(X_train), -1)
X_val_reshaped = X_val.reshape(len(X_val), -1)
X_test_reshaped = X_test.reshape(len(X_test), -1)

In [None]:
# Fit the 2-component PCA on the training partition only and reuse that same
# projection for validation and test. random_state makes the randomized SVD
# (sklearn's default choice for this size) reproducible.
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train_reshaped)
X_val_pca = pca.transform(X_val_reshaped)
X_test_pca = pca.transform(X_test_reshaped)


In [None]:

# Default logistic regression on the 2-D PCA features of the training partition.
logistic_model = LogisticRegression()
logistic_model.fit(X=X_train_pca, y=y_train)



In [None]:
# Validation metrics for the 2-D PCA model.
y_val_pred = logistic_model.predict(X_val_pca)

accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
report = classification_report(y_true=y_val, y_pred=y_val_pred)

print(f'Validation Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report)

Validation Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       120
           1       1.00      1.00      1.00       104

    accuracy                           1.00       224
   macro avg       1.00      1.00      1.00       224
weighted avg       1.00      1.00      1.00       224



In [None]:
# Test metrics for the 2-D PCA model.
y_test_pred = logistic_model.predict(X_test_pca)

accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)
report_test = classification_report(y_true=y_test, y_pred=y_test_pred)

print(f'Test Accuracy: {accuracy_test:.2f}')
print('Test Classification Report:\n', report_test)

Test Accuracy: 1.00
Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       115
           1       1.00      1.00      1.00       110

    accuracy                           1.00       225
   macro avg       1.00      1.00      1.00       225
weighted avg       1.00      1.00      1.00       225



# TRAIN, TEST, VALIDATE - BERT + LR

In [None]:
from sklearn.model_selection import train_test_split

# Set aside 20% of the rows as a validation set that the model never sees during
# fitting. random_state fixes the shuffle so the same rows are held out on every
# run; without it the validation set and its metrics change each time.
traintest_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# df_u is another name for the train/test portion produced by the split (no copy is made).
df_u=traintest_data

In [None]:
df_u.head()

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
4040,spam,Please call our customer service representativ...,1,customer service representative FREEPHONE 0808...,"[[0.0829664, -0.027345695, 0.56222016, -0.4152...",1
1639,spam,FreeMsg:Feelin kinda lnly hope u like 2 keep m...,1,freemsg Feelin kinda lnly hope u like 2 compan...,"[[0.2064764, -0.1967398, 0.93513143, -0.148678...",1
146,spam,FreeMsg Why haven't you replied to my text? I'...,1,FreeMsg reply text Randy sexy female live loca...,"[[0.20409854, -0.2485005, 0.84947103, -0.12386...",1
54,spam,SMS. ac Sptv: The New Jersey Devils and the De...,1,SMS ac Sptv New Jersey Devils Detroit Red Wing...,"[[-0.07736311, 0.009546255, 0.3534619, 0.04193...",1
793,ham,Omg I want to scream. I weighed myself and I l...,0,Omg want scream weigh lose weight Woohoo,"[[0.22102648, -0.06298344, 1.0458688, -0.17942...",0


In [None]:
val_data.head()

Unnamed: 0,label,message,label_num,preprocessed_txt,bert_embedding,cluster
882,ham,I love to give massages. I use lots of baby oi...,0,love massage use lot baby oil fave position,"[[0.062540434, -0.008281253, 0.2742118, 0.1134...",0
1535,spam,You have won a Nokia 7250i. This is what you g...,1,win Nokia 7250i win free auction send Nokia 86...,"[[0.009557985, -0.2859278, 0.44823343, 0.13017...",1
1571,ham,Near kalainar tv office.thenampet,0,near kalainar tv office.thenampet,"[[0.06486433, -0.32131106, -0.1006705, -0.1851...",0
1220,spam,No. 1 Nokia Tone 4 ur mob every week! Just txt...,1,1 Nokia Tone 4 ur mob week txt NOK 87021 1st t...,"[[0.10063407, -0.0053970744, 0.80479735, -0.17...",1
2792,ham,Not from this campus. Are you in the library?,0,campus library,"[[0.27780497, -0.07234697, -0.45469463, 0.0647...",0


In [None]:
X = np.vstack(df_u['bert_embedding'].to_numpy())

# 50-component PCA fitted on the train/test portion only (val_data is excluded).
# random_state makes the randomized SVD (sklearn's default choice for this size)
# reproducible.
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X)

In [None]:
# 80/20 train/test split inside the train/test portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    df_u['label_num'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Default logistic regression fitted on the training partition of df_u.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Test-partition metrics: accuracy, per-class report and the confusion matrix.
pred = logreg.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=pred)
report = classification_report(y_true=y_test, y_pred=pred)
print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')

conf_matrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.9832635983263598
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       124
           1       0.99      0.97      0.98       115

    accuracy                           0.98       239
   macro avg       0.98      0.98      0.98       239
weighted avg       0.98      0.98      0.98       239

Confusion Matrix:
[[123   1]
 [  3 112]]


In [None]:
y_train.value_counts()

0    480
1    476
Name: label_num, dtype: int64

In [None]:
y_test.value_counts()

0    124
1    115
Name: label_num, dtype: int64

### Validation data: predict

In [None]:
val_data['label_num'].value_counts()

1    156
0    143
Name: label_num, dtype: int64

In [None]:
X_valid = np.vstack(val_data['bert_embedding'].to_numpy())

# Project the validation embeddings with the PCA that was already fitted on the
# train/test portion (df_u). Fitting a second PCA on the validation rows would
# produce components with different directions and signs, so its columns would
# not be the features logreg was trained on.
X_pca_valid = pca.transform(X_valid)

In [None]:
# Predict the held-out validation rows with the model fitted on the train/test portion.
pred_v=logreg.predict(X_pca_valid)

In [None]:
pred_v

array([0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0])

In [None]:
val_data['label_num']

882     0
1535    1
1571    0
1220    1
2792    0
       ..
712     1
2707    1
3296    1
4064    0
2921    0
Name: label_num, Length: 299, dtype: int64

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Metrics on the held-out validation set: accuracy, per-class report, confusion matrix.
y_valid_true = val_data['label_num']

accuracy = accuracy_score(y_true=y_valid_true, y_pred=pred_v)
report = classification_report(y_true=y_valid_true, y_pred=pred_v)
print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')

conf_matrix = confusion_matrix(y_true=y_valid_true, y_pred=pred_v)
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.9698996655518395
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       143
           1       0.98      0.96      0.97       156

    accuracy                           0.97       299
   macro avg       0.97      0.97      0.97       299
weighted avg       0.97      0.97      0.97       299

Confusion Matrix:
[[140   3]
 [  6 150]]
