# BALANCING DATA

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
from sklearn.metrics import silhouette_score


In [2]:
df=pd.read_csv('/content/spam.csv',encoding='latin-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
df = df.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
df.columns = ["label", "message"]
df['label_num']=df['label'].map({
    'ham':0,
    'spam':1
})
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:

df_spam = df[df['label']=='spam']

df_ham = df[df['label']=='ham']

print("Ham Dataset Shape:", df_ham.shape)

print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 3)
Spam Dataset Shape: (747, 3)


In [5]:
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 3)

In [6]:
df= pd.concat([df_spam , df_ham_downsampled])

In [7]:
df['label_num'].value_counts()

1    747
0    747
Name: label_num, dtype: int64

In [8]:
import torch
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
def get_bert_embedding(text):
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        output = model(**tokens.to(device))
    embeddings = output.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings


In [11]:
df['bert_embedding'] = df['message'].apply(get_bert_embedding)


In [12]:
X = np.vstack(df['bert_embedding'].to_numpy())

In [13]:
X

array([[ 0.07410222, -0.06461929,  0.81856906, ..., -0.1393411 ,
         0.02458197,  0.22032973],
       [ 0.36046693,  0.02393169,  0.8340686 , ..., -0.20053485,
         0.3949341 ,  0.15713932],
       [-0.08033293, -0.01641425,  0.562275  , ..., -0.20568158,
         0.13814169,  0.17836627],
       ...,
       [ 0.37299603, -0.2470761 ,  0.39728785, ...,  0.14180765,
         0.05916128, -0.01134313],
       [-0.1847709 , -0.2545435 ,  0.69109887, ..., -0.28668204,
         0.01519782,  0.23987916],
       [ 0.15750305, -0.01906057,  0.5420365 , ..., -0.13023762,
         0.2189064 ,  0.11101371]], dtype=float32)

In [14]:
len(X[0])

768

In [15]:
len(X)

1494

In [16]:
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(X)

In [17]:
reduced_embeddings

array([[-2.317465  ,  0.9889265 ],
       [ 0.23411527, -0.23039302],
       [-2.4252098 , -0.67370033],
       ...,
       [ 2.091303  ,  2.450655  ],
       [-0.4065703 ,  3.592796  ],
       [ 2.947957  ,  0.44480765]], dtype=float32)

In [18]:
kmeans = KMeans(n_clusters=2, init='k-means++', random_state=42)
kmeans.fit(reduced_embeddings)



In [19]:
cluster_labels = kmeans.labels_

In [20]:
cluster_labels

array([1, 0, 1, ..., 0, 0, 0], dtype=int32)

In [21]:
silhouette_avg = silhouette_score(reduced_embeddings, cluster_labels)

In [22]:
print(f"Silhouette Score: {silhouette_avg}")

Silhouette Score: 0.5037923455238342


In [23]:
df['cluster']=kmeans.labels_

In [24]:
df

Unnamed: 0,label,message,label_num,bert_embedding,cluster
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,"[[0.074102215, -0.06461929, 0.81856906, -0.223...",1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,"[[0.36046693, 0.02393169, 0.8340686, -0.197409...",0
8,spam,WINNER!! As a valued network customer you have...,1,"[[-0.08033293, -0.01641425, 0.562275, -0.24865...",1
9,spam,Had your mobile 11 months or more? U R entitle...,1,"[[0.29597822, -0.10529756, 0.8708993, -0.13898...",1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"[[-0.012413757, -0.021395132, 0.826615, -0.055...",1
...,...,...,...,...,...
648,ham,No prob. I will send to your email.,0,"[[0.26408538, -0.31273556, 0.47010052, -0.0567...",0
1826,ham,Dude. What's up. How Teresa. Hope you have bee...,0,"[[0.115350164, 0.018907359, 0.580321, 0.016629...",0
5469,ham,Ok lor.,0,"[[0.37299603, -0.2470761, 0.39728785, -0.26854...",0
2448,ham,Do u knw dis no. &lt;#&gt; ?,0,"[[-0.1847709, -0.2545435, 0.69109887, -0.08826...",0


In [25]:
reduced_embeddings.shape

(1494, 2)

In [26]:
df['cluster'].shape

(1494,)

In [27]:
from sklearn.metrics import accuracy_score,classification_report

accuracy=accuracy_score(df['label_num'],df['cluster'])
report=classification_report(df['label_num'], df['cluster'])

print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')


Accuracy: 0.9544846050870147
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       747
           1       0.95      0.95      0.95       747

    accuracy                           0.95      1494
   macro avg       0.95      0.95      0.95      1494
weighted avg       0.95      0.95      0.95      1494



# BERT EMBEDDING+SUPERVISED LEARNING -Train,test,validate


In [28]:
df.head()

Unnamed: 0,label,message,label_num,bert_embedding,cluster
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,"[[0.074102215, -0.06461929, 0.81856906, -0.223...",1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,"[[0.36046693, 0.02393169, 0.8340686, -0.197409...",0
8,spam,WINNER!! As a valued network customer you have...,1,"[[-0.08033293, -0.01641425, 0.562275, -0.24865...",1
9,spam,Had your mobile 11 months or more? U R entitle...,1,"[[0.29597822, -0.10529756, 0.8708993, -0.13898...",1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"[[-0.012413757, -0.021395132, 0.826615, -0.055...",1


In [29]:
X = np.vstack(df['bert_embedding'].to_numpy())

pca = PCA(n_components=50)
X_pca = pca.fit_transform(X)

In [30]:
X_pca

array([[-2.3174636 ,  0.9889264 , -0.3576724 , ..., -0.23905179,
        -0.28392455,  0.48049107],
       [ 0.23411515, -0.23039426, -2.3693254 , ...,  0.51146895,
        -0.47689983, -0.1230363 ],
       [-2.4252098 , -0.6737044 ,  0.5885692 , ..., -0.06712554,
        -0.26465195,  0.27471632],
       ...,
       [ 2.0913029 ,  2.450657  ,  1.2732702 , ...,  0.44435546,
        -0.55690604, -0.10965492],
       [-0.40657017,  3.592782  , -0.12583433, ...,  0.12105826,
         0.20087127, -0.06188155],
       [ 2.9479582 ,  0.44480523, -0.6588984 , ...,  0.10881553,
        -0.1518886 ,  0.03577056]], dtype=float32)

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, df['label_num'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [34]:
y_val_pred = logreg.predict(X_val)
y_test_pred = logreg.predict(X_test)


In [35]:
y_train_pred=logreg.predict(X_train)

In [36]:
accuracy_val = logreg.score(X_val, y_val)
accuracy_test = logreg.score(X_test, y_test)


In [37]:
accuracy_train=logreg.score(X_train, y_train)

In [38]:
print("Validation Accuracy:", accuracy_val)
print("Test Accuracy:", accuracy_test)


Validation Accuracy: 0.9832635983263598
Test Accuracy: 0.9698996655518395


In [39]:
print("Train Accuracy:", accuracy_train)

Train Accuracy: 0.9843096234309623


In [40]:
from sklearn.metrics import accuracy_score,classification_report

report_val=classification_report(y_val_pred,y_val)
print(report_val)

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       121
           1       0.97      0.99      0.98       118

    accuracy                           0.98       239
   macro avg       0.98      0.98      0.98       239
weighted avg       0.98      0.98      0.98       239



In [41]:
report_val_c=classification_report(y_test,y_test_pred)
print(report_val_c)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       154
           1       0.97      0.97      0.97       145

    accuracy                           0.97       299
   macro avg       0.97      0.97      0.97       299
weighted avg       0.97      0.97      0.97       299



In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, df['cluster'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [43]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [44]:
y_val_pred = logreg.predict(X_val)
y_test_pred = logreg.predict(X_test)

In [45]:
accuracy_val = logreg.score(X_val, y_val)
accuracy_test = logreg.score(X_test, y_test)

In [46]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_val)
report_val=classification_report(y_val_pred,y_val)
print(report_val)

0.9916317991631799
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       119
           1       0.99      0.99      0.99       120

    accuracy                           0.99       239
   macro avg       0.99      0.99      0.99       239
weighted avg       0.99      0.99      0.99       239



In [47]:
report_val_c=classification_report(y_test,y_test_pred)
print(accuracy_test)
print(report_val_c)

0.9966555183946488
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       157
           1       0.99      1.00      1.00       142

    accuracy                           1.00       299
   macro avg       1.00      1.00      1.00       299
weighted avg       1.00      1.00      1.00       299



In [48]:

X = np.array(df['bert_embedding'].tolist())

In [49]:
y = np.array(df['cluster'])

In [50]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [51]:
X_train.shape

(1045, 1, 768)

In [52]:
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
X_val_reshaped = X_val.reshape(X_val.shape[0], -1)
X_test_reshaped = X_test.reshape(X_test.shape[0], -1)

In [53]:
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_reshaped)
X_val_pca = pca.transform(X_val_reshaped)
X_test_pca = pca.transform(X_test_reshaped)


In [54]:

logistic_model = LogisticRegression()
logistic_model.fit(X_train_pca, y_train)



In [55]:
y_val_pred = logistic_model.predict(X_val_pca)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.2f}')

report = classification_report(y_val, y_val_pred)
print('Classification Report:\n', report)

Validation Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       117
           1       1.00      1.00      1.00       107

    accuracy                           1.00       224
   macro avg       1.00      1.00      1.00       224
weighted avg       1.00      1.00      1.00       224



In [56]:
y_test_pred = logistic_model.predict(X_test_pca)

accuracy_test = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {accuracy_test:.2f}')

report_test = classification_report(y_test, y_test_pred)
print('Test Classification Report:\n', report_test)

Test Accuracy: 1.00
Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       119
           1       1.00      1.00      1.00       106

    accuracy                           1.00       225
   macro avg       1.00      1.00      1.00       225
weighted avg       1.00      1.00      1.00       225



#TRAIN,TEST VALIDATE-BERT+LR

In [57]:
from sklearn.model_selection import train_test_split
traintest_data, val_data = train_test_split(df,test_size=0.2)

In [58]:
df_u=traintest_data

In [59]:
df_u.head()

Unnamed: 0,label,message,label_num,bert_embedding,cluster
1738,ham,K go and sleep well. Take rest:-).,0,"[[0.28612158, -0.12810539, 0.61261386, -0.2512...",0
2419,spam,SMS SERVICES For your inclusive text credits p...,1,"[[0.16590594, -0.04098712, 0.61584187, -0.1147...",1
992,ham,Up to Ì_... ÌÏ wan come then come lor... But i...,0,"[[0.060723748, -0.18597502, 0.7858233, -0.2350...",0
3904,spam,Do you want a new video handset? 750 anytime a...,1,"[[0.30257562, -0.3803488, 0.74159616, -0.06929...",1
81,ham,K. Did you call me just now ah?,0,"[[0.0033380538, -0.20321369, 0.17493819, -0.19...",0


In [60]:
val_data.head()

Unnamed: 0,label,message,label_num,bert_embedding,cluster
289,ham,"Dear,shall mail tonite.busy in the street,shal...",0,"[[0.14669141, -0.13386518, 0.38457966, -0.4188...",0
4894,spam,Want the latest Video handset? 750 anytime any...,1,"[[0.23846859, -0.38170058, 0.81724364, -0.0716...",1
1063,spam,We have new local dates in your area - Lots of...,1,"[[0.270445, -0.05541379, 0.71303535, -0.001307...",1
56,spam,Congrats! 1 year special cinema pass for 2 is ...,1,"[[0.28544742, -0.0642987, 0.8683037, -0.062105...",1
5066,spam,83039 62735=å£450 UK Break AccommodationVouche...,1,"[[0.08354508, 0.11704355, 0.72321796, -0.44884...",1


In [61]:
X = np.vstack(df_u['bert_embedding'].to_numpy())

pca = PCA(n_components=50)
X_pca = pca.fit_transform(X)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, df_u['label_num'], test_size=0.2, random_state=42)

In [63]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [64]:
pred=logreg.predict(X_test)

accuracy=accuracy_score(y_test,pred)
report=classification_report(y_test,pred)

print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

conf_matrix = confusion_matrix(y_test, pred)

print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.9748953974895398
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       119
           1       0.99      0.96      0.97       120

    accuracy                           0.97       239
   macro avg       0.98      0.97      0.97       239
weighted avg       0.98      0.97      0.97       239

Confusion Matrix:
[[118   1]
 [  5 115]]


In [65]:
y_train.value_counts()

0    480
1    476
Name: label_num, dtype: int64

In [66]:
y_test.value_counts()

1    120
0    119
Name: label_num, dtype: int64

Validation_Data-Predict

In [67]:
val_data['label_num'].value_counts()

1    151
0    148
Name: label_num, dtype: int64

In [68]:
X_valid = np.vstack(val_data['bert_embedding'].to_numpy())

pca = PCA(n_components=50)
X_pca_valid = pca.fit_transform(X_valid)

In [69]:
pred_v=logreg.predict(X_pca_valid)

In [70]:
pred_v

array([0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1])

In [71]:
val_data['label_num']

289     0
4894    1
1063    1
56      1
5066    1
       ..
4487    0
2866    0
3293    0
1462    1
2707    1
Name: label_num, Length: 299, dtype: int64

In [72]:
accuracy=accuracy_score(val_data['label_num'],pred_v)
report=classification_report(val_data['label_num'],pred_v)

print(f'Accuracy: {accuracy}\nClassification Report:\n{report}')

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

conf_matrix = confusion_matrix(val_data['label_num'],pred_v)

print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.9565217391304348
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       148
           1       0.94      0.97      0.96       151

    accuracy                           0.96       299
   macro avg       0.96      0.96      0.96       299
weighted avg       0.96      0.96      0.96       299

Confusion Matrix:
[[139   9]
 [  4 147]]
