In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk

In [None]:
df = pd.read_csv('/content/spam_sms_dataset.csv')
df.head()

Unnamed: 0,LABEL,TEXT,URL,EMAIL,PHONE
0,ham,Your opinion about me? 1. Over 2. Jada 3. Kusr...,No,No,No
1,ham,What's up? Do you want me to come online? If y...,No,No,No
2,ham,So u workin overtime nigpun?,No,No,No
3,ham,"Also sir, i sent you an email about how to log...",No,No,No
4,Smishing,Please Stay At Home. To encourage the notion o...,No,No,No


In [None]:
df["LABEL"] = df["LABEL"].str.lower()

In [None]:
df["LABEL"].unique()

array(['ham', 'smishing', 'spam'], dtype=object)

In [None]:
#converting columns from text to numbers using label encoder

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["LABEL"] = le.fit_transform(df["LABEL"])

In [None]:
df["LABEL"].unique()

array([0, 1, 2])

In [None]:
df["PHONE"] = df["PHONE"].replace({"yes":1,"No":0})
df["PHONE"].unique()

array([0, 1])

In [None]:
df["EMAIL"] = df["EMAIL"].replace({"yes":1,"No":0})
df["EMAIL"].unique()

array([0, 1])

In [None]:
df["URL"] = df["URL"].replace({"yes":1,"No":0})
df["URL"].unique()

array([0, 1])

In [None]:
df.sample(n=5)

Unnamed: 0,LABEL,TEXT,URL,EMAIL,PHONE
831,0,Can't. I feel nauseous. I'm so pissed. I didn'...,0,0,0
915,0,If you were/are free i can give. Otherwise nal...,0,0,0
5907,0,"Easy mate, * guess the quick drink was bit amb...",0,0,0
1998,0,Ok try to do week end course in coimbatore.,0,0,0
4810,0,thanks for the temales it was wonderful. Thank...,0,0,0


In [None]:
#lowering the text
df["TEXT"] = df["TEXT"].str.lower()

In [None]:
import re

In [None]:
def remove_specific_numbers(text):
    # Regular expression to match 4- to 12-digit numbers
    number_pattern = re.compile(r'\b\d{4,12}\b')
    return number_pattern.sub(r'', text)


In [None]:
df["TEXT"] = df["TEXT"].apply(remove_specific_numbers)

In [None]:
def remove_specific_emails(text):
  email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@(gmail\.com|hotmail\.com|kiefer\.com|emc\.co\.uk|.co\.uk)\b')
  return email_pattern.sub(r'', text)

In [None]:
df['TEXT'] = df['TEXT'].apply(remove_specific_emails)

In [None]:
df["HTTP"] =0
df["HTTPS"]=0
df["BIT"]=0
df["WWW"]=0

In [None]:
df.sample(n=5)

Unnamed: 0,LABEL,TEXT,URL,EMAIL,PHONE,HTTP,HTTPS,BIT,WWW
4559,0,i am in revage theatre now. . going to watch k...,0,0,0,0,0,0,0
4870,0,finished class where are you.,0,0,0,0,0,0,0
680,1,urgent! we are trying to contact u. todays dra...,0,0,1,0,0,0,0
279,0,k actually can you guys meet me at the sunoco ...,0,0,0,0,0,0,0
4663,2,text pass to to collect your polyphonic ringt...,0,0,1,0,0,0,0


In [None]:
def update_link_flags(row):
    http_pattern = re.compile(r'http://\S+')
    https_pattern = re.compile(r'https://\S+')
    bit_pattern = re.compile(r'bit\.\S+')
    www_pattern = re.compile(r'www\.\S+')

    # Check and update the corresponding columns
    if http_pattern.search(row['TEXT']):
        row['HTTP'] = 1
    if https_pattern.search(row['TEXT']):
        row['HTTPS'] = 1
    if bit_pattern.search(row['TEXT']):
        row['BIT'] = 1
    if www_pattern.search(row['TEXT']):
        row['WWW'] = 1

    return row


In [None]:
df = df.apply(update_link_flags,axis =1)

In [None]:
df[df["HTTP"]==1]["TEXT"]

5       bankofamerica alert . please follow http://bit...
48      apple id: [buxcx7gbvwwccod final notification ...
131     pl: battle royale! gilchrist vs warne today at...
284     dear voucher holder, to claim this weeks offer...
332     you are now unsubscribed all services. get ton...
                              ...                        
5250    urgent message for cibc account holder, kindly...
5421    \ta link to your picture has been sent. you ca...
5453    message important information for o2 user. tod...
5535    \tmessage important information for o2 user. t...
5643    mobile office & mms settings have been success...
Name: TEXT, Length: 73, dtype: object

In [None]:
df[df["BIT"]==1]["TEXT"]

5       bankofamerica alert . please follow http://bit...
378     your tax refund of �349.14 is ready to be clai...
976     englishvocabulary ululate: to mourn loudly and...
2400    reading gud habit.. nan bari hudgi yorge patai...
3910    in the open lucky draw contest and you are the...
5174    me hungry buy some food good lei... but mum n ...
Name: TEXT, dtype: object

In [None]:
len(df[df["HTTP"]==1])

73

In [None]:
import string
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer
lm =  WordNetLemmatizer()

In [None]:
def process_text(text):
    text = nltk.word_tokenize(text)
    #removing special charcters
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    text = y[:]
    y.clear()
  #removing stop words
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    text = y[:]
    y.clear()
    #lemmatization
    for i in text:
        y.append(lm.lemmatize(i))


    return " ".join(y)

In [None]:
df["text_processed"] = df["TEXT"].apply(process_text)

In [None]:
df.head()

Unnamed: 0,LABEL,TEXT,URL,EMAIL,PHONE,HTTP,HTTPS,BIT,WWW,text_processed
0,0,your opinion about me? 1. over 2. jada 3. kusr...,0,0,0,0,0,0,0,opinion jada kusruthi lovable silent spl chara...
1,0,what's up? do you want me to come online? if y...,0,0,0,0,0,0,0,want come online free talk
2,0,so u workin overtime nigpun?,0,0,0,0,0,0,0,u workin overtime nigpun
3,0,"also sir, i sent you an email about how to log...",0,0,0,0,0,0,0,also sir sent email log usc payment portal sen...
4,1,please stay at home. to encourage the notion o...,0,0,0,0,0,0,0,please stay home encourage notion staying home...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
tf = TfidfVectorizer(max_features = 3000)

In [None]:
vec = tf.fit_transform(df["text_processed"]).toarray()

In [None]:
y = df["LABEL"].values

In [None]:
vec = pd.DataFrame(vec)

In [None]:
X = pd.concat([df,vec],axis =1)

In [None]:
X.drop(["TEXT","text_processed","LABEL"],axis=1,inplace = True)

In [None]:
X.head()

Unnamed: 0,URL,EMAIL,PHONE,HTTP,HTTPS,BIT,WWW,0,1,2,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
X= X.rename(columns={"URL":3000,"EMAIL":3001,"PHONE":3002,"HTTP":3003,"HTTPS":3004,"BIT":3005,"WWW":3006})

In [None]:
X.head()

Unnamed: 0,3000,3001,3002,3003,3004,3005,3006,0,1,2,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [None]:
pip install imbalanced-learn scikit-learn




In [None]:
from imblearn.combine import SMOTEENN

In [None]:
smoteen = SMOTEENN(random_state=42)


In [None]:
X_resampled, y_resampled = smoteen.fit_resample(X_train, y_train)


In [None]:
X_resampled.head()

Unnamed: 0,3000,3001,3002,3003,3004,3005,3006,0,1,2,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.635162,0.0


In [None]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB

In [None]:
gb = GaussianNB()
mb = MultinomialNB()
bb = BernoulliNB()

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_resampled,y_resampled)
y_pred = lr.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9740585774058578
[[963   0   1]
 [  2 118  12]
 [  8   8  83]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.94      0.89      0.91       132
           2       0.86      0.84      0.85        99

    accuracy                           0.97      1195
   macro avg       0.93      0.91      0.92      1195
weighted avg       0.97      0.97      0.97      1195



In [None]:
gb.fit(X_resampled,y_resampled)
y_pred = lr.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9740585774058578
[[963   0   1]
 [  2 118  12]
 [  8   8  83]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.94      0.89      0.91       132
           2       0.86      0.84      0.85        99

    accuracy                           0.97      1195
   macro avg       0.93      0.91      0.92      1195
weighted avg       0.97      0.97      0.97      1195



In [None]:
mb.fit(X_resampled,y_resampled)
y_pred = lr.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9740585774058578
[[963   0   1]
 [  2 118  12]
 [  8   8  83]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.94      0.89      0.91       132
           2       0.86      0.84      0.85        99

    accuracy                           0.97      1195
   macro avg       0.93      0.91      0.92      1195
weighted avg       0.97      0.97      0.97      1195



In [None]:
bb.fit(X_resampled,y_resampled)
y_pred = lr.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9740585774058578
[[963   0   1]
 [  2 118  12]
 [  8   8  83]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.94      0.89      0.91       132
           2       0.86      0.84      0.85        99

    accuracy                           0.97      1195
   macro avg       0.93      0.91      0.92      1195
weighted avg       0.97      0.97      0.97      1195



In [None]:
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier()

In [None]:
et.fit(X_resampled,y_resampled)
y_pred = lr.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9740585774058578
[[963   0   1]
 [  2 118  12]
 [  8   8  83]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.94      0.89      0.91       132
           2       0.86      0.84      0.85        99

    accuracy                           0.97      1195
   macro avg       0.93      0.91      0.92      1195
weighted avg       0.97      0.97      0.97      1195

