In [36]:
import re
import pandas
import numpy
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [63]:
Data_Path = r'Data\spam.csv'

In [64]:

dataframe = pandas.read_csv(Data_Path,encoding='latin1')
drop_column_name = ['Unnamed: 2','Unnamed: 3','Unnamed: 4']
dataframe = dataframe.drop(drop_column_name, axis=1)
Category = dataframe['v1'].unique()
dataframe.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
dataframe.rename(columns={'v1': 'Target', 'v2':'Text'}, inplace=True)
dataframe.head()

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## **Text Processing**

In [12]:
LE = LabelEncoder()
dataframe['Target_LE'] = LE.fit_transform(dataframe['Target'])
dataframe.head()


Unnamed: 0,Target,Text,Target_LE
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [13]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove mentions (@username)
    text = re.sub(r"@\w+", '', text)
    # Remove hashtags but keep the text
    text = re.sub(r"#(\w+)", r'\1', text)
    # Remove special characters and numbers
    text = re.sub(r"[^A-Za-z\s]", '', text)
    # Remove extra spaces
    text = re.sub(r"\s+", ' ', text).strip()
    return text.lower()

In [15]:
dataframe['Cleaned_Text'] = dataframe['Text'].apply(lambda x : clean_text(x))

In [16]:
dataframe['Cleaned_Text'].head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in a wkly comp to win fa cup final ...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: Cleaned_Text, dtype: object

In [19]:
TfidfV = TfidfVectorizer(stop_words='english')
Vectorize = TfidfV.fit_transform(dataframe['Cleaned_Text']).toarray()

In [61]:
pickle.dump(TfidfV, open('Spam_SMS_Tfidf.pkl', 'wb'))

In [20]:
X = Vectorize
y = dataframe['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=101)

In [21]:
X_train.shape, X_test.shape

((4457, 8179), (1115, 8179))

In [24]:
# KNeighborsClassifier()

KNA = KNeighborsClassifier()
KNA.fit(X_train,y_train)
Prediction = KNA.predict(X_test)
KNA_Accuracy = accuracy_score(y_true=y_test, y_pred=Prediction)

print(f'KNA Accuracy : {KNA_Accuracy*100}') 

KNA Accuracy : 91.83856502242152


In [39]:
Classifiers = [KNeighborsClassifier(), LogisticRegressionCV(), LogisticRegression(), RandomForestClassifier(), MultinomialNB()]

Classifiers_Name = ['KNeighborsClassifier', 'LogisticRegressionCV', 'LogisticRegression', 'RandomForestClassifier', 'MultinomialNB']
Accuracy_list = list()
    

In [43]:
i = 0
for classifier in Classifiers:
    Fitting = classifier.fit(X_train, y_train)
    class_prediction = Fitting.predict(X_test)
    Accuracy = accuracy_score(y_test, class_prediction)
    Accuracy_list.append(Accuracy)
    pickle.dump(Fitting, open(Classifiers_Name[i]+'.pkl', 'wb'))
    i = i+1

In [44]:
Classifiers_Name

['KNeighborsClassifier',
 'LogisticRegressionCV',
 'LogisticRegression',
 'RandomForestClassifier',
 'MultinomialNB']

In [45]:
Accuracy_list

[0.9183856502242153,
 0.9183856502242153,
 0.9802690582959641,
 0.9551569506726457,
 0.9704035874439462,
 0.968609865470852]

In [59]:

for i in range(len(Accuracy_list)-1):
    print(f'{Classifiers_Name[i]} Accuracy :: {Accuracy_list[i]}')


KNeighborsClassifier Accuracy :: 0.9183856502242153
LogisticRegressionCV Accuracy :: 0.9183856502242153
LogisticRegression Accuracy :: 0.9802690582959641
RandomForestClassifier Accuracy :: 0.9551569506726457
MultinomialNB Accuracy :: 0.9704035874439462
