In [1]:
# Importing necessary libraries
import numpy as np        # For numerical operations
import pandas as pd       # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization
%matplotlib inline

# Importing WordCloud for text visualization
from wordcloud import WordCloud

# Importing NLTK for natural language processing
import nltk
from nltk.corpus import stopwords    # For stopwords


# Downloading NLTK data
nltk.download('stopwords')   # Downloading stopwords data
nltk.download('punkt')  

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\riswa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\riswa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the CSV file
df = pd.read_csv('spam.csv')

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
df.isnull().sum()


v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [5]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [6]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [7]:
df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], inplace=True)
df.columns = ["label", "text"]
df.head()

df.label.value_counts()


label
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Preprocessing

In [10]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,MinMaxScaler

In [11]:
encoder  = LabelEncoder()
df["label"]= encoder.fit_transform(df['label'])
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
df.drop_duplicates(inplace =True,keep='first')


## Feature Engineering


In [14]:
from nltk.stem.porter import PorterStemmer
import string

ps = PorterStemmer()



In [15]:
def transform_text(text):
    text = text.lower()
    text =nltk.word_tokenize(text)
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    text =y[:]

    y.clear()
    for i in text:
        if i not in stopwords.words("english") and i  not in string.punctuation:
            y.append(i)

    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)


In [18]:
import nltk
nltk.download('all')  # This will download all NLTK data (larger download but ensures everything is available)

[nltk_data]    |   Unzipping corpora\pil.zip.
[nltk_data]    | Downloading package pl196x to
[nltk_data]    |     C:\Users\riswa\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\pl196x.zip.
[nltk_data]    | Downloading package porter_test to
[nltk_data]    |     C:\Users\riswa\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping stemmers\porter_test.zip.
[nltk_data]    | Downloading package ppattach to
[nltk_data]    |     C:\Users\riswa\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\ppattach.zip.
[nltk_data]    | Downloading package problem_reports to
[nltk_data]    |     C:\Users\riswa\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\problem_reports.zip.
[nltk_data]    | Downloading package product_reviews_1 to
[nltk_data]    |     C:\Users\riswa\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\product_reviews_1.zip.
[nltk_data]    | Downloading package product_reviews_2 to
[nltk_data]    |     C:\Users\riswa\

True

In [19]:
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,label,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english',max_df=0.7)

X = tfidf.fit_transform(df['transformed_text'])

y = df['label']




In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 2)

In [26]:
X_train.shape
X_test.shape
y_train.shape
y_test.shape




(1034,)

# MODEL TRAINING

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score


In [28]:
svc = SVC(kernel= "sigmoid", gamma  = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2 )
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state = 2)
gbdt = GradientBoostingClassifier(n_estimators = 50, random_state = 2)    
xgb  = XGBClassifier(n_estimators = 50, random_state = 2)


clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
    
}

# Model Training


In [29]:
for name, clf in clfs.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Accuracy of {name}: {accuracy_score(y_test, y_pred)}")
    print(f"Precision of {name}: {precision_score(y_test, y_pred)}")



Accuracy of SVC: 0.9729206963249516
Precision of SVC: 0.9583333333333334
Accuracy of KNN: 0.8984526112185687
Precision of KNN: 1.0
Accuracy of NB: 0.9613152804642167
Precision of NB: 1.0
Accuracy of DT: 0.9303675048355899
Precision of DT: 0.84375
Accuracy of LR: 0.9506769825918762
Precision of LR: 0.9393939393939394
Accuracy of RF: 0.9748549323017408
Precision of RF: 0.9912280701754386




Accuracy of Adaboost: 0.9535783365570599
Precision of Adaboost: 0.8947368421052632
Accuracy of Bgc: 0.965183752417795
Precision of Bgc: 0.9180327868852459
Accuracy of ETC: 0.9787234042553191
Precision of ETC: 0.9833333333333333
Accuracy of GBDT: 0.9429400386847195
Precision of GBDT: 0.9647058823529412
Accuracy of xgb: 0.9709864603481625
Precision of xgb: 0.9576271186440678
