### ***Import required libraries***

In [3]:
# Install dependencies
%pip install numpy pandas matplotlib nltk wordcloud scikit-learn xgboost

Collecting wordcloud
  Downloading wordcloud-1.9.5-cp312-cp312-win_amd64.whl.metadata (3.5 kB)
Downloading wordcloud-1.9.5-cp312-cp312-win_amd64.whl (307 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.5
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
# Import required libraries

import numpy as np                # For numerical operations
import pandas as pd               # For data manipulation and analysis
import matplotlib.pyplot as plt   # For data visualization
%matplotlib inline

import nltk                       # For natural language processing
from nltk.corpus import stopwords # For stopwords
from wordcloud import WordCloud   # For text visualization

# Downloading NLTK data
nltk.download('stopwords')        # Downloading stopwords data
nltk.download('punkt')            # Downloading tokenizer data
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\spand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\spand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

### ***Data Exploration***

In [5]:
# Read the CSV file
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Basic exploration
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
# Dropping unnecessary columns
df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Renaming columns
df.rename(columns = {'v1': 'target', 'v2': 'text'}, inplace = True)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### ***Data Preprocessing***

In [9]:
# Label encode target column
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Check for duplicate values
print(f'No. of duplicate rows in dataset: {df.duplicated().sum()}')
print(f'Total No. of rows in dataset: {len(df)}')

No. of duplicate rows in dataset: 403
Total No. of rows in dataset: 5572


In [11]:
# Remove duplicates
df = df.drop_duplicates(keep = 'first')
len(df)

5169

### ***Feature Engineering***

In [12]:
from nltk.stem.porter import PorterStemmer # For text stemming
import string # For handling special characters

# Creating an instance of the Porter Stemmer
ps = PorterStemmer()

In [13]:
# Lowercase transformation and text preprocessing function

def transform_text(text):

    text = text.lower() # Transform the text to lowercase
    text = nltk.word_tokenize(text) # Tokenization using NLTK
    
    # Removing special characters
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
            
    # Removing stop words and punctuation
    text = y[:]
    y.clear()
    
    # Loop through the tokens and remove stopwords and punctuation
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
        
    # Stemming using Porter Stemmer
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    
    # Join the processed tokens back into a single string
    return " ".join(y)

In [16]:
transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [17]:
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfid = TfidfVectorizer(max_features = 500)
X = tfid.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

In [19]:
# Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test , y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 2)

### ***Model Training***

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [21]:
svc = SVC(kernel= "sigmoid", gamma  = 1.0)
knn = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2 )
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state = 2)
gbc = GradientBoostingClassifier(n_estimators = 50, random_state = 2)    
xgb  = XGBClassifier(n_estimators = 50, random_state = 2)

In [22]:
clfs = {
    'SVC': svc,
    'KNN': knn,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BGC': bc,
    'ETC': etc,
    'GBC': gbc,
    'XGB': xgb
}

### ***Model Evaluation***

In [23]:
from sklearn.metrics import accuracy_score, precision_score

def train_classifier(clfs, X_train, y_train, X_test, y_test):
    clfs.fit(X_train,y_train)
    y_pred = clfs.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy , precision

In [24]:
accuracy_scores = []
precision_scores = []

for name , clfs in clfs.items():
    current_accuracy, current_precision = train_classifier(clfs, X_train, y_train, X_test, y_test)
    print()
    print("For:", name)
    print("Accuracy:", current_accuracy)
    print("Precision:", current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For: SVC
Accuracy: 0.9661508704061895
Precision: 0.9327731092436975

For: KNN
Accuracy: 0.9274661508704062
Precision: 1.0

For: NB
Accuracy: 0.9709864603481625
Precision: 0.9655172413793104

For: DT
Accuracy: 0.937137330754352
Precision: 0.9010989010989011

For: LR
Accuracy: 0.9632495164410058
Precision: 0.9629629629629629

For: RF
Accuracy: 0.971953578336557
Precision: 0.943089430894309

For: AdaBoost
Accuracy: 0.9235976789168279
Precision: 0.8734177215189873

For: BGC
Accuracy: 0.965183752417795
Precision: 0.9180327868852459

For: ETC
Accuracy: 0.9729206963249516
Precision: 0.9296875

For: GBC
Accuracy: 0.9506769825918762
Precision: 0.9393939393939394

For: XGB
Accuracy: 0.9700193423597679
Precision: 0.9572649572649573
