# Basic Imports

In [45]:
# Importing necessary libraries
import numpy as np        # For numerical operations
import pandas as pd       # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization
%matplotlib inline

# Importing WordCloud for text visualization
from wordcloud import WordCloud

# Importing NLTK for natural language processing
import nltk
from nltk.corpus import stopwords    # For stopwords


# Downloading NLTK data
nltk.download('stopwords')   # Downloading stopwords data(Contains common words (like "the", "is") that are typically removed to improve text analysis.)
nltk.download('punkt')       # Downloading tokenizer data
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [46]:
# Read the CSV_File
df = pd.read_csv("spam.csv")

# Display the first few rows of DataFrame
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [47]:
# Droping unnecessary columns
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [48]:
# Rename the columns name
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df['target'] = df['target'].map({'ham': 0, 'spam': 1})  # Converts 'ham' to 0 and 'spam' to 1
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
# Detecting the Duplicate
df.duplicated().sum()

# As we can see there are 403 duplicate row in our data set

np.int64(403)

In [50]:
# Calculating total no. of row in or dataset
len(df)

5572

In [51]:
# Removing Duplicates
df.drop_duplicates(keep='first',inplace=True)
len(df)

5169

# Feature Engineering


In [52]:
# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

# Importing the string module for handling special characters
import string

# Creating an instance of the Porter Stemmer(The Porter Stemmer is a stemming algorithm used to reduce words to their root form by removing common suffixes. It follows a set of linguistic rules to simplify words while preserving their meaning.Example:"running" → "run","happily" → "happi")
ps = PorterStemmer()

In [53]:
# Lower case transformation and text preprocessing
def transform_text(text):
    
    # Transform the text to lowercase
    text = text.lower()

    # Tokenizing using NLTK(A Tokenizer splits text into smaller units called tokens (words, sentences, or subwords).Example: "Hello, World!" → ["Hello", ",", "World", "!"])
    text = nltk.word_tokenize(text)

    # Removing Special Character
    y = []
    for i in text:
        if i.isalnum(): #The isalnum() function in Python checks whether a string consists only of alphanumeric characters (letters and numbers) without spaces or special symbols.
            y.append(i)
    
    # Removing stop words and punctuation
    text = y[:] # Creates a copy of y
    y.clear()

    for i in text: 
        if i is not stopwords.words('english') and string.punctuation:
            y.append(i)
    
    # Stemming using Porter Stemmer
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    
    # Joining the procced tokens back into single string
    return " ".join(y) # The join() function in Python concatenates elements of an iterable (like a list) into a single string, using a specified separator.

In [54]:
transform_text( "Hey Bro! what the hell are you doing.")

'hey bro what the hell are you do'

In [55]:
# Transforming text column data
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazi avail onli in bugi...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri in 2 a wkli comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so earli hor u c alreadi then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i do think he goe to usf he live around he...


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(max_features=500)
'''1. What is TF-IDF?
TF-IDF (Term Frequency-Inverse Document Frequency) is a method used in Natural Language Processing (NLP) to convert text data into numerical values based on the importance of words in a document.

Term Frequency (TF): Measures how often a word appears in a document.
𝑇𝐹 = Number of times the word appears in the document/Total words in the document                        
 
Inverse Document Frequency (IDF): Measures how important a word is across multiple documents.
𝐼𝐷𝐹 = log(Total number of documents/Number of documents containing the word)

TF-IDF Score:
TF-IDF = TFxIDF

High TF-IDF Score → Word is important but rare.
Low TF-IDF Score → Word is common or not important.
2. What Does TfidfVectorizer Do?
Converts text into numerical vectors.
Assigns weights to words using TF-IDF scores.
Ignores stopwords (e.g., "the", "is", "and") unless specified otherwise

3. Meaning of max_features=500
Limits the number of words considered to the top 500 most important words based on their TF-IDF scores.
Helps reduce dimensionality and improve model efficiency.
'''

'1. What is TF-IDF?\nTF-IDF (Term Frequency-Inverse Document Frequency) is a method used in Natural Language Processing (NLP) to convert text data into numerical values based on the importance of words in a document.\n\nTerm Frequency (TF): Measures how often a word appears in a document.\n𝑇𝐹 = Number of times the word appears in the document/Total words in the document                        \n\nInverse Document Frequency (IDF): Measures how important a word is across multiple documents.\n𝐼𝐷𝐹 = log(Total number of documents/Number of documents containing the word)\n\nTF-IDF Score:\nTF-IDF = TFxIDF\n\nHigh TF-IDF Score → Word is important but rare.\nLow TF-IDF Score → Word is common or not important.\n2. What Does TfidfVectorizer Do?\nConverts text into numerical vectors.\nAssigns weights to words using TF-IDF scores.\nIgnores stopwords (e.g., "the", "is", "and") unless specified otherwise\n\n3. Meaning of max_features=500\nLimits the number of words considered to the top 500 most impo

In [57]:
# Defining Dependent and Independent variables
X = tfid.fit_transform(df['transformed_text']).toarray()
Y = df['target'].values

# Train-Test-Split


In [58]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20,random_state=2)

# Model-Training

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [60]:
svc = SVC(kernel= "sigmoid", gamma  = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2 )
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state = 2)
gbdt = GradientBoostingClassifier(n_estimators = 50, random_state = 2)    
xgb  = XGBClassifier(n_estimators = 50, random_state = 2)

In [61]:
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
    }

# Model-Evaluation

In [62]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    clfs.fit(X_train,y_train)
    y_pred = clfs.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy , precision

In [63]:
accuracy_scores = []
precision_scores = []
for (name,clf) in clfs.items(): # clfs.items(): Retrieves all key-value pairs from the clfs dictionary, where each key is a classifier's name (e.g., 'SVC', 'KNN') and each value is the corresponding classifier object.

    current_accuracy, current_precision = train_classifier(clf, X_train, Y_train, X_test, Y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.971953578336557
Precision:  0.9658119658119658

For:  KNN
Accuracy:  0.9274661508704062
Precision:  1.0

For:  NB
Accuracy:  0.9680851063829787
Precision:  0.9646017699115044

For:  DT
Accuracy:  0.9390715667311412
Precision:  0.8378378378378378

For:  LR
Accuracy:  0.9690522243713733
Precision:  0.9491525423728814

For:  RF
Accuracy:  0.971953578336557
Precision:  0.9658119658119658

For:  Adaboost
Accuracy:  0.9439071566731141
Precision:  0.9166666666666666

For:  Bgc
Accuracy:  0.9632495164410058
Precision:  0.8846153846153846

For:  ETC
Accuracy:  0.9758220502901354
Precision:  0.9829059829059829

For:  GBDT
Accuracy:  0.9545454545454546
Precision:  0.941747572815534

For:  xgb
Accuracy:  0.9816247582205029
Precision:  0.968503937007874
