In [13]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
df=pd.read_csv('/content/spam.csv' , encoding = "ISO-8859-1")


In [15]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [17]:
print(df.shape)
print(df.size)

(5572, 5)
27860


In [16]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [19]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [20]:
#dropping unwanted columns
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [21]:
#rename the columns v1 & v2 by label and text
df.columns = ['label', 'text']

In [22]:
#Convert labels to binary values
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [26]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import string

stopwords = nltk.corpus.stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [27]:
def preprocess_text(text, use_stemming=True, use_lemmatization=False):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stopwords])
    
    # Apply stemming or lemmatization if specified
    if use_stemming:
        text = ' '.join([stemmer.stem(word) for word in text.split()])
    elif use_lemmatization:
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    return text

In [28]:
df['text'] = df['text'].apply(preprocess_text, use_stemming=True, use_lemmatization=False)

In [39]:
df.head()

Unnamed: 0,label,text
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor u c alreadi say
4,0,nah dont think goe usf live around though


**Splitting the data in train and test**

In [29]:
#Split data into train and test data
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)


**Feature Engineering**

In [30]:
#Converting the text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


**Training data using Random Forest algorithm**

In [31]:
#Train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [32]:
#Predict on test data
y_pred_rf = clf.predict(X_test)

In [33]:
len(y_pred_rf)

1115

In [34]:
#Calculate accuracy score and confusion matrix
accuracy = accuracy_score(y_test, y_pred_rf)
conf_mat = confusion_matrix(y_test, y_pred_rf)

print("Accuracy:", accuracy)
print("Confusion matrix:\n", conf_mat)


Accuracy: 0.9748878923766816
Confusion matrix:
 [[965   0]
 [ 28 122]]


**Training the data using Naive Bayes classifier**

In [35]:
# Train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [36]:
# predict on test data
y_pred_nb = classifier.predict(X_test)

In [37]:
len(y_pred_nb)

1115

In [38]:
#Calculate accuracy score and confusion matrix
accuracy = accuracy_score(y_test, y_pred_nb)
conf_mat = confusion_matrix(y_test, y_pred_nb)

print("Accuracy:", accuracy)
print("Confusion matrix:\n", conf_mat)

Accuracy: 0.9632286995515695
Confusion matrix:
 [[965   0]
 [ 41 109]]
