<a href="https://colab.research.google.com/github/Rammohith/AD-NLP/blob/main/spam_classification_proj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # For tokenizing text into words and sentences

# Importing stopwords from NLTK to remove common words that add little value
from nltk.corpus import stopwords

# Downloading required NLTK datasets
nltk.download('punkt')  # Downloads tokenizer models for sentence and word tokenization
nltk.download('punkt_tab')  # Optional: Additional support for tokenization
nltk.download('stopwords')  # Downloads predefined stopword lists for various languages

# Downloading the dataset from Kaggle using Kaggle CLI
# Here, we download a dataset containing spam emails
!kaggle datasets download -d abdallahwagih/spam-emails  # Downloads the spam emails dataset
!unzip spam-emails.zip  # Extracts the downloaded dataset

# Importing pandas for data manipulation
import pandas as pd

# Loading the dataset into a pandas DataFrame
# The CSV file contains columns like 'Message' (email text) and 'Label' (spam/not spam indicator)
df = pd.read_csv("spam.csv")

import re  # Regular expressions module for text cleaning

cleaned = []  # List to store cleaned text
for text in df['Message']:  # Iterating over each message in the 'Message' column
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Removing punctuation and special characters
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replacing multiple spaces with a single space
    cleaned_data = cleaned_text.strip()  # Removing leading and trailing spaces
    cleaned.append(cleaned_data)  # Appending the cleaned text to the list

# Tokenizing the cleaned text into words
# Each cleaned text is split into a list of individual words for further analysis
tokens = [word_tokenize(x) for x in cleaned]

# Removing stopwords from the tokenized words
# Stopwords are common words (e.g., "the", "is") that do not contribute much to analysis
stop = set(stopwords.words('english'))  # Fetching the list of English stopwords
stop_token = []  # List to store tokens after removing stopwords
for k in range(len(df['Message'])):  # Iterating through tokenized text
    p = [i for i in tokens[k] if i not in stop]  # Filtering out stopwords
    stop_token.append(p)  # Adding filtered tokens to the list

# Applying stemming to reduce words to their root form
# This helps group similar words (e.g., "running", "runner" -> "run")
from nltk.stem import PorterStemmer
ps = PorterStemmer()  # Initializing the Porter Stemmer
stemedata = []  # List to store stemmed data
for message in stop_token:  # Iterating over tokens after stopword removal
    st = [ps.stem(word) for word in message]  # Applying stemming to each word
    stemedata.append(st)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dataset URL: https://www.kaggle.com/datasets/abdallahwagih/spam-emails
License(s): apache-2.0
Downloading spam-emails.zip to /content
  0% 0.00/207k [00:00<?, ?B/s]
100% 207k/207k [00:00<00:00, 62.2MB/s]
Archive:  spam-emails.zip
  inflating: spam.csv                


In [None]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
#apply pos_tags on stop token

from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng' )

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
pos_tokens = [pos_tag(i) for i in stop_token]
pos_tokens

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
lm = WordNetLemmatizer()

In [None]:
lm.lemmatize('running','v')

'run'

In [None]:
pos_tokens[0]

[('Go', 'VB'),
 ('jurong', 'JJ'),
 ('point', 'NN'),
 ('crazy', 'NN'),
 ('Available', 'NNP'),
 ('bugis', 'NN'),
 ('n', 'RB'),
 ('great', 'JJ'),
 ('world', 'NN'),
 ('la', 'NN'),
 ('e', 'VBP'),
 ('buffet', 'JJ'),
 ('Cine', 'NNP'),
 ('got', 'VBD'),
 ('amore', 'RB'),
 ('wat', 'JJ')]

In [None]:
#function to convert nltks pos tags to wordnet pos tags
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ  # Adjective
    elif tag.startswith('V'):
        return wordnet.VERB  # Verb
    elif tag.startswith('N'):
        return wordnet.NOUN  # Noun
    elif tag.startswith('R'):
        return wordnet.ADV  # Adverb
    else:
        return wordnet.NOUN  # Default to noun if

In [None]:
get_wordnet_pos(pos_tokens[0][0][1])

'v'

In [None]:
pos_tokens[0][0][1]

'VB'

In [None]:
pos_tokens[0][0][0]

'Go'

In [None]:
lm.lemmatize(pos_tokens[0][0][0],get_wordnet_pos(pos_tokens[0][0][1]))

'Go'

In [None]:
lemmed_data = []
for x in range (len(pos_tokens)):
    lem = [lm.lemmatize(pos_tokens[x][y][0],get_wordnet_pos(pos_tokens[x][y][1])) for y in range(len(pos_tokens[x]))]
    lemmed_data.append(lem)
lemmed_data[0]

['Go',
 'jurong',
 'point',
 'crazy',
 'Available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'Cine',
 'get',
 'amore',
 'wat']

In [None]:
#applying count vectorization to convert text data into numerical format using bag of words

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [None]:
sen_stem = [' '.join(i) for i in stemedata]
sen_stem

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah i dont think goe usf live around though',
 'freemsg hey darl 3 week word back id like fun still tb ok xxx std chg send 150 rcv',
 'even brother like speak they treat like aid patent',
 'as per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner as valu network custom select receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hour',
 'had mobil 11 month u r entitl updat latest colour mobil camera free call the mobil updat co free 08002986030',
 'im gon na home soon dont want talk stuff anymor tonight k ive cri enough today',
 'six chanc win cash from 100 20000 pound txt csh11 send 87575 cost 150pday 6day 16 tsandc ap

In [None]:
X_vec = cv.fit_transform(sen_stem).toarray()

In [None]:
X_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
#importing multinomial nb model
from sklearn.naive_bayes import MultinomialNB
mb = MultinomialNB()

In [32]:
mb