<a href="https://colab.research.google.com/github/Rammohith/AD-NLP/blob/main/spam_classification_proj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # For tokenizing text into words and sentences

# Importing stopwords from NLTK to remove common words that add little value
from nltk.corpus import stopwords

# Fetch the NLTK resources that tokenization and stopword removal rely on
for resource in (
    'punkt',      # tokenizer models for sentence and word tokenization
    # NOTE(review): newer NLTK releases appear to load 'punkt_tab' in place of
    # 'punkt' - confirm before treating either download as optional
    'punkt_tab',
    'stopwords',  # predefined stopword lists for various languages
):
    nltk.download(resource)

# Downloading the dataset from Kaggle using Kaggle CLI
# Here, we download a dataset containing spam emails
!kaggle datasets download -d abdallahwagih/spam-emails  # Downloads the spam emails dataset
!unzip spam-emails.zip  # Extracts the downloaded dataset

# Importing pandas for data manipulation
import pandas as pd

# Loading the extracted CSV into a pandas DataFrame
# The preprocessing below reads the email text from the 'Message' column.
# NOTE(review): the target column was described here as 'Label' (spam/not spam indicator);
# confirm the actual column name in spam.csv before relying on it
df = pd.read_csv("spam.csv")

import re  # Regular expressions module for text cleaning

# Normalise every message in the 'Message' column:
#   1. drop punctuation and special characters (anything that is not a word char or whitespace)
#   2. collapse each run of whitespace into a single space
#   3. trim leading and trailing spaces
punctuation_re = re.compile(r'[^\w\s]')
whitespace_re = re.compile(r'\s+')
cleaned = [
    whitespace_re.sub(' ', punctuation_re.sub('', message)).strip()
    for message in df['Message']
]

# Split every cleaned message into its individual word tokens
# The result holds one list of words per message, in the same order as `cleaned`
tokens = list(map(word_tokenize, cleaned))

# Removing stopwords from the tokenized words
# Stopwords are common words (e.g., "the", "is") that do not contribute much to analysis
stop = set(stopwords.words('english'))  # Set of English stopwords for fast membership checks
# The messages are never lowercased, so each token is compared in lowercase form:
# otherwise capitalised stopwords such as "The" or "Is" would not match the
# stopword list and would survive the filter. The token itself keeps its
# original casing for the later steps.
stop_token = [
    [token for token in message_tokens if token.lower() not in stop]
    for message_tokens in tokens
]

# Stem every remaining token down to its root form
# Inflected variants then collapse onto one feature (e.g., "running", "runs" -> "run")
from nltk.stem import PorterStemmer
ps = PorterStemmer()  # One stemmer instance is reused for every token
# One list of stemmed words per message, built from the stopword-filtered tokens
stemedata = [[ps.stem(token) for token in filtered] for filtered in stop_token]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset URL: https://www.kaggle.com/datasets/abdallahwagih/spam-emails
License(s): apache-2.0
spam-emails.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  spam-emails.zip
replace spam.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Display the loaded DataFrame to inspect its contents
df

In [None]:
# Set up part-of-speech tagging for the stopword-filtered tokens (stop_token)

from nltk import pos_tag
# Download the perceptron tagger data under both resource names
# NOTE(review): presumably the '_eng' name is the one newer NLTK releases look up - confirm
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng' )

In [None]:
# Tag every stopword-filtered message with part-of-speech labels.
# pos_tag_sents handles the whole batch in one call instead of invoking pos_tag
# once per message, so the tagger is looked up once rather than per message.
# The result is one list of (token, tag) pairs per message.
pos_tokens = nltk.pos_tag_sents(stop_token)
pos_tokens

In [None]:
# Import the WordNet lemmatizer and download the corpora requested for it
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
# NOTE(review): presumably 'omw-1.4' (Open Multilingual WordNet) is needed by the WordNet reader - confirm
nltk.download('omw-1.4')

In [None]:
# Create the lemmatizer instance used by the lemmatization cells
lm = WordNetLemmatizer()

In [None]:
# Quick check: lemmatize 'running' with the verb POS ('v')
lm.lemmatize('running','v')

In [None]:
# Inspect the (token, tag) pairs produced for the first message
pos_tokens[0]

In [None]:
# Function to convert NLTK's POS tags to WordNet POS tags
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The start of the tag decides the result: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),   # Adjective
        ('V', wordnet.VERB),  # Verb
        ('N', wordnet.NOUN),  # Noun
        ('R', wordnet.ADV),   # Adverb
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun when no prefix matches

In [None]:
# Try the tag conversion on the POS tag of the first token in the first message
get_wordnet_pos(pos_tokens[0][0][1])

In [None]:
# Raw NLTK POS tag of the first token in the first message
pos_tokens[0][0][1]

In [None]:
# The first token of the first message itself
pos_tokens[0][0][0]

In [None]:
# Lemmatize the first token of the first message using its converted WordNet POS
lm.lemmatize(pos_tokens[0][0][0],get_wordnet_pos(pos_tokens[0][0][1]))

In [None]:
# Lemmatize every (token, POS tag) pair of every message. Each NLTK tag is first
# translated to its WordNet equivalent so the lemmatizer is told the word class.
# The tagged messages are iterated directly (with tuple unpacking) rather than
# through range(len(...)) double indexing.
lemmed_data = [
    [lm.lemmatize(token, get_wordnet_pos(tag)) for token, tag in tagged_message]
    for tagged_message in pos_tokens
]
lemmed_data[0]  # Show the lemmas of the first message

In [None]:
# Apply count vectorization to convert text data into a numerical bag-of-words format

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()  # Created with default settings; not fitted yet

In [None]:
# Rebuild one space-separated string per message from its stemmed tokens,
# so the messages can be handed to the CountVectorizer as text
sen_stem = list(map(' '.join, stemedata))
sen_stem

In [None]:
# Fit the vectorizer on the stemmed messages and build the document-term count matrix.
# .toarray() converts the result to a dense array, which the prediction cell indexes by row.
X_vec = cv.fit_transform(sen_stem).toarray()

In [None]:
# Display the vectorized feature matrix
X_vec

In [None]:
# Import the Multinomial Naive Bayes model and create an unfitted classifier
from sklearn.naive_bayes import MultinomialNB
mb = MultinomialNB()

In [None]:
# `y` is not assigned in any visible cell of this notebook, so fitting raised a
# NameError. Build the target vector from the DataFrame's label column.
# NOTE(review): the loading cell describes that column as 'Label', while the Kaggle
# spam-emails CSV appears to ship it as 'Category' - either is accepted here; confirm.
label_column = 'Category' if 'Category' in df.columns else 'Label'
y = df[label_column]
# Train the classifier on the bag-of-words features and their spam/ham labels
mb.fit(X_vec, y)

In [None]:
# Predict the class of the first message; the row is reshaped into a
# single-sample 2-D batch because predict is given one row per sample
mb.predict(X_vec[0].reshape(1, -1))