In [1]:
import pandas as pd

# Load the raw SMS data: tab-separated, no header row, so the two column
# names are supplied explicitly.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a configurable data directory.
# NOTE(review): read_csv applies its default quote handling here; if messages
# contain unbalanced double quotes, rows may be merged. Confirm the row count
# against the source file.
dataframe = pd.read_csv(
    "C:/Users/nrebe/Downloads/Data Science/SMSSpamCollectiondf.csv",
    sep="\t",
    header=None,
    names=["label", "text"],
    encoding="utf-8",
)

# Keep the raw frame untouched and do all preprocessing on a copy, so the
# original row count stays available for later comparison.
df_copy = dataframe.copy()
df_copy.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# Remove duplicate rows (rows that are identical across every column);
# drop_duplicates keeps the first occurrence of each by default.
df_copy = df_copy.drop_duplicates()

# Report the effect of de-duplication relative to the untouched raw frame.
print("Rows before:", len(dataframe))
print("Rows after:", len(df_copy))
print("Duplicates removed:", len(dataframe) - len(df_copy))


Rows before: 5572
Rows after: 5169
Duplicates removed: 403


In [5]:
import re

def basic_cleaning(text):
    """Normalize one SMS message for downstream tokenization.

    The steps run in this order: lowercase, remove URLs, remove e-mail
    addresses, delete punctuation, delete digits, then collapse runs of
    whitespace and trim the ends.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example ``None`` or the
        float ``NaN`` that pandas uses for a missing cell) is treated as an
        empty message instead of raising.

    Returns
    -------
    str
        The cleaned message, or ``""`` if nothing remains.
    """
    # A missing cell reaches Series.apply as NaN (a float), which has no
    # .lower(); return an empty message rather than crashing the whole apply.
    if not isinstance(text, str):
        return ""

    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs. "http\S+" already matches anything starting with
    #    "https", so a separate https alternative would never be reached.
    text = re.sub(r"http\S+|www\S+", "", text)

    # 3. Remove email addresses
    text = re.sub(r"\S+@\S+\.\S+", "", text)

    # 4. Remove punctuation. Characters are deleted, not replaced by a space,
    #    so "a,b" becomes "ab"; underscores count as \w and are kept.
    text = re.sub(r"[^\w\s]", "", text)

    # 5. Remove numbers
    text = re.sub(r"\d+", "", text)

    # 6. Collapse repeated whitespace and trim leading/trailing spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Run the regex-based cleaner over every raw message and store the result
# in a new column next to the original text.
df_copy["clean_text"] = df_copy["text"].map(basic_cleaning)

# Show raw and cleaned messages side by side for a visual sanity check.
df_copy.loc[:, ["text", "clean_text"]].head(19)

Unnamed: 0,text,clean_text
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...
5,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darling its been weeks now a...
6,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...
7,As per your request 'Melle Melle (Oru Minnamin...,as per your request melle melle oru minnaminun...
8,WINNER!! As a valued network customer you have...,winner as a valued network customer you have b...
9,Had your mobile 11 months or more? U R entitle...,had your mobile months or more u r entitled to...


##  **Text Preprocessing: Tokenization & Lemmatization**

Before converting the SMS messages into numerical features (e.g., TF-IDF), we need to further clean and standardize the text.  
This section introduces two essential NLP preprocessing steps: **tokenization** and **lemmatization**.

---

###  **Tokenization**
Tokenization is the process of splitting text into individual units called *tokens* (usually words).  
This allows the model to analyze each word separately instead of treating the message as a single long string.

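**Example:**  
`"Free entry in 2 a wkly comp"` → `["free", "entry", "in", "a", "wkly", "comp"]`  
(The lowercasing and the removal of the digit `2` come from the earlier cleaning step; tokenization itself only splits the text into words.)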

---

###  **Lemmatization**
Lemmatization reduces words to their **base dictionary form** (*lemma*).  
This helps unify different grammatical variations of the same word, reducing vocabulary size and improving model generalization.

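**Example:**  
`running → run`  
`studies → study`  
`better → good`

*Note:* these reductions depend on the word's part of speech. NLTK's `WordNetLemmatizer.lemmatize` treats every word as a noun unless a `pos` argument is supplied, so when it is called without `pos` (as in the code below) it reduces `studies → study` but leaves `running` and `better` unchanged.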

---

##  **Objective of This Step**
- Convert raw SMS text into a clean and structured representation  
- Reduce vocabulary complexity by standardizing word forms  
- Improve the quality of TF-IDF features  
- Increase the performance and robustness of ML models  

---

##  **Tools Used**
We will use the **NLTK (Natural Language Toolkit)** library:
- `word_tokenize` → tokenization  
- `WordNetLemmatizer` → lemmatization  
- `stopwords` (optional) → remove common non-informative words  

NLTK is lightweight, efficient, and well-suited for preprocessing classical NLP datasets such as SMS spam detection.

In [8]:
import nltk
# Fetch every NLTK resource this notebook relies on (tokenizer models,
# WordNet data and the stopword list), in the same order as before.
for resource in ('punkt', 'wordnet', 'omw-1.4', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nrebe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nrebe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nrebe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nrebe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nrebe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [9]:
# English stopwords held in a set so each membership check is a hash lookup.
stop_words = {*stopwords.words("english")}

# One lemmatizer instance shared by every call to the function below.
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    """Tokenize a cleaned message, drop stopwords, and lemmatize the rest.

    Parameters
    ----------
    text : str
        A message that has already been cleaned (lowercased, punctuation and
        digits removed).

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``""`` if every token
        was a stopword.

    Notes
    -----
    ``lemmatizer.lemmatize`` is called without a ``pos`` argument, so every
    token is lemmatized as a noun. Verb and adjective forms are therefore
    mostly left as they are.
    """
    kept = []
    for token in word_tokenize(text):
        # Filter on the surface form first: noun-mode lemmatization strips a
        # trailing "s" from some stopwords (e.g. "was" -> "wa", "has" -> "ha"),
        # and the mangled form would no longer match the stopword list.
        if token in stop_words:
            continue

        lemma = lemmatizer.lemmatize(token)

        # Filter again after lemmatizing, so a token whose lemma is itself a
        # stopword is still removed, exactly as the post-lemma filter did.
        if lemma not in stop_words:
            kept.append(lemma)

    return " ".join(kept)

# Derive the fully preprocessed column from the cleaned text.
df_copy["final_text"] = df_copy["clean_text"].map(tokenize_and_lemmatize)

# Compare the cleaned and the lemmatized versions of the first ten messages.
df_copy.loc[:, ["clean_text", "final_text"]].head(10)

Unnamed: 0,clean_text,final_text
0,go until jurong point crazy available only in ...,go jurong point crazy available bugis n great ...
1,ok lar joking wif u oni,ok lar joking wif u oni
2,free entry in a wkly comp to win fa cup final ...,free entry wkly comp win fa cup final tkts st ...
3,u dun say so early hor u c already then say,u dun say early hor u c already say
4,nah i dont think he goes to usf he lives aroun...,nah dont think go usf life around though
5,freemsg hey there darling its been weeks now a...,freemsg hey darling week word back id like fun...
6,even my brother is not like to speak with me t...,even brother like speak treat like aid patent
7,as per your request melle melle oru minnaminun...,per request melle melle oru minnaminunginte nu...
8,winner as a valued network customer you have b...,winner valued network customer selected receiv...
9,had your mobile months or more u r entitled to...,mobile month u r entitled update latest colour...


**We first tokenize the text to split it into individual words, then lemmatize each word to reduce it to its base form while filtering out stopwords, and finally join the remaining words back into a single cleaned string. Tokenization has to come first because the lemmatizer and the stopword filter both operate on individual words, not on a full sentence.**


In [10]:
# Write the preprocessed frame to disk as CSV; the row index is not saved.
df_copy.to_csv(
    path_or_buf=r"C:\Users\nrebe\Downloads\Spam Filter\preprocessed_sms_spam.csv",
    index=False,
)
