In [18]:
# Importing necessary libraries

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [19]:
# Load the spam dataset from CSV, decoding the file with the latin1 codec.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.

file_path = "C:\\Windows\\spam.csv"  # Double backslashes escape the backslash in the Windows file path
spam_df = pd.read_csv(file_path, encoding='latin1')

# Summarise the columns, non-null counts and dtypes.
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
spam_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB
None


In [20]:
# Preview the first five rows of the dataframe as plain text
print(spam_df.head(n=5))

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [21]:
# Display the dataframe as the cell output (long frames are shown truncated, with an elided "..." row)
spam_df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [22]:
# Count the null entries in every column

missing_values = spam_df.isna().sum(axis=0)

In [23]:
# Show only the columns that contain at least one missing value

print(missing_values.loc[missing_values.gt(0)])

Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64


In [24]:
# Remove the three "Unnamed" columns, which are almost entirely null

columns_to_drop = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
spam_df_cleaned = spam_df.drop(columns_to_drop, axis="columns")


In [25]:
# Verify the columns have been dropped.
# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.

spam_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [26]:

def clean_text(text):
    """Normalise a raw text value for downstream tokenization.

    Every character that is not an ASCII letter or whitespace is deleted
    (this removes punctuation and special characters, and digits as well),
    the result is lower-cased, and runs of whitespace are collapsed to a
    single space with leading/trailing whitespace removed.

    Parameters
    ----------
    text : str
        The raw text. Any non-string value (e.g. ``None`` or a NaN from a
        missing dataframe cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing remains or when the
        input is not a string.
    """
    # Missing cells reach .apply() as None/NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # Characters are deleted (not replaced by a space), so "don't" becomes "dont".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    # split() with no arguments drops leading/trailing whitespace and collapses runs.
    return ' '.join(letters_only.lower().split())


In [27]:
# Show the column labels of the cleaned dataframe
spam_df_cleaned.columns

Index(['v1', 'v2'], dtype='object')

In [28]:
# Apply text cleaning to the columns

In [29]:
# Run clean_text over every value in the v1 column
spam_df_cleaned['v1'] = spam_df_cleaned['v1'].map(clean_text)

In [30]:
# Run clean_text over every value in the v2 column
spam_df_cleaned['v2'] = spam_df_cleaned['v2'].map(clean_text)

In [32]:
# Fetch the Punkt tokenizer data that word_tokenize relies on.
# nltk is already imported in the imports cell, so it is not re-imported here.
# Newer NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are downloaded to keep tokenization working on
# either version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [35]:
# Tokenize the text: split each cleaned message into individual word tokens.
# Only the message column (v2) is tokenized. v1 holds the ham/spam label, and
# tokenizing it into the same 'tokens' column was a dead store that the v2
# assignment overwrote immediately.

spam_df_cleaned['tokens'] = spam_df_cleaned['v2'].apply(word_tokenize)

In [37]:
import nltk
nltk.download('stopwords')

# Stop-word removal: drop very common English words that carry little meaning

stop_words = set(stopwords.words('english'))

# Keep only the tokens that are absent from the stop-word set
spam_df_cleaned['tokens'] = spam_df_cleaned['tokens'].map(
    lambda row_tokens: [tok for tok in row_tokens if tok not in stop_words]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [38]:
# Lemmatization: reduce every token to its base form.

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Replace each token list with the list of its lemmas
spam_df_cleaned['tokens'] = spam_df_cleaned['tokens'].map(
    lambda row_tokens: list(map(lemmatizer.lemmatize, row_tokens))
)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


In [39]:
# Vectorization

# Rebuild one space-separated string per row from its token list
spam_df_cleaned['cleaned_text'] = spam_df_cleaned['tokens'].map(' '.join)


In [40]:
# Initialize the TF-IDF vectorizer with its default settings (no arguments are passed)

tfidf_vectorizer = TfidfVectorizer()

In [41]:
# Fit the vectorizer on the cleaned text column and transform that same column in one call

tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=spam_df_cleaned['cleaned_text'])