# Data Extraction

In [1]:
# Import required libraries
import pandas as pd

In [2]:
# Load the raw SMS dataset from the CSV in the working directory.
# encoding='latin-1' is passed explicitly to avoid a UnicodeDecodeError on read.
data = pd.read_csv('spam_messages.csv', encoding='latin-1')

In [3]:
# Preview the first 5 rows to inspect the raw column layout
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Check the size of the raw dataset as (number of rows, number of columns)
data.shape

(5572, 5)

In [5]:
# Summarize each column's dtype and non-null count to spot mostly-empty columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


# Data Cleaning

In [6]:
# Build the cleaned frame in a single chain:
#   1. keep only v1 (label) and v2 (message text); the 'Unnamed: 2'..'Unnamed: 4'
#      columns are almost entirely null (50, 12 and 6 non-null values of 5572),
#   2. rename by explicit mapping so the result does not depend on column order,
#   3. drop rows with missing values (a safety net: info() reports no nulls in
#      v1/v2),
#   4. drop exact duplicate (Label, Message) rows.
data = (
    data[['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
    .dropna()
    .drop_duplicates()
)


In [7]:
# Preview the frame after cleaning (only the Label and Message columns remain)
data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Size after dropping null rows and duplicate messages
data.shape

(5169, 2)

# Data Analysis

In [9]:
# Count messages per label (ham vs. spam) to see how balanced the classes are
data['Label'].value_counts()

Label
ham     4516
spam     653
Name: count, dtype: int64

In [10]:
# Install NLTK into the kernel's environment via the %pip magic.
# NOTE(review): the version is unpinned — consider pinning it for reproducibility.
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [11]:
import nltk

# Fetch the sentence-tokenizer models. By default nltk.download() reports a
# failure (unknown package id, network error) by returning False rather than
# raising, so the fallback to the legacy 'punkt' models must branch on the
# return value. The except clause keeps the fallback for any case where the
# call does raise.
try:
    punkt_tab_ok = nltk.download('punkt_tab')
except Exception:
    punkt_tab_ok = False

if not punkt_tab_ok:
    nltk.download('punkt')

# Smoke-test the tokenizer once; `use_nltk` tells the feature code below
# whether it can rely on NLTK or must use the regex fallback.
try:
    from nltk.tokenize import sent_tokenize
    test = sent_tokenize("This is test message")
    print(f"NLTK tokenizer working: {len(test)} sentences detected")
    use_nltk = True
except Exception as e:
    print(f"NLTK failed: {e}")
    use_nltk = False

# Length of each message in characters
data['num_characters'] = data['Message'].map(len)

# Number of whitespace-separated tokens in each message
data['num_words'] = data['Message'].map(lambda msg: len(str(msg).split()))

# Number of sentences: regex heuristic when NLTK is unavailable, otherwise
# NLTK's sentence tokenizer
if not use_nltk:
    import re

    def count_sentences(text):
        """Count the non-blank pieces left after splitting on runs of '.', '!' or '?'."""
        pieces = re.split(r'[.!?]+', str(text))
        return sum(1 for piece in pieces if piece.strip())

    data['num_sentences'] = data['Message'].map(count_sentences)
else:
    data['num_sentences'] = data['Message'].map(lambda msg: len(sent_tokenize(str(msg))))

# Report the resulting column list and show the first 10 rows of the new
# numeric features
print(
    f"New columns: {list(data.columns)}",
    "Sample of new features:",
    data[['num_characters', 'num_words', 'num_sentences']].head(10),
    sep="\n",
)

NLTK tokenizer working: 1 sentences detected
New columns: ['Label', 'Message', 'num_characters', 'num_words', 'num_sentences']
Sample of new features:
   num_characters  num_words  num_sentences
0             111         20              2
1              29          6              2
2             155         28              2
3              49         11              1
4              61         13              1
5             148         32              4
6              77         16              2
7             160         26              2
8             158         26              5
9             154         29              3


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
# Preview the frame with the character, word and sentence count columns added
data.head()

Unnamed: 0,Label,Message,num_characters,num_words,num_sentences
0,ham,"Go until jurong point, crazy.. Available only ...",111,20,2
1,ham,Ok lar... Joking wif u oni...,29,6,2
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,28,2
3,ham,U dun say so early hor... U c already then say...,49,11,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,13,1


# Text Preprocessing

In [13]:
import string

# NLTK resources needed for preprocessing. No force=True: nltk.download()
# reports packages that are already installed as up to date and skips them, so
# re-running this cell does not re-fetch every archive.
downloads = [
    ('stopwords', 'Stopwords corpus'),
    ('punkt', 'Punkt tokenizer'),
    ('punkt_tab', 'Punkt tokenizer (new version)'),
]

for package, description in downloads:
    try:
        # By default nltk.download() signals failure by returning False rather
        # than raising, so the return value has to be checked explicitly.
        if not nltk.download(package):
            print(f"Failed to download {package} ({description})")
    except Exception as e:
        print(f"Failed to download {package}: {e}")

# Now import the required modules
try:
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    from nltk.tokenize import word_tokenize

    # Loading the list here surfaces a missing stopwords corpus immediately
    english_stopwords = stopwords.words('english')

except Exception as e:
    # Preprocessing cannot proceed without these, so report and re-raise
    print(f"Import failed: {e}")
    raise

# Initialize the stemmer
ps = PorterStemmer()

# Set form of the stopword list so each membership test is a hash lookup
# instead of a linear scan of the list.
_STOPWORD_SET = frozenset(english_stopwords)


def transform_text(text):
    """Normalize a raw message into a space-joined string of stemmed tokens.

    Steps: lowercase the text, tokenize it with NLTK, keep only alphanumeric
    tokens that are not English stopwords, then Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stems joined by single spaces (empty if none survive).
    """
    tokens = nltk.word_tokenize(text.lower())
    # isalnum() already rejects pure-punctuation tokens, so no separate
    # punctuation check is needed.
    stems = [
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in _STOPWORD_SET
    ]
    return " ".join(stems)

# Run the preprocessing function over every message
data['transformed_message'] = data['Message'].map(transform_text)

# Confirm the new column was added
print(
    f"\nDataset shape: {data.shape}",
    f"Columns: {list(data.columns)}",
    sep="\n",
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Dataset shape: (5169, 6)
Columns: ['Label', 'Message', 'num_characters', 'num_words', 'num_sentences', 'transformed_message']


In [14]:
# Preview the frame with the transformed_message column added
data.head()

Unnamed: 0,Label,Message,num_characters,num_words,num_sentences,transformed_message
0,ham,"Go until jurong point, crazy.. Available only ...",111,20,2,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,29,6,2,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,28,2,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,49,11,1,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,13,1,nah think goe usf live around though


In [15]:
# Import two text vectorization tools from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Upper bound on the TF-IDF vocabulary size, named so it can be tuned in one place.
MAX_TFIDF_FEATURES = 3000

# Count Vectorizer: simple bag-of-words. fit_transform() builds the vocabulary
# from the corpus and converts each message into a vector of term counts.
# NOTE(review): .toarray() materializes a dense matrix over the full
# vocabulary; if X_bow is not needed in dense form downstream, keeping the
# sparse result would save memory — confirm against later cells.
cv = CountVectorizer()
X_bow = cv.fit_transform(data['transformed_message']).toarray()

# TF-IDF on the same preprocessed text: down-weights terms that are frequent
# across messages and up-weights rarer ones, with the vocabulary capped at
# MAX_TFIDF_FEATURES terms.
tfidf = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X = tfidf.fit_transform(data['transformed_message']).toarray()


# Best Model Selection