In [1]:
!pip install pandas numpy scikit-learn nltk matplotlib seaborn



In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sarav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading the Dataset

In [3]:
import pandas as pd

# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

# Select relevant columns
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Display the first few rows
print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


## Data Preprocessing

### Steps Involved:
#### Lowercasing: Convert all text to lowercase.
#### Removing Punctuation: Eliminate punctuation marks.
#### Removing Stopwords: Remove common words that may not contribute to the classification.
#### Stemming/Lemmatization: Reduce words to their root form.

In [4]:
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join back to string
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_message'] = df['message'].apply(preprocess_text)

print(df.head())

  label                                            message  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                     cleaned_message  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri 2 wkli comp win fa cup final tkt 21...  
3                u dun say earli hor u c alreadi say  
4          nah dont think goe usf live around though  
