In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [3]:
#Load dataset
df=pd.read_csv("spam_sms.csv", encoding='latin-1')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 1. Preprocessing

In [4]:
df = df.rename(columns={'v1': 'label', 'v2': 'message'})
df = df[['label', 'message']]
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to C:\Users\RISHIKA
[nltk_data]     RAVICHANDRAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove special characters & numbers
    text = text.lower().split()
    text = [stemmer.stem(word) for word in text if word not in stop_words]
    return ' '.join(text)

df['cleaned_message'] = df['message'].apply(preprocess_text)

### 2. Text Vectorization


In [8]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_message'])

bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df['cleaned_message'])

X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
final_df = pd.concat([df[['label', 'cleaned_message']], X_tfidf_df], axis=1)
final_df.to_csv("cleaned_sms_data.csv.gz", index=False, compression='gzip')
final_df.to_parquet("cleaned_sms_data.parquet", compression='snappy')

### 3. Sentiment Analysis/Spam Detection

In [10]:
y = df['label']
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train_bow, X_test_bow, _, _ = train_test_split(X_bow, y, test_size=0.2, random_state=42)


In [11]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier()
}


In [12]:
results = []

for name, model in models.items():
    for vec_name, X_train, X_test in [("TF-IDF", X_train_tfidf, X_test_tfidf), ("Bag of Words", X_train_bow, X_test_bow)]:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        results.append([name, vec_name, accuracy, precision, recall, f1])

# Convert results to DataFrame and display
results_df = pd.DataFrame(results, columns=["Model", "Vectorization", "Accuracy", "Precision", "Recall", "F1-score"])
print(results_df)


                 Model Vectorization  Accuracy  Precision    Recall  F1-score
0          Naive Bayes        TF-IDF  0.965919   1.000000  0.746667  0.854962
1          Naive Bayes  Bag of Words  0.973991   0.875776  0.940000  0.906752
2  Logistic Regression        TF-IDF  0.955157   0.962963  0.693333  0.806202
3  Logistic Regression  Bag of Words  0.977578   1.000000  0.833333  0.909091
4        Random Forest        TF-IDF  0.977578   0.992126  0.840000  0.909747
5        Random Forest  Bag of Words  0.979372   1.000000  0.846667  0.916968
