# Spam Classification 

## Outline:

1. Load the dataset
2. Preprocess and split the dataset
3. Load the Word2Vec model
4. Vectorize the text data
5. Train the model (e.g., logistic regression)
6. Evaluate the model


### 1. Loading the Dataset

In [7]:
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

# Make sure the NLTK data used by this notebook is installed before it is
# needed: the stopword list (loaded just below), WordNet (lemmatizer) and the
# Punkt tokenizer tables (word_tokenize). Only resources that cannot be found
# locally are downloaded, so re-running the cell is cheap.
# NOTE(review): "punkt_tab" is what recent NLTK releases look up for
# word_tokenize; older releases use "punkt" — both are requested to cover either.
_NLTK_RESOURCES = {
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
}
for _package, _location in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_location)
    except LookupError:
        nltk.download(_package, quiet=True)

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


In [10]:
# The file is tab-separated and has no header row, so the two column names
# are supplied explicitly.
file_path = "SMSSpamCollection.txt"
column_names = ["label", "message"]

data = pd.read_csv(file_path, names=column_names, header=None, sep="\t")

# Class balance first, then a preview of the first rows.
print(data["label"].value_counts())
data.head()

label
ham     4825
spam     747
Name: count, dtype: int64


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 2. Preprocessing and Splitting the Data

In [8]:
def preprocess_data(text):
    """Turn one raw message into a cleaned, space-separated token string.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, delete digit runs, tokenize,
    drop tokens found in ``stop_words``, lemmatize what is left, and join
    the lemmas with single spaces.

    Args:
        text: The raw message as a string.

    Returns:
        The cleaned message as a single string (empty if no token survives).
    """
    lowered = text.lower()

    # Characters are deleted rather than replaced by a space, so text on
    # either side of a removed character is joined together.
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_digits = re.sub(r'\d+', '', without_punctuation)

    kept_lemmas = []
    for token in word_tokenize(without_digits):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)

In [12]:
# Clean every raw message and store the result next to the original text.
cleaned_messages = data["message"].apply(preprocess_data)
data["cleaned_message"] = cleaned_messages

In [21]:
# Features are the cleaned messages; targets encode ham as 0 and spam as 1.
label_codes = {'ham': 0, 'spam': 1}
X = data['cleaned_message']
y = data['label'].map(label_codes)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [18]:
# Report how many messages landed in each split.
print(
    f"The shape of the Train data is {X_train.shape}",
    f"The shape of the Test data is {X_test.shape}",
    sep="\n",
)

The shape of the Train data is (4457,)
The shape of the Test data is (1115,)


In [28]:
# Spot-check the cleaned text of the row labelled 0.
X.loc[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [24]:
# Spot-check the first three encoded labels.
y.head(3)

0    0
1    0
2    1
Name: label, dtype: int64

### 3. Load the Word2Vec model

In [5]:
# Build the path to the pre-trained Google News vectors from the current
# user's home directory instead of hard-coding one machine's absolute path.
# NOTE(review): the <home>/gensim-data/<name>/<name>.gz layout is assumed to be
# where the vectors were downloaded — adjust if they are stored elsewhere.
model_path = str(
    Path.home()
    / "gensim-data"
    / "word2vec-google-news-300"
    / "word2vec-google-news-300.gz"
)

# binary=True: the file is read as binary word2vec format, not the text format.
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

### 4. Vectorize the data

In [25]:
def sentence_to_vector(sentence, model, vector_size=300):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: Whitespace-separated tokens as a single string.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``.
        vector_size: Fallback dimensionality for the zero vector, used only
            when ``model`` does not expose a ``vector_size`` attribute.

    Returns:
        A 1-D NumPy array: the element-wise mean of the vectors of all words
        found in ``model``, or an all-zero vector when none are found.
    """
    # Prefer the model's own dimensionality so the zero-vector fallback always
    # has the same length as the averaged vectors; a mismatch would make the
    # rows of the stacked feature matrix ragged.
    vector_size = getattr(model, "vector_size", vector_size)

    word_vectors = [model[word] for word in sentence.split() if word in model]

    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [29]:
# Embed every message of each split; one row per message.
X_train_vectors, X_test_vectors = (
    np.array([sentence_to_vector(sent, model) for sent in split])
    for split in (X_train, X_test)
)

print(X_train_vectors.shape, X_test_vectors.shape)

(4457, 300) (1115, 300)


### 5. Train the model

In [31]:
# Baseline classifier: class weights are balanced, the iteration cap is
# raised to 1000, and the seed is fixed.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
lr.fit(X_train_vectors, y_train)

In [33]:
# Predict a label for every held-out test vector with the baseline model.
y_pred = lr.predict(X_test_vectors)

### 6. Evaluate the model

In [34]:
# Score the baseline predictions against the held-out labels.
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy Score:\n", baseline_accuracy)
print("Confusion Matrix:\n", baseline_confusion)


Accuracy Score:
 0.9381165919282511
Confusion Matrix:
 [[910  56]
 [ 13 136]]


In [35]:
# Hyper-parameter search for logistic regression: every combination of
# regularisation strength C and solver is scored with 5-fold cross-validation
# on the training vectors, using F1 as the selection metric.
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}

grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    param_grid,
    cv=5,
    scoring='f1',
)
grid_search.fit(X_train_vectors, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Re-evaluate on the held-out test set with the best estimator found.
best_model = grid_search.best_estimator_
y_pred_optimized = best_model.predict(X_test_vectors)

# Report the same two metrics as the baseline evaluation so the runs can be
# compared directly; the label names the metric that is actually printed.
print("Optimized Accuracy Score:\n", accuracy_score(y_test, y_pred_optimized))
print("Optimized Confusion Matrix:\n", confusion_matrix(y_test, y_pred_optimized))


Best Parameters: {'C': 1, 'solver': 'liblinear'}
Optimized Classification Report:
 0.9381165919282511
