<a href="https://colab.research.google.com/github/RheedWhan/Email-Spam-Classification/blob/main/Email_Spam_Classification_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import GridSearchCV

In [47]:
df = pd.read_csv('/content/spam.csv') # Import dataset

In [48]:
df.head() # print the first 5 rows

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
df.shape # the number of rows and columns in the dataset

(5572, 2)

In [50]:
# Split dataset into train and test
X = df['EmailText']
y = df['Label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
# Extract features
cv = CountVectorizer()
features = cv.fit_transform(x_train)

In [52]:
# Define the parameter grid

# Use GridSearchCV to find the best parameters

tuned_param = {'kernel': ['linear', 'rbf'], 'gamma' : [1e-3, 1e-4], 'C' : [1,10, 100, 10001]}

In [53]:
# Build model
model = GridSearchCV(svm.SVC(), tuned_param)
model.fit(features, y_train)

In [54]:
print(model.best_params_)

{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}


In [55]:
# Test Accuracy
features_test = cv.transform(X_test)

In [56]:
print(f'Accuracy: {model.score(features_test, y_test)}')

Accuracy: 0.8654708520179372


### Using Randomized Search

In [57]:
from sklearn.model_selection import RandomizedSearchCV

tuned_param = {'kernel': ['linear', 'rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000, 10000]}

# Use RandomizedSearchCV for a more comprehensive search
random_search = RandomizedSearchCV(svm.SVC(), tuned_param, n_iter=20, random_state=42)
random_search.fit(features, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

# Build the model with the best parameters
best_model = svm.SVC(**best_params)
best_model.fit(features, y_train)

# Test Accuracy
features_test = cv.transform(X_test)
accuracy = best_model.score(features_test, y_test)
print(f"Test Accuracy: {accuracy}")

Best Parameters: {'kernel': 'rbf', 'gamma': 0.001, 'C': 1}
Test Accuracy: 0.8654708520179372
