# Spam Email Detection Program

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report
from imblearn.over_sampling import SMOTE

In [2]:
data = pd.read_csv("spam.csv")
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data["Label"].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
x = data["EmailText"]
y = data["Label"]

In [6]:
cvec = CountVectorizer()
# fit_transform create new columns (Create new sparse matrix)
cx = cvec.fit_transform(x)

In [7]:
smt=SMOTE()
x_sm,y_sm=smt.fit_resample(cx,y)

In [8]:
y_sm.value_counts()

Label
ham     4825
spam    4825
Name: count, dtype: int64

In [9]:
x_train,x_test,y_train,y_test=train_test_split(x_sm,y_sm,test_size=0.2,random_state=0)

In [11]:
# Hyperparameter Optimization
params = {'kernel': ['rbf','linear'],"C":[0.1,0.5,1]}
cv=KFold(n_splits=5)
model = SVC()

In [12]:
gsearch = GridSearchCV(SVC(), params,cv=cv)

In [13]:
results = gsearch.fit(x_train,y_train)
results.best_params_

{'C': 0.5, 'kernel': 'linear'}

In [14]:
sv=SVC(kernel="linear",C=0.5)

In [15]:
sv.fit(x_train,y_train)

In [16]:
y_pred=sv.predict(x_test)

In [17]:
y_pred

array(['ham', 'spam', 'spam', ..., 'spam', 'spam', 'spam'], dtype=object)

In [18]:
y_test

1070     ham
4488     ham
8763    spam
7372    spam
7633    spam
        ... 
212      ham
4546     ham
6411    spam
7916    spam
6712    spam
Name: Label, Length: 1930, dtype: object

In [19]:
accuracy_score(y_test,y_pred)

0.9549222797927461

In [20]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.98      0.93      0.95       936
        spam       0.93      0.98      0.96       994

    accuracy                           0.95      1930
   macro avg       0.96      0.95      0.95      1930
weighted avg       0.96      0.95      0.95      1930



In [21]:
emails=["Hey, you have won a car !!!!. Conrgratzz","Dear applicant, Your CV has been recieved. Best regards"]

In [22]:
sv.predict(cvec.transform(emails))

array(['spam', 'ham'], dtype=object)