# <h1><center><u> Spam Detection Classifier</u></center></h1>

In [None]:
# Walk the Kaggle input directory and list every file found beneath it.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path of each file.
        print(os.path.join(dirname, filename))

## <u>Importing Libraries</u>

In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV

In [None]:
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation and fit-failure warnings raised by the
# modelling cells further down — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Display NLTK's English stop-word list, i.e. the words preprocess() filters out.
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))


<hr></hr>

## <u>Data Preparation</u>

### *Loading Dataset*

In [None]:
# Load the labelled messages. The codec name is spelled with a plain ASCII hyphen:
# the original used a typographic en dash ("ISO-8859–1"), which only resolves when the
# codec lookup happens to normalise the non-ASCII character.
msg = pd.read_csv("../input/spamorham/spam.csv", encoding="ISO-8859-1")
msg.head(10)

In [None]:
msg.shape

In [None]:
msg.tail()

###  *Comparing number of spams and hams in Dataset*

In [None]:
# For each value of "Label", count the non-null entries in every other column.
msg.groupby("Label").count()

In [None]:
# Pie chart of how many rows carry each "Label" value.
plt.figure(figsize=[5,5])
msg["Label"].value_counts().plot(kind='pie',legend=True,cmap="Set3")
plt.ylabel("Spam/NotSpam")

### *Checking missing values*

In [None]:
# Number of missing values in each column.
msg.isna().sum()

### *Evaluating and visualizing data according to text length*

In [None]:
# Character count of each raw message. len() raises on non-string cells, so this relies
# on "EmailText" containing no missing values (see the isna() check above).
msg["TextLength"]=msg["EmailText"].apply(len)
msg.head()

 * **Visualizing using pandas `hist` with seaborn styling**

In [None]:
# seaborn only supplies the theme and figure size here; the histograms are drawn by pandas.
sns.set_style("darkgrid")
sns.set(rc = {'figure.figsize' : (18,6)})
# One histogram of "TextLength" per "Label" value.
msg.hist(column = 'TextLength', by = 'Label', bins = 40,edgecolor = 'black',color="orange")

* **Visualizing using matplotlib.pyplot**

In [None]:
# Histogram of "TextLength" restricted to the rows labelled 'spam'.
plt.figure(figsize=(10, 5))
plt.hist(
    msg.loc[msg['Label'] == 'spam', 'TextLength'],
    bins=50,
    color='darkblue',
    edgecolor='darkblue',
)
plt.title('Spam Message Length', fontsize=20)
plt.xlabel('Message Length')
plt.ylabel('Count')
plt.show()

In [None]:
# Histogram of "TextLength" restricted to the rows labelled 'ham'.
plt.figure(figsize=(10, 5))
plt.hist(
    msg.loc[msg['Label'] == 'ham', 'TextLength'],
    bins=50,
    color='olive',
    edgecolor='olive',
)
plt.title('Ham Message Length', fontsize=20)
plt.xlabel('Message Length')
plt.ylabel('Count')
plt.show()

#### **Conclusion** -
**Spam messages tend to be longer than ham (non-spam) messages.**

<hr></hr>

## <u>Data Preprocessing</u>

### *Removing punctuation and stopwords, then stemming with the Snowball Stemmer*

In [None]:
# Data preprocessing helpers
# Translation table that deletes every ASCII punctuation character; built once, not per call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Lazily filled cache so the NLTK corpus file is read once instead of once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess(text, stop_words=None):
    """Strip punctuation from ``text`` and drop its stop words.

    Punctuation is deleted rather than replaced by a space, so "don't" becomes
    "dont". The stop-word comparison is case-insensitive, but the words that are
    kept retain their original casing.

    Args:
        text: The raw message as a string.
        stop_words: Optional container of lowercase words to remove. Defaults to
            NLTK's English stop-word list.

    Returns:
        The remaining words joined by single spaces ("" if nothing remains).
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    without_punct = text.translate(_PUNCT_TABLE)
    kept = [word for word in without_punct.split() if word.lower() not in stop_words]
    return " ".join(kept)


def stem(text, stemmer=None):
    """Stem every whitespace-separated word of ``text``.

    Args:
        text: A string of words separated by whitespace.
        stemmer: Optional object exposing ``stem(word) -> str``. Defaults to a
            ``SnowballStemmer("english")`` built once per call (the original
            built a new stemmer for every single word).

    Returns:
        The stemmed words joined by single spaces. Unlike the original
        string-concatenation loop, no trailing space is appended.
    """
    if stemmer is None:
        stemmer = SnowballStemmer("english")
    return " ".join(stemmer.stem(word) for word in text.split())

In [None]:
# Clean each raw message first, then stem the cleaned text; the result is stored in a new column.
msg["ProcessedEmailText"] = msg["EmailText"].apply(preprocess).apply(stem)


In [None]:
msg.head()

### *Adding Numeric column for Label column as LabelNum using Label Encoder*

In [None]:
# Map the string labels to integers. LabelEncoder numbers classes in sorted order, so
# assuming 'ham' and 'spam' are the only labels this gives ham -> 0 and spam -> 1.
le = preprocessing.LabelEncoder()
msg["LabelNum"]=le.fit_transform(msg["Label"])

In [None]:
msg["ProcessedEmailText"]

### *Vectorization of Data*

* **Using Count Vectorizer**

In [None]:
# Bag-of-words counts over the cleaned text. The original passed "english" positionally,
# which binds to CountVectorizer's first parameter `input` (not `stop_words`): older
# scikit-learn silently ignored it, and current releases reject positional arguments.
# Stop words are already removed by preprocess(), so the default vectorizer yields the
# features the original cell actually produced.
cv = CountVectorizer()
spam_model_cnt = cv.fit_transform(msg["ProcessedEmailText"])
spam_model_cnt.shape

* **Using TF-IDF Vectorizer**

In [None]:
# TF-IDF weights over the cleaned text. The original passed "english" positionally, which
# binds to TfidfVectorizer's first parameter `input` (not `stop_words`): older scikit-learn
# silently ignored it, and current releases reject positional arguments. Stop words are
# already removed by preprocess(), so the default vectorizer yields the same features.
tf = TfidfVectorizer()
spam_model_tfidf = tf.fit_transform(msg["ProcessedEmailText"])
spam_model_tfidf.shape


In [None]:
# Convert both sparse matrices to dense arrays. In the visible part of the notebook
# only vect_tfidf is used afterwards; vect_cnt is not referenced again.
vect_tfidf=spam_model_tfidf.toarray()
vect_cnt=spam_model_cnt.toarray()

#### Making Features Column

In [None]:
# Build the model input: one column per TF-IDF vocabulary term plus the raw message length.
# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out()
# and fall back to the old name only on releases that predate it.
if hasattr(tf, "get_feature_names_out"):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()
feature=pd.DataFrame(vect_tfidf,columns=feature_names)
feature["TextLen"]=msg["TextLength"]
feature.head()

***

## Modeling Data

* **Splitting Data into train and test data**

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
msg_train,msg_test,ans_train,ans_test=train_test_split(feature,msg["LabelNum"],test_size=0.2,random_state=20)

### <i><u>Logistic Regression Classifier</u></i>

In [None]:
#LogisticRegression
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with library defaults, fitted on the training split.
model=LogisticRegression()
model.fit(msg_train, ans_train)


In [None]:
# Accuracy on the held-out split. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy is symmetric, so the value is the same as before.
predicted=model.predict(msg_test)
accuracy_score(ans_test, predicted)

In [None]:
# Rows are the true class and columns the predicted class, both ordered [1, 0].
# The original passed the predictions first, which transposed the matrix.
cnf_matrix = confusion_matrix(ans_test, predicted, labels=[1,0])
cnf_matrix

In [None]:
# Ground truth goes first; with the arguments swapped, precision and recall trade places.
print(classification_report(ans_test, predicted))

* **Optimizing using GridSearchCV**

In [None]:
# Search space: seven log-spaced values of C from 1e-3 to 1e3, crossed with both penalty types.
grid = dict(C=np.logspace(-3, 3, num=7), penalty=["l1", "l2"])

In [None]:
# The grid includes penalty="l1", which the default lbfgs solver cannot fit. liblinear
# supports both l1 and l2, so every candidate in the grid is actually evaluated instead
# of half of them failing.
model=LogisticRegression(solver="liblinear")
# 5-fold cross-validated search over `grid` on the training split, using all available cores.
model_gs=GridSearchCV(model,grid,cv=5,n_jobs=-1,verbose=1)
model_gs.fit(msg_train, ans_train)

In [None]:
model_gs.best_params_

In [None]:
model_gs.best_score_

<hr>

### <i><u>Decision Tree Classifier</u></i>

In [None]:
#Decision_Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
# Tree construction involves random tie-breaking between candidate splits; fixing the seed
# makes the fitted tree, and therefore the reported accuracy, repeatable across runs.
model=DecisionTreeClassifier(random_state=20)
model.fit(msg_train,ans_train)

In [None]:
# Accuracy on the held-out split. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy is symmetric, so the value is the same as before.
predicted=model.predict(msg_test)
accuracy_score(ans_test, predicted)

In [None]:
# Rows are the true class and columns the predicted class, both ordered [1, 0].
# The original passed the predictions first, which transposed the matrix.
cnf_matrix = confusion_matrix(ans_test, predicted, labels=[1,0])
cnf_matrix

* **Optimizing using GridSearchCV**

In [None]:
# Candidate tree depths 1..10, each tried with both split criteria.
array = list(range(1, 11))
grid = dict(criterion=['gini', 'entropy'], max_depth=array)

In [None]:
# Seed the tree so the cross-validated scores and the selected parameters are repeatable.
model=DecisionTreeClassifier(random_state=20)
# 5-fold cross-validated search over `grid` on the training split, using all available cores.
model_gs=GridSearchCV(model,grid,cv=5,n_jobs=-1,verbose=1)
model_gs.fit(msg_train, ans_train)

In [None]:
model_gs.best_params_

In [None]:
model_gs.best_score_

***

### <i><u>K Nearest Neighbour Classifier</u></i>

In [None]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours baseline with k fixed at 3, fitted on the training split.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(msg_train,ans_train)

In [None]:
# Accuracy on the held-out split. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy is symmetric, so the value is the same as before.
predicted=model.predict(msg_test)
accuracy_score(ans_test, predicted)

In [None]:
# Ground truth goes first; with the arguments swapped, precision and recall trade places.
print(classification_report(ans_test, predicted))

* **Optimizing using GridSearchCV**

In [None]:
# Neighbour counts 1..10 crossed with both weighting schemes and three distance metrics.
array = list(range(1, 11))
grid = dict(
    n_neighbors=array,
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'minkowski'],
)

In [None]:
# 5-fold cross-validated search over `grid` on the training split, using all available cores.
model=KNeighborsClassifier()
model_gs=GridSearchCV(model,grid,verbose=1,cv=5,n_jobs=-1)
model_gs.fit(msg_train, ans_train)

In [None]:
model_gs.best_params_

In [None]:
model_gs.best_score_

***

### <i><u>Naive Bayes Classifier</u></i>

* **MultinomialNB**

In [None]:
#Naive_Bayes
from sklearn.naive_bayes import MultinomialNB
# 0.1 is the additive smoothing strength (alpha). It is passed by keyword because current
# scikit-learn releases no longer accept estimator parameters positionally.
model=MultinomialNB(alpha=0.1)
model.fit(msg_train,ans_train)

In [None]:
# Accuracy on the held-out split. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy is symmetric, so the value is the same as before.
predicted=model.predict(msg_test)
accuracy_score(ans_test, predicted)

In [None]:
# Ground truth goes first; with the arguments swapped, precision and recall trade places.
print(classification_report(ans_test, predicted))

* **GaussianNB**

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, fitted on the same training split.
model=GaussianNB()
model.fit(msg_train,ans_train)

In [None]:
# Accuracy on the held-out split. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy is symmetric, so the value is the same as before.
predicted=model.predict(msg_test)
accuracy_score(ans_test, predicted)

In [None]:
# Ground truth goes first; with the arguments swapped, precision and recall trade places.
print(classification_report(ans_test, predicted))

***

### <i><u>Support Vector Machine (SVM) Classifier</u></i>

In [None]:
#SVM
from sklearn import svm
# RBF-kernel support vector classifier fitted on the training split.
# NOTE(review): the notebook does not show how C=100 and gamma=0.001 were chosen.
model=svm.SVC(C=100, gamma=0.001, kernel='rbf')
model.fit(msg_train,ans_train)

In [None]:
# Accuracy on the held-out split. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy is symmetric, so the value is the same as before.
predicted=model.predict(msg_test)
accuracy_score(ans_test, predicted)

In [None]:
# Ground truth goes first; with the arguments swapped, precision and recall trade places.
print(classification_report(ans_test, predicted))

***

## <u>Conclusion</u>

### Accuracy of different models:

#### 1. **Logistic Regression Classifier:  $98.05$ %**
#### 2. **Decision Tree Classifier:  $95.52$ %**
#### 3. **KNN Classifier:  $94.43$ %**
#### 4. **Naive Bayes Classifier:  $97.67$ %**
#### 5. **SVM Classifier:  $96.23$ %**

***