## PROJECT : SPAM CLASSIFICATION 

OBJECTIVE: 

> We need to train a machine learning model to classify whether a message is spam or not.

__________________________________________

## Importing Libraries and Dataset

In [None]:
# importing the libraries 
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Download the NLTK data used by the preprocessing cells: the 'stopwords' word list
# and the 'wordnet' data behind WordNetLemmatizer.
# NOTE(review): 'omw-1.4' is presumably required by WordNet in recent NLTK releases, and
# 'punkt' (tokenizer models) is not used by any cell shown here — confirm before pruning.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


In [None]:
# Download the raw SMSSpamCollection file into the current working directory
# (it is read into a DataFrame in the next cell).
!wget https://raw.githubusercontent.com/krishnaik06/SpamClassifier/master/smsspamcollection/SMSSpamCollection

--2023-03-07 21:03:13--  https://raw.githubusercontent.com/krishnaik06/SpamClassifier/master/smsspamcollection/SMSSpamCollection
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 477907 (467K) [text/plain]
Saving to: ‘SMSSpamCollection’


2023-03-07 21:03:14 (12.5 MB/s) - ‘SMSSpamCollection’ saved [477907/477907]



In [None]:
# The file is tab-separated; column names are supplied explicitly via `names`,
# so the first line of the file is read as data rather than as a header.
column_names = ['label', 'message']
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=column_names,
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


<b> So in this dataframe we have two different Labels 'ham' and 'spam'

## <b> Data Cleaning and Pre-Processing

In [None]:
# Collect every message as a plain Python list.
sentences = df['message'].tolist()

# Data cleaning and preprocessing: strip non-letters, lowercase, drop English
# stopwords and lemmatize the remaining words.
porter = PorterStemmer()      # not used below; kept for the optional stemming variant
wordnet = WordNetLemmatizer()

# Build the stopword set once. Evaluating set(stopwords.words('english')) inside
# the filtering condition rebuilt the list and the set for every single token.
english_stopwords = set(stopwords.words('english'))


def clean_message(text):
    """Return `text` normalised for bag-of-words vectorisation.

    Every character outside a-z / A-Z is replaced by a space, the result is
    lowercased and split on whitespace, English stopwords are removed and each
    remaining word is lemmatized with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        One raw message.

    Returns
    -------
    str
        The cleaned words joined by single spaces (empty string if none remain).
    """
    words = re.sub("[^a-zA-Z]", ' ', text).lower().split()
    # To stem instead of lemmatize, replace wordnet.lemmatize with porter.stem.
    kept = [wordnet.lemmatize(word) for word in words if word not in english_stopwords]
    return ' '.join(kept)


# One cleaned string per message, in the same order as `sentences`.
corpus = [clean_message(sentence) for sentence in sentences]


In [None]:
# CREATING BAG-OF-WORDS

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# CountVectorizer performs bag of words; TfidfVectorizer is available for TF-IDF if necessary.

# max_features keeps only the most frequently occurring words as features.
bow = CountVectorizer(max_features=5000)

# NOTE(review): the vocabulary is fitted on the whole corpus, before the
# train/test split done further down, so test messages influence the feature
# set. Consider fitting on the training portion only.
term_counts = bow.fit_transform(corpus)

# X is the predictors dataset, converted to a dense array.
X = term_counts.toarray()

In [None]:
# Dimensions of the predictor array X built by the bag-of-words step.
X.shape

(5572, 5000)

In [None]:
# Encode the target labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
y = df['label'].map(label_codes).values
y.shape

(5572,)

<b> So far, we have obtained our predictor and target variables X and y, respectively.





## Creating a MODEL for prediction.

### NAIVE BAYES MODEL

In [None]:
from sklearn.model_selection import train_test_split

# Use 75% of the rows for training and the rest for testing;
# the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=41,
)

In [None]:
# Train a classifier using the Multinomial Naive Bayes technique.

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

# Fit the model on the training split:
model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix

# Predict labels for the held-out test data.
y_pred = model.predict(x_test)

# Confusion matrix of true labels against predicted labels on the test data.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[1172,   15],
       [   8,  198]])

In [None]:
# Summarise test-set performance with two scalar metrics: accuracy and F1.
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Both metrics compare the true test labels (y_test) with the predictions (y_pred).
acc_score = accuracy_score(y_test, y_pred)
print (f'ACCURACY SCORE FOR NAIVE_BAIS MODEL : {acc_score}')
# f1_score is called with its default arguments.
print (f'F1 SCORE FOR NAIVE_BAIS MODEL : {f1_score(y_test, y_pred)}')

ACCURACY SCORE FOR NAIVE_BAIS MODEL : 0.9834888729361091
F1 SCORE FOR NAIVE_BAIS MODEL : 0.9451073985680191


In [None]:
from sklearn.metrics import classification_report

# CLASSIFICATION REPORT FOR THE TEST SET
# y_test holds the true labels, y_pred the model's predictions for x_test.
test_report = classification_report(y_test, y_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1187
           1       0.93      0.96      0.95       206

    accuracy                           0.98      1393
   macro avg       0.96      0.97      0.97      1393
weighted avg       0.98      0.98      0.98      1393



In [None]:
# CLASSIFICATION REPORT FOR THE TRAIN SET
# y_train holds the true labels; predictions are recomputed on x_train so the
# result can be compared with the test-set report.
y_train_pred = model.predict(x_train)
train_report = classification_report(y_train, y_train_pred)
print(train_report)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      3638
           1       0.96      0.97      0.96       541

    accuracy                           0.99      4179
   macro avg       0.98      0.98      0.98      4179
weighted avg       0.99      0.99      0.99      4179



## <b>
By observing the classification reports, we can conclude that our machine learning model is performing quite well, with an accuracy of about 98% on the test set (99% on the training set).