<a href="https://colab.research.google.com/github/S-AILAB/Spam-Classification-NLP/blob/main/Spam_Classification_NLP_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objective
The goal of this exercise is to classify messages as either spam or ham using Natural Language Processing techniques. You will preprocess the text data, apply feature extraction methods, train a machine learning model, and evaluate its performance.

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## Loading the dataset

In [None]:
# Location of the SMS dataset inside the Colab runtime.
DATA_PATH = '/content/Spam_SMS.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the loaded frame (pandas truncates the display of long frames).
df

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


## Data Pre-Processing

In [None]:
# Count missing values in each column.
df.isna().sum()

Unnamed: 0,0
Class,0
Message,0


In [None]:
# Total number of cells in the frame (rows x columns).
df.size

11148

In [None]:
# (number of rows, number of columns)
df.shape

(5574, 2)

In [None]:
# Column labels of the frame.
df.columns

Index(['Class', 'Message'], dtype='object')

In [None]:
# Class balance: number of ham vs. spam messages.
df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
ham,4827
spam,747


## Text Preprocessing

### Cleaning the text: removing special characters, stop-word removal, tokenization, stemming

In [None]:
import re
import nltk
# Fetch the NLTK stop-word corpus that the text-cleaning loop reads.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Porter stemmer: reduces words to their stems (this is stemming, not lemmatization).
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Build the cleaned corpus: one space-joined string of stemmed, non-stop-word
# tokens per message, in the same order as the rows of `df`.

# The stop-word set and the stemmer do not change between messages, so they are
# created once instead of once per word / per message.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column itself rather than a hard-coded row count, so the
# loop stays correct if the dataset grows or shrinks.
for text in df['Message']:
  # Replace every non-letter with a space, lower-case, and split on whitespace.
  words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
  # Drop stop words (a single pass is enough) and stem what remains.
  message = ' '.join(ps.stem(word) for word in words if word not in stop_words)
  corpus.append(message)

In [None]:
# Spot-check one cleaned message.
corpus[5000]

'hmph go head big baller'

## Feature extraction

### Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts, limited to 1000 features.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split, so test messages influence which terms are kept.
cv = CountVectorizer(max_features=1000)
cv.fit(corpus)
X = cv.transform(corpus).toarray()

# Target labels ('ham' / 'spam'), aligned row-for-row with X.
y = df['Class']

In [None]:
# Number of labels (one per message).
len(y)

5574

## Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
bow_splits = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = bow_splits

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the bag-of-words training split.
model = MultinomialNB().fit(X_train, y_train)
model  # last expression, so the fitted estimator is still displayed

In [None]:
# Second classifier for comparison: logistic regression on the same training split.
# Here `le` is the LogisticRegression model (not a label encoder).
from sklearn.linear_model import LogisticRegression

le = LogisticRegression().fit(X_train, y_train)
le  # last expression, so the fitted estimator is still displayed

## Prediction

In [None]:
# Naive Bayes predictions for the held-out bag-of-words features.
y_pred = model.predict(X_test)

In [None]:
# Predicted labels from the Naive Bayes model.
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [None]:
# Optional, not executed: encode the string labels as integers.
# NOTE: `le` already names the LogisticRegression model in this notebook, so the
# encoder below uses a different name to avoid overwriting it.

# from sklearn.preprocessing import LabelEncoder

# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(df['Class'])  # 'ham' becomes 0, 'spam' becomes 1

# The train/test split and model.fit(...) must be re-run with the encoded `y`
# before the predictions come back as integers.
# y_pred = model.predict(X_test)
# print(y_pred)  # e.g. array([0, 0, 1, ...]) once the model is retrained on encoded labels

# If you still want to convert predictions back to labels later (for reports or output):
# y_pred_labels = label_encoder.inverse_transform(y_pred)

In [None]:
# Logistic-regression predictions for the same held-out features.
y_pred_le = le.predict(X_test)

In [None]:
# Predicted labels from the logistic-regression model.
y_pred_le

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

## Confusion matrix

In [None]:
#confusion matrix for Naive_bayes
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Rows are true labels, columns are predicted labels; classes appear in sorted
# order, i.e. ham first, then spam.
cm

array([[936,  14],
       [ 12, 153]])

In [None]:
# Confusion matrix for Logistic Regression (predictions from `le`)
cmle = confusion_matrix(y_test, y_pred_le)

In [None]:
# Rows are true labels, columns are predicted labels (ham first, then spam).
cmle

array([[946,   4],
       [ 24, 141]])

# Same pipeline, using TF-IDF features instead of Bag of Words

In [None]:
# Repeats the Naive Bayes experiment with TF-IDF features in place of raw
# term counts. Reads `corpus` and `df` from earlier cells; creates
# tfidf, X1, y1, the X1_*/y1_* splits, model1 and y_pred1, and rebinds `cm`.

## Feature Extraction: TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000)
X1 = tfidf.fit_transform(corpus).toarray()
y1 = df['Class']

# Report the size of this section's own target vector (was `y`, the bag-of-words one).
print(len(y1))

## Model Training: Naive Bayes
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=0)

from sklearn.naive_bayes import MultinomialNB
model1 = MultinomialNB()
# Train on the TF-IDF split. Fitting on X_train/y_train (the bag-of-words
# features) and then predicting on X1_test would score the model on a feature
# space it was never trained on.
model1.fit(X1_train, y1_train)

## Prediction
y_pred1 = model1.predict(X1_test)
print(y_pred1)

## Evaluation: Confusion Matrix
from sklearn.metrics import confusion_matrix
# Rebinds `cm`, replacing the bag-of-words Naive Bayes matrix computed earlier.
cm = confusion_matrix(y1_test, y_pred1)
print(cm)


5574
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']
[[946   4]
 [ 24 141]]
