<a href="https://colab.research.google.com/github/Nitin286roxs/NLP/blob/main/sms_spam_classification/sms_spam_classificatiom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
#loading text dataset
smses = pd.read_csv("/content/SMSSpamCollection.txt", sep="\t", names=['label', 'sms'])

In [3]:
smses.head(3)

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [4]:
smses.shape

(5572, 2)

In [5]:
#Data cleaning and preprocessing
import nltk
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
ps=PorterStemmer()

# Stemming

In [7]:
corpus = []
for i in range(0, len(smses)):
  review=re.sub('[^a-zA-z]', ' ', smses['sms'][i])
  review=review.lower()
  review=review.split()
  review=[ ps.stem(word) for word in review if word not in stopwords.words('english')]
  review=" ".join(review)
  corpus.append(review)


# Creating Bag of words model

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary=True)
X=cv.fit_transform(corpus).toarray()

In [9]:
y=pd.get_dummies(smses['label'], dtype=int)
Y=y.iloc[:, 1]

In [10]:
X.shape

(5572, 2500)

## Train and test split

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=0,)

## Use Naive Bays Model

In [12]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, Y_train)

## Model prediction

In [13]:
Y_Pred = spam_detect_model.predict(X_test)

## Accuracy score

In [14]:
from sklearn.metrics import accuracy_score, classification_report
score=accuracy_score(Y_Pred, Y_test)
score

0.9850478468899522

## Classification Report

In [15]:
print(classification_report(Y_Pred, Y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1456
           1       0.93      0.95      0.94       216

    accuracy                           0.99      1672
   macro avg       0.96      0.97      0.97      1672
weighted avg       0.99      0.99      0.99      1672



# TF-IDF model

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500)
X=tv.fit_transform(corpus).toarray()

## Train and test split

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=0,)

## Use Naive Bays Model

In [18]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, Y_train)

## TF-IDF model prediction

In [19]:
Y_Pred=spam_detect_model.predict(X_test)

## accuracy score and accuracy _report

In [20]:
from sklearn.metrics import accuracy_score, classification_report
score=accuracy_score(Y_Pred, Y_test)
score

0.9808612440191388

## classification report

In [21]:
print(classification_report(Y_Pred, Y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1483
           1       0.86      1.00      0.92       189

    accuracy                           0.98      1672
   macro avg       0.93      0.99      0.96      1672
weighted avg       0.98      0.98      0.98      1672



# Using Random forest model

In [22]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier().fit(X_train, Y_train)

##  random classifier prediction

In [23]:
Y_Pred=classifier.predict(X_test)

## accuracy score and classification report

In [24]:
from sklearn.metrics import accuracy_score, classification_report
score=accuracy_score(Y_Pred, Y_test)
score


0.9808612440191388

## classification report

In [25]:
print(classification_report(Y_Pred,Y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1477
           1       0.87      0.98      0.92       195

    accuracy                           0.98      1672
   macro avg       0.93      0.98      0.96      1672
weighted avg       0.98      0.98      0.98      1672



# Word2Vec implementation

In [26]:
!pip install gensim



## Lemmitization

In [27]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [28]:
corpus=[]
nltk.download('wordnet')
for i in range(len(smses)):
  review=re.sub('[^a-zA-z]', ' ', smses['sms'][i])
  review=review.lower()
  review=review.split()
  review=[lemmatizer.lemmatize(word) for word in review if word not in stopwords.words('english')]
  review=" ".join(review)
  corpus.append(review)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
len(corpus)

5572

## Tokenizer the corpus

In [30]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
words=[]
labels=[]
i = 0
nltk.download('punkt')
for sent in corpus:
  sent_token=sent_tokenize(sent)
  for sent in sent_token:
    if smses['label'][i] == "spam":
      labels.append(1)
    else:
      labels.append(0)

    words.append(simple_preprocess(sent))
  i += 1

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
len(words)

5564

In [32]:
len(labels)

5564

## Create our own vocab

In [33]:
import gensim

In [34]:
model = gensim.models.Word2Vec(words, window=5, min_count=2)

In [35]:
model.wv['call'].shape

(100,)

In [36]:
model.corpus_count

5564

In [37]:
model.wv.similar_by_word("love")

[('day', 0.9997634887695312),
 ('hope', 0.999747097492218),
 ('life', 0.9997429847717285),
 ('one', 0.9997348189353943),
 ('say', 0.9997230172157288),
 ('need', 0.9997205138206482),
 ('thing', 0.9997036457061768),
 ('go', 0.9996981024742126),
 ('like', 0.9996948838233948),
 ('much', 0.9996901750564575)]

## AvgWord2Vec

In [38]:
import numpy as np
def avg_word2vec(sentance):
  return np.mean([model.wv[word] for word in sentance if word in model.wv.index_to_key], axis=0)

In [39]:
!pip install tqdm



In [40]:
X=[]
empty_word3_vec_index=[]
from tqdm import tqdm
for i in tqdm(range(len(words))):
  vec=avg_word2vec(words[i])
  if vec.shape==(100,):
    #print(vec.shape)
    X.append(vec)
  else:
    empty_word3_vec_index.append(i)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5564/5564 [00:01<00:00, 2930.23it/s]


In [41]:
X_new=np.array(X)
X_new.shape

(5541, 100)

In [42]:
X_new[0].shape

(100,)

## Train classification model

In [43]:
Y_new=np.array(labels)

In [44]:
Y_new.shape

(5564,)

In [45]:
Y_new_dropped = np.delete(Y_new, empty_word3_vec_index, axis=0)

In [46]:
Y_new_dropped.shape

(5541,)

## Train model

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test= train_test_split(X_new, Y_new_dropped, test_size=0.3, random_state=0)

In [48]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier().fit(X_train, Y_train)

In [49]:
#predict
Y_pred=classifier.predict(X_test)

In [50]:
from sklearn.metrics import accuracy_score, classification_report
scopre=accuracy_score(Y_pred, Y_test)
print(score)

print(classification_report(Y_pred, Y_test))

0.9808612440191388
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1477
           1       0.77      0.92      0.84       186

    accuracy                           0.96      1663
   macro avg       0.88      0.95      0.91      1663
weighted avg       0.97      0.96      0.96      1663

