In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame({'Text':['Learnbay teaches NLP', 'NLP is in demand in market', 'We have to learn NLP'], 
                   'Output':[1,0,1]})

In [3]:
df

Unnamed: 0,Text,Output
0,Learnbay teaches NLP,1
1,NLP is in demand in market,0
2,We have to learn NLP,1


# Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Count Vectorizer is the package for BOW

In [6]:
cv = CountVectorizer()

In [7]:
bow = cv.fit_transform(df['Text'])

In [8]:
#Vocab
print(cv.vocabulary_)

{'learnbay': 5, 'teaches': 8, 'nlp': 7, 'is': 3, 'in': 2, 'demand': 0, 'market': 6, 'we': 10, 'have': 1, 'to': 9, 'learn': 4}


In [9]:
len(cv.vocabulary_)

11

In [15]:
bow.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
       [1, 0, 2, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [16]:
# This was an example of Count BOW

**Lets see binary BOW**

In [22]:
cv1 = CountVectorizer(binary = True)

In [23]:
bow = cv1.fit_transform(df['Text'])

In [24]:
print(cv1.vocabulary_)

{'learnbay': 5, 'teaches': 8, 'nlp': 7, 'is': 3, 'in': 2, 'demand': 0, 'market': 6, 'we': 10, 'have': 1, 'to': 9, 'learn': 4}


In [25]:
bow.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
       [1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [29]:
bow_2 = cv1.transform(['NLP is the leader in the makret and we all have to learn it'])

In [30]:
bow_2.toarray()

array([[0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [32]:
cv1.vocabulary_

{'learnbay': 5,
 'teaches': 8,
 'nlp': 7,
 'is': 3,
 'in': 2,
 'demand': 0,
 'market': 6,
 'we': 10,
 'have': 1,
 'to': 9,
 'learn': 4}

# Project

In [33]:
import os
import numpy as np
import pandas as pd

In [34]:
df = pd.read_csv(r"C:\Users\DeLL\Downloads\SPAM text message 20170820 - Data.csv")

In [35]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [36]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [37]:
df['Category'].value_counts()/len(df)

Category
ham     0.865937
spam    0.134063
Name: count, dtype: float64

In [38]:
# It is imbalanced Data.

In [39]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [40]:
ham = df[df['Category'] == 'ham']
spam = df[df['Category'] == 'spam']

In [41]:
print(ham.shape, spam.shape)

(4825, 2) (747, 2)


In [42]:
#Balance the data

In [43]:
spam = spam.sample(ham.shape[0], replace = True)

In [44]:
print(ham.shape, spam.shape)

(4825, 2) (4825, 2)


In [45]:
# We now have balanced data

In [46]:
data = pd.concat([ham, spam], axis = 0)

In [47]:
data.shape

(9650, 2)

In [48]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...


In [49]:
# Split the data into train and test

In [50]:
from sklearn.model_selection import train_test_split

In [60]:
x_train, x_test, y_train, y_test = train_test_split(data['Message'], data['Category'], test_size = 0.3, random_state = 42, stratify = data['Category'])

# Feature Extraction and Model Building

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [63]:
classifier = Pipeline([('bow', CountVectorizer()),
                       ('rfmodel', RandomForestClassifier())])

In [64]:
classifier.fit(x_train, y_train)

In [65]:
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

In [66]:
y_pred_test

array(['spam', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype=object)

In [67]:
# Evaluate the model

In [68]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [69]:
print(confusion_matrix(y_train, y_pred_train))

[[3377    0]
 [   0 3378]]


In [71]:
print(confusion_matrix(y_test, y_pred_test))

[[1448    0]
 [   0 1447]]


In [72]:
print(classification_report(y_train, y_pred_train))
print()
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3377
        spam       1.00      1.00      1.00      3378

    accuracy                           1.00      6755
   macro avg       1.00      1.00      1.00      6755
weighted avg       1.00      1.00      1.00      6755


              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      1448
        spam       1.00      1.00      1.00      1447

    accuracy                           1.00      2895
   macro avg       1.00      1.00      1.00      2895
weighted avg       1.00      1.00      1.00      2895



In [73]:
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

1.0
1.0


In [74]:
#checking model performance

In [75]:
test_1 = ['Hello, Hope you all are doing well.']
test_2 = ['Hi, hope you all are enjoying NLP session.']
test_3 = ['Congratulations, You won a lottery ticket worth $100,000! to claim call +65989894 and share your account details to ensure smooth transaction.']

In [76]:
print(classifier.predict(test_1))

['ham']


In [77]:
print(classifier.predict(test_2))
print(classifier.predict(test_3))

['ham']
['spam']


In [78]:
test_4 = ['Congratulation. you won a lottery ticket, please share your address on the below number']

In [79]:
print(classifier.predict(test_4))

['ham']


## We can see that since it is a machine learning model and not an SOTA (state of the art model), like llama3.0 etc., hence it can be tricked easily

In [80]:
## Also if we use deep learning models like word 2 vec or glove we can still see great results.

In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.DataFrame({'Text' : ['This pasta is tasty.', 'This pasta is not tasty'], 'Output' : [1,0]})

In [4]:
df

Unnamed: 0,Text,Output
0,This pasta is tasty.,1
1,This pasta is not tasty,0


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
cv = CountVectorizer()
#ngram_range = (1,1) - unigram or BOW, both are same here because ngram range = (1,1)

In [9]:
#Vocab

In [11]:
bow_uni_gram = cv.fit_transform(df['Text'])
cv.vocabulary_

{'this': 4, 'pasta': 2, 'is': 0, 'tasty': 3, 'not': 1}

In [12]:
cv_1 = CountVectorizer(ngram_range = (1,2))
bow_bi_gram = cv_1.fit_transform(df['Text'])
cv_1.vocabulary_

{'this': 8,
 'pasta': 5,
 'is': 0,
 'tasty': 7,
 'this pasta': 9,
 'pasta is': 6,
 'is tasty': 2,
 'not': 3,
 'is not': 1,
 'not tasty': 4}

In [13]:
cv_2 = CountVectorizer(ngram_range = (1,3))
bow_tri_gram = cv_2.fit_transform(df['Text'])
cv_2.vocabulary_

{'this': 11,
 'pasta': 6,
 'is': 0,
 'tasty': 10,
 'this pasta': 12,
 'pasta is': 7,
 'is tasty': 3,
 'this pasta is': 13,
 'pasta is tasty': 9,
 'not': 4,
 'is not': 1,
 'not tasty': 5,
 'pasta is not': 8,
 'is not tasty': 2}

In [15]:
cv_3 = CountVectorizer(ngram_range = (3,3))
bow_tri_gram_1 = cv_3.fit_transform(df['Text'])
cv_3.vocabulary_

{'this pasta is': 3, 'pasta is tasty': 2, 'pasta is not': 1, 'is not tasty': 0}

In [16]:
bow_tri_gram.toarray()

array([[1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1],
       [1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]], dtype=int64)