In [45]:
import pandas as pd
import nltk
from nltk.corpus import stopwords # There are some words in a sentence; those do not carry enough information and therefore even if we remove those, it does not affect much the and it also saves commputational power.
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc
import string
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
# Load data
data = pd.read_csv('text messages.csv')
Category_column = 'Category'
Message_column = 'Message'

In [47]:
data.head()

# "ham" refers to legitimate or non-spam emails.
# "spam" refers to unsolicited or unwanted emails.

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Step 1: Remove punctuation and stopwords**

In [48]:
data.Category[0]

'ham'

In [49]:
data.Message[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [50]:
stopword = set(stopwords.words('english'))
stopword

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [51]:
data.Message[200]

'Found it, ENC  &lt;#&gt; , where you at?'

In [52]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [53]:
'DaTa'.lower()

'data'

In [54]:
def preprocess_Message(Message):
    # Remove punctuation (using List Comprehension)
    remove_punc = [char for char in Message if char not in string.punctuation]
    clean_words = ''.join(remove_punc) # char joining

    # Remove stopwords
    Message = ([word for word in clean_words.split() if word.lower() not in stopword]) #converting all the data into lower format
    return Message

In [55]:
data[Message_column] = data[Message_column].apply(preprocess_Message)

In [56]:
data[Message_column]

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, £750, Pou...
5568                   [ü, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: Message, Length: 5572, dtype: object

**Step 2: Lemmatization** (Normalizing the text)

In [57]:
lemmatizer = WordNetLemmatizer()

def lemmatize_Message(Message):
    lemmatized_Message = ' '.join([lemmatizer.lemmatize(word) for word in Message])
    return lemmatized_Message

data[Message_column] = data[Message_column].apply(lemmatize_Message)


In [58]:
data[Message_column]

0       Go jurong point crazy Available bugis n great ...
1                                 Ok lar Joking wif u oni
2       Free entry 2 wkly comp win FA Cup final tkts 2...
3                     U dun say early hor U c already say
4                Nah dont think go usf life around though
                              ...                        
5567    2nd time tried 2 contact u U £750 Pound prize ...
5568                          ü b going esplanade fr home
5569                           Pity mood Soany suggestion
5570    guy bitching acted like id interested buying s...
5571                                       Rofl true name
Name: Message, Length: 5572, dtype: object

**Step 3: TF-IDF vectorizer**

In [59]:
vectorizer = TfidfVectorizer()

x = vectorizer.fit_transform(data[Message_column])
y = data[Category_column]


**Step 4: Fit models**

In [60]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

**Step 5: Performance evaluation**

In [61]:
models = [
    MultinomialNB(),
    BernoulliNB()
]

for model in models:
    model.fit(xtrain, ytrain)

    ypred = model.predict(xtest)
    ypred_proba = model.predict_proba(xtest)[:, 1]
    print(f"Model: {type(model).__name__}")
    print('Accuracy Score =',model.score(xtest, ytest))
    print("Confusion Matrix:")
    print(confusion_matrix(ytest, ypred))
    print("AUC Score:", roc_auc_score(ytest, ypred_proba))

    print('\n')


Model: MultinomialNB
Accuracy Score = 0.9659192825112107
Confusion Matrix:
[[966   0]
 [ 38 111]]
AUC Score: 0.9781844456487001


Model: BernoulliNB
Accuracy Score = 0.9766816143497757
Confusion Matrix:
[[960   6]
 [ 20 129]]
AUC Score: 0.9842844637125349




**Step 6: Make predictions on random text**

In [62]:
random_Message = input()

preprocessed_Message = preprocess_Message(random_Message)
lemmatized_Message = lemmatize_Message(preprocessed_Message)
Message_vector = vectorizer.transform([lemmatized_Message])

for model in models:
    prediction = model.predict(Message_vector)
    print(f"Model: {type(model).__name__}")
    print("Prediction:", prediction)
    print('\n')

# 0 -> ham, 1-> spam.

These messages claim that the recipient has won a lottery of $100000000 and request personal information or payment to receive the supposed winnings. Remember, you have to send $1000 to claim this lottery. Legitimate lotteries and contests do not ask for upfront fees to claim prizes before the deadline 
Model: MultinomialNB
Prediction: ['spam']


Model: BernoulliNB
Prediction: ['spam']




In [63]:
random_Message = input()

preprocessed_Message = preprocess_Message(random_Message)
lemmatized_Message = lemmatize_Message(preprocessed_Message)
Message_vector = vectorizer.transform([lemmatized_Message])

for model in models:
    prediction = model.predict(Message_vector)
    print(f"Model: {type(model).__name__}")
    print("Prediction:", prediction)
    print('\n')

# 0 -> ham, 1-> spam.

I need an Iphone.
Model: MultinomialNB
Prediction: ['ham']


Model: BernoulliNB
Prediction: ['ham']


