# text classification

In [7]:
# import necessary library

import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc 
import string

#nltk.download('stopwords') 
#nltk.download('wordnet') 

In [66]:
# load the dataset
df = pd.read_csv('text messages.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [67]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [69]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [70]:
# find target columns class 
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [75]:
text_col = 'Message'
span_col = 'Category'

# label Encoding

In [76]:
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

In [77]:
df['Category'] = le.fit_transform(df['Category'])

In [78]:
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# preprocessing

# remove stopword ans punctuation

In [79]:
df.Message[5]

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"

In [80]:
stopword = set(stopwords.words('english'))
stopword

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [81]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [82]:
def preprocessing_text(Message):
    # remove punctuation
    rem_pun = [char for char in Message if char not in string.punctuation]
    clean_word = ''.join(rem_pun)
    # remove stopwords
    
    Message = ([word for word in clean_word.split() if word.lower() not in stopword])
    return Message
    

In [83]:
df[text_col] = df[text_col].apply(preprocessing_text)

In [84]:
df[text_col]

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, £750, Pou...
5568                   [ü, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: Message, Length: 5572, dtype: object

# Lemmatizing 

In [85]:
lemma = WordNetLemmatizer()

In [86]:
def lemmatize_text(Message):
    lemmatized_text = ' '.join([lemma.lemmatize(word) for word in Message])
    return lemmatized_text

df[text_col] = df[text_col].apply(lemmatize_text)

In [87]:
df[text_col]

0       Go jurong point crazy Available bugis n great ...
1                                 Ok lar Joking wif u oni
2       Free entry 2 wkly comp win FA Cup final tkts 2...
3                     U dun say early hor U c already say
4                Nah dont think go usf life around though
                              ...                        
5567    2nd time tried 2 contact u U £750 Pound prize ...
5568                          ü b going esplanade fr home
5569                           Pity mood Soany suggestion
5570    guy bitching acted like id interested buying s...
5571                                       Rofl true name
Name: Message, Length: 5572, dtype: object

# apply vectorizer 

In [88]:
vectorizer = TfidfVectorizer()

x = vectorizer.fit_transform(df[text_col])
y = df[span_col]

In [89]:
x.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# train Model

In [90]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y, test_size =.20, random_state= 42)

# model evaluation 

In [92]:
models = [
    MultinomialNB(),
    BernoulliNB()
]

for model in models:
    model.fit(xtrain,ytrain)
    ypred= model.predict(xtest)
    ypred_proba = model.predict_proba(xtest)[:, 1]
    
    print(f"model : {type(model).__name__}")
    print("Accuracy score =", model.score(xtrain,ytrain))
    print("Accuracy score =", model.score(xtest,ytest))
    print("counfusion Matrix")
    print(confusion_matrix(ytest,ypred))
    print("AUC Score:", roc_auc_score(ytest,ypred_proba))
    
    print('\n')
        
    

model : MultinomialNB
Accuracy score = 0.9741978909580435
Accuracy score = 0.9659192825112107
counfusion Matrix
[[966   0]
 [ 38 111]]
AUC Score: 0.9781844456487001


model : BernoulliNB
Accuracy score = 0.9818263405878394
Accuracy score = 0.9766816143497757
counfusion Matrix
[[960   6]
 [ 20 129]]
AUC Score: 0.9842844637125349




# predictions 

In [94]:
random_text = input()

preprossed_text = preprocessing_text(random_text)
lemma_text = lemmatize_text(preprossed_text)
text_vector = vectorizer.transform([lemma_text])

for model in models:
    
    predictions = model.predict(text_vector)
    print(f"model : {type(model).__name__}")
    print("predictions: ", predictions)
    print('\n')
        

 " These messages claim that the recipient has won a lottery of $100000000 and request personal information or payment to receive the supposed winnings. Remember, you have to send $1000 to claim this lottery. Legitimate lotteries and contests do not ask for upfront fees to claim prizes before the deadline "
model : MultinomialNB
predictions:  [1]


model : BernoulliNB
predictions:  [1]


