# assignment with handling imbalance data 

In [1]:
# import necessary library

import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc 
import string

#nltk.download('stopwords') 
#nltk.download('wordnet') 

In [3]:
df = pd.read_csv('text messages.csv') # load the dataset 

In [4]:
df.head() # visualize the data 

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# imbalance handling with over sampling

In [5]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [6]:
y = df[['Category']]
x = df[['Message']]

In [7]:
x.head()

Unnamed: 0,Message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
y.head()

Unnamed: 0,Category
0,ham
1,ham
2,spam
3,ham
4,ham


In [9]:
from imblearn.over_sampling import RandomOverSampler
over = RandomOverSampler()

In [10]:
x_new, y_new = over.fit_resample(x,y) # store sampling value into new variables x_new, y_new

In [11]:
x_new.shape

(9650, 1)

In [12]:
y_new.shape

(9650, 1)

In [13]:
y_new.value_counts()

Category
ham         4825
spam        4825
dtype: int64

In [14]:
new_df = pd.concat([x_new, y_new], axis=1) # concat the dataset to have new dataset with sampling data

In [15]:
new_df.head()

Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [16]:
new_df.Category.value_counts()

ham     4825
spam    4825
Name: Category, dtype: int64

In [17]:
spam_col = 'Category'
spam_text = 'Message'

# label encoding 

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [19]:
new_df['Category'] = le.fit_transform(new_df['Category'])

In [20]:
new_df.head()

Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [21]:
new_df['Category'].value_counts()

0    4825
1    4825
Name: Category, dtype: int64

# Preprocessing

# stopword and punctuation removal 

In [22]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [23]:
stopword = set(stopwords.words('english'))
stopword

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [24]:
def preprocess(Message):
    #remove Punctuation
    rem_pun = [char for char in Message if char not in string.punctuation]
    clean_word = "".join(rem_pun)

    #stopword removal 
    Message =([word for word in clean_word.split() if word.lower() not in stopword])
    return Message

In [25]:
new_df[spam_text]= new_df[spam_text].apply(preprocess)

In [26]:
new_df[spam_text]

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
9645    [4, Costa, Del, Sol, Holiday, £5000, await, co...
9646    [ever, notice, youre, driving, anyone, going, ...
9647    [mobile, 11mths, Update, FREE, Oranges, latest...
9648    [next, amazing, xxx, PICSFREE1, video, sent, e...
9649    [dont, prize, go, another, customer, TC, wwwtc...
Name: Message, Length: 9650, dtype: object

# Applying lematizer to find root word

In [27]:
lemma = WordNetLemmatizer()

In [28]:
def lemmatizer(Message):
    
    lematized_Message = ' '.join([lemma.lemmatize(word) for word in Message])
    return lematized_Message

new_df[spam_text] = new_df[spam_text].apply(lemmatizer)

In [29]:
new_df[spam_text]

0       Go jurong point crazy Available bugis n great ...
1                                 Ok lar Joking wif u oni
2       Free entry 2 wkly comp win FA Cup final tkts 2...
3                     U dun say early hor U c already say
4                Nah dont think go usf life around though
                              ...                        
9645    4 Costa Del Sol Holiday £5000 await collection...
9646    ever notice youre driving anyone going slower ...
9647    mobile 11mths Update FREE Oranges latest colou...
9648    next amazing xxx PICSFREE1 video sent enjoy on...
9649    dont prize go another customer TC wwwtcbiz 18 ...
Name: Message, Length: 9650, dtype: object

# vectorizer 

In [30]:
vec = TfidfVectorizer()

x = vec.fit_transform(new_df[spam_text]) # context text data into vector format
y = new_df[spam_col]

In [31]:
x.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# fit model

In [32]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y, test_size = .20, random_state = 42)

# model selection and performance Evaluation


In [33]:
models =[
    MultinomialNB(), 
    BernoulliNB()
  
]

for model in models:
    model.fit(xtrain,ytrain)
    
    ypred = model.predict(xtest)
    ypred_proba = model.predict_proba(xtest)[:, 1]
    
    print(f"model: {type(model).__name__}")
    print("Accuracy Score=", model.score(xtest,ytest), '\n')
    print("confusion Matrix")
    print(confusion_matrix(ytest, ypred))
    print("AUC_score:", roc_auc_score(ytest, ypred_proba))
    print("\n")
    
    

model: MultinomialNB
Accuracy Score= 0.9829015544041451 

confusion Matrix
[[965  25]
 [  8 932]]
AUC_score: 0.9990511497958307


model: BernoulliNB
Accuracy Score= 0.9875647668393782 

confusion Matrix
[[990   0]
 [ 24 916]]
AUC_score: 0.9994336986890179




# make predictions for new data 

In [34]:
ran = input()

preprocess_text = preprocess(ran)
lematize_text = lemmatizer(preprocess_text)
vec_text = vec.transform([lematize_text])

for model in models:
    predictions = model.predict(vec_text)
    print(f"model: {type(model).__name__}")
    print("predictions", predictions)
    
    print('\n')

 " These messages claim that the recipient has won a lottery of $100000000 and request personal information or payment to receive the supposed winnings. Remember, you have to send $1000 to claim this lottery. Legitimate lotteries and contests do not ask for upfront fees to claim prizes before the deadline "
model: MultinomialNB
predictions [1]


model: BernoulliNB
predictions [1]


