In [29]:
# Importing essential libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import nltk

In [30]:
# Loading the dataset
# The CSV is decoded as ISO-8859-1 and read from an absolute path at the filesystem root.
df = pd.read_csv("/spam.csv", encoding='ISO-8859-1')
# Create an encoder *instance*: binding the bare class would leave `le` without
# any fitted state, and calls such as le.fit(...) would fail for lack of `self`.
# (The encoder is not used by any later cell visible here.)
le = LabelEncoder()

**Exploratory Data Analysis (EDA)**


*   Check for NaN values








In [31]:
# Dimensions of the raw frame as (rows, columns)
df.shape

(5572, 5)

In [32]:
# Column labels exactly as loaded from the CSV
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [33]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


**Dropping the unwanted columns:**

`Unnamed: 2`, `Unnamed: 3` and `Unnamed: 4`

In [34]:
# Discard the three "Unnamed" columns, then give the two remaining
# columns descriptive names: v1 -> label, v2 -> text.
df = (
    df
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "label", "v2": "text"})
)
# Preview the trimmed, renamed frame
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [35]:
# Per-column dtype and non-null count; this doubles as the missing-value check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [36]:
# Summary statistics (count, unique, top, freq) for the label and text columns
df.describe()

Unnamed: 0,label,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [37]:
# Convert the frame to a 2-D NumPy array (one row per message, one column per frame column)
data = df.to_numpy()

In [38]:
# Display the array as the cell output
data

array([['ham',
        'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'],
       ['ham', 'Ok lar... Joking wif u oni...'],
       ['spam',
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"],
       ...,
       ['ham',
        'Pity, * was in mood for that. So...any other suggestions?'],
       ['ham',
        "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"],
       ['ham', 'Rofl. Its true to its name']], dtype=object)

In [39]:
# Column 1 of the array becomes the features X, column 0 the target y.
X, y = data[:, 1], data[:, 0]

In [40]:
# Display the feature array as the cell output
X

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ..., 'Pity, * was in mood for that. So...any other suggestions?',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
       'Rofl. Its true to its name'], dtype=object)

In [41]:
# Sanity check: features and target should have the same length
X.shape,y.shape

((5572,), (5572,))

# Data Cleaning Processes


  
  * Removing special characters using a regexp tokenizer.
  * Converting the whole text into lower case.
  * Removing the stop words.
  * Reducing each word to its stem using the Porter stemmer.
  * Creating a corpus of the cleaned text.











In [45]:
# Importing essential libraries for performing NLP
# NOTE(review): word_tokenize is imported but not used in the cells visible here.
from nltk.tokenize import RegexpTokenizer,word_tokenize
from nltk.stem import PorterStemmer
# Fetch the stop-word corpus so that stopwords.words(...) can be called in a later cell
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [46]:
# Tokenizer built from the pattern \w+ (runs of word characters).
# The pattern is a raw string: "\w" in a normal string literal is an invalid
# escape sequence that newer Python versions warn about; r"\w+" is the same text.
tokenizer = RegexpTokenizer(r"\w+")
# English stop words, held in a set for fast membership tests
sw = set(stopwords.words("english"))
# Stemmer shared by every call to getStem
ps = PorterStemmer()

In [47]:
def getStem(review):
  """Normalise one message into a space-separated string of word stems.

  Steps, in order:
    1. lower-case the text,
    2. split it with the module-level ``tokenizer``,
    3. drop every token found in the stop-word set ``sw``,
    4. stem each remaining token with ``ps.stem``,
    5. join the stems with single spaces.

  Args:
    review: the raw message text (a ``str``).

  Returns:
    The cleaned text as a single string; empty if no token survives.
  """
  lowered = review.lower()
  # Lazily filter stop words; token order is preserved.
  kept = (word for word in tokenizer.tokenize(lowered) if word not in sw)
  return " ".join(ps.stem(word) for word in kept)

In [48]:
# Building a corpus of text
def getDoc(document):
  """Build the cleaned corpus.

  Args:
    document: an iterable of raw message strings.

  Returns:
    A list holding ``getStem(message)`` for every message, in input order.
  """
  return [getStem(message) for message in document]

In [49]:
# Clean every raw message. This must run while X still holds the raw text,
# i.e. before X is rebound to the feature matrix further down.
stemmed_doc = getDoc(X)

In [50]:
# Inspect the first eight cleaned messages
stemmed_doc[:8]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send å 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun']

Data preprocessing with CountVectorizer

In [51]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings
cv = CountVectorizer()
# Learn the vocabulary from the whole cleaned corpus and encode it in one step.
# NOTE(review): the vocabulary is fit on all messages, before the train/test
# split performed in a later cell — confirm this is intended.
vc = cv.fit_transform(stemmed_doc)
# Display the resulting sparse matrix summary
vc

<5572x7213 sparse matrix of type '<class 'numpy.int64'>'
	with 46972 stored elements in Compressed Sparse Row format>

In [52]:
# Densify the document-term matrix into a plain ndarray.
# `todense()` would return a `numpy.matrix`, which recent scikit-learn releases
# refuse as estimator input; `toarray()` holds the same values as an ndarray.
# Note: this rebinds X from the raw text array to the numeric feature matrix.
X = vc.toarray()

# Model Building

In [54]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [55]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyper-parameters
model = MultinomialNB()
model.fit(X_train,y_train)
# Last expression of the cell: the score on the held-out split is shown as the output
model.score(X_test,y_test)



0.977705274605764

In [56]:
# Three hand-written sample messages used to spot-check the trained model
text_messages =["""
You probably know that I don't make stock recommendations. However, I have two thoughts regarding your personal expenditures that can save you real money. I'm suggesting that you call on the services of two subsidiaries of Berkshire: GEICO and Borsheim's.""",

"""ohh, you have won 200$ as unexpected price.you need to below click on link for the money to get deposited in your account""",

"""I estimate that about 40% of all auto drivers in the country can save money by insuring with GEICO. The figure is not 100% because insurers differ in their underwriting judgments, with some favoring drivers who live in certain geographical areas and work in certain occupations more than GEICO does. I believe, however, that GEICO more frequently offers the low price than does any other national carrier selling insurance to all comers. You can quickly find out whether you can save money by going to www.geico.com or by calling 800-555-2756.


Fine jewelry, watches and giftware will almost certainly cost you less at Borsheim's. I've looked at the figures for all publicly-owned jewelry companies and the contrast with Borsheim's is startling. Our one-store operation, with its huge volume, enables us to operate with costs that are fully 15-20 percentage points below those incurred by our competitors. We pass the benefits of this low-cost structure along to our customers.


Every year Borsheim's sends out thousands of selections to customers who want a long-distance opportunity to inspect what it offers and decide which, if any, item they'd like to purchase. We do a huge amount of business in this low-key way, which allows the shopper to conveniently see the exceptional values that we offer. Call Sean Moore or Karen Goracke at Borsheim's (800-642-4438) and save substantial money on your next purchase of jewelry.

"""]

In [57]:
def prepare(text_messages):
  """Turn raw messages into the feature representation produced by ``cv``.

  Args:
    text_messages: an iterable of raw message strings.

  Returns:
    Whatever ``cv.transform`` returns for the cleaned messages; the
    messages are first cleaned with ``getDoc``.
  """
  return cv.transform(getDoc(text_messages))
# Rebinds text_messages from the raw strings to their transformed form.
# Re-running this cell on its own would feed the transformed data back into
# prepare(); re-run the cell that defines the raw messages first.
text_messages = prepare(text_messages)

In [60]:
# Predict a label for each transformed sample message
# (the next cell repeats this call and displays the result)
y_pred = model.predict(text_messages)


In [61]:
# Predict a label for each transformed sample message
y_pred = model.predict(text_messages)
# Display the predicted labels as the cell output
y_pred

array(['ham', 'spam', 'spam'], dtype='<U4')