# start proj

In [1]:
import pandas as pd

df = pd.read_csv("spam.csv",encoding='ISO-8859-1')
#ISO 8859-1 is a single-byte encoding that can represent the first 256 Unicode characters.

### analyzing data

In [2]:
df.shape

(5572, 5)

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [5]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#Renaming the columns
df.rename(columns={'v1':'label', 'v2':'msg'}, inplace=True)
df.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
#Checking for duplicated values
df.duplicated().sum()

403

In [8]:
df.drop_duplicates(inplace=True)
df.shape

(5169, 2)

In [9]:
#checking for percentage of spam and ham
df['label'].value_counts(normalize=True)*100

ham     87.366996
spam    12.633004
Name: label, dtype: float64

In [10]:
# convert label to a numerical variable
df['lbl_num'] = df.label.map({'ham':0, 'spam':1})
df.head()

Unnamed: 0,label,msg,lbl_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Data Preprocessing


In [11]:
import nltk
nltk.download('punkt')
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aboth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aboth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
'''
1) Convert text into lower_case 
2) Tokenization :- breaking list into words 
3) Removing stopwords 
4) Removing punctuation 
5) stemming :- Stemming is a technique used to reduce an inflected word down to its word stem. ex-> go,goes,going into 'go'
'''

def process_msg(msg):

    STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']

    msg = msg.lower()
    msg = nltk.word_tokenize(msg)
    msg = [word for word in msg if word not in string.punctuation]
    msg = [word for word in msg if word not in STOPWORDS]
    msg = [ps.stem(word) for word in msg]
    return msg

In [13]:
# calling function 'process_msg' by passing 'msg' records

df['clean_msg']=df['msg'].apply(process_msg)

In [14]:
df.head()

Unnamed: 0,label,msg,lbl_num,clean_msg
0,ham,"Go until jurong point, crazy.. Available only ...",0,"[go, jurong, point, crazi, .., avail, bugi, n,..."
1,ham,Ok lar... Joking wif u oni...,0,"[ok, lar, ..., joke, wif, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,0,"[dun, say, earli, hor, ..., c, alreadi, say, ...]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,"[nah, n't, think, goe, usf, live, around, though]"


In [15]:
from collections import Counter
#spam words
words = df[df.label=='ham'].clean_msg
ham_words = Counter()

for i in words:
    ham_words.update(i)
    
print(ham_words.most_common(50))

[('...', 1073), ('..', 524), ("'s", 413), ('go', 404), ("'m", 371), ('get', 349), ("n't", 338), ('gt', 288), ('lt', 287), ('come', 275), ('got', 236), ('know', 236), ('like', 234), ('call', 233), ('time', 219), ('ok', 217), ('love', 216), ('good', 213), ('want', 208), ("''", 201), ("'ll", 195), ('day', 190), ('need', 170), ('one', 165), ('lor', 159), ('home', 152), ('think', 149), ('see', 147), ('take', 143), ('still', 143), ('da', 138), ('tell', 133), ('make', 129), ('say', 127), ('back', 127), ('today', 123), ('hope', 122), ('ask', 121), ('sorri', 121), ('n', 120), ('ì_', 120), ('send', 120), ('r', 120), ('work', 118), ('meet', 112), ('hi', 111), ('well', 109), ('thing', 109), ('wat', 108), ('k', 107)]


In [16]:
words = df[df.label=='spam'].clean_msg
spam_words = Counter()

for i in words:
    spam_words.update(i)
    
print(spam_words.most_common(50))

[('call', 320), ('free', 191), ('txt', 141), ('text', 122), ('mobil', 114), ('stop', 104), ('repli', 103), ('claim', 98), ('prize', 82), ('get', 74), ('new', 64), ('servic', 64), ('tone', 63), ('send', 60), ("'s", 59), ('urgent', 57), ('nokia', 57), ('contact', 56), ('award', 55), ('phone', 52), ('cash', 51), ('pleas', 51), ('week', 49), ('win', 48), ('c', 45), ('collect', 45), ('min', 45), ('custom', 42), ('messag', 42), ('guarante', 42), ('per', 41), ('chat', 38), ('tri', 37), ('msg', 35), ('draw', 35), ('number', 35), ('cs', 35), ('show', 33), ('today', 33), ('offer', 33), ('line', 33), ('go', 32), ('receiv', 31), ('want', 31), ('latest', 30), ('rington', 30), ('landlin', 30), ('150ppm', 29), ('video', 29), ('1', 28)]


In [18]:
df['clean_msg'] = df.clean_msg.apply(' '.join)

In [19]:
# split X and y into training and testing sets 
from sklearn.model_selection import train_test_split

# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = df.clean_msg
y = df.lbl_num
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5169,)
(5169,)
(3876,)
(1293,)
(3876,)
(1293,)


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(X_train)

# learn training data vocabulary, then use it to create a document-term matrix
X_train_dtm = vect.transform(X_train)

# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)


# examine the document-term matrix
print(type(X_train_dtm), X_train_dtm.shape)

# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
print(type(X_test_dtm), X_test_dtm.shape)

<class 'scipy.sparse._csr.csr_matrix'> (3876, 6279)
<class 'scipy.sparse._csr.csr_matrix'> (1293, 6279)


In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_dtm)
tfidf_transformer.transform(X_train_dtm)

<3876x6279 sparse matrix of type '<class 'numpy.float64'>'
	with 32487 stored elements in Compressed Sparse Row format>

In [22]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [23]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
nb.fit(X_train_dtm, y_train)

In [24]:
from sklearn import metrics

# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

# calculate accuracy of class predictions
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred_class))

# print the confusion matrix
print("=======Confision Matrix===========")
metrics.confusion_matrix(y_test, y_pred_class)

0.9822119102861562


array([[1121,    7],
       [  16,  149]], dtype=int64)

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

pipe = Pipeline([('bow', CountVectorizer()), 
                 ('tfid', TfidfTransformer()),  
                 ('model', MultinomialNB())])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# calculate accuracy of class predictions
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred))

# print the confusion matrix
print("=======Confision Matrix===========")
metrics.confusion_matrix(y_test, y_pred)

0.9551430781129157


array([[1128,    0],
       [  58,  107]], dtype=int64)

In [26]:
# import an instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear')

# train the model using X_train_dtm
logreg.fit(X_train_dtm, y_train)

In [27]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([0.00547747, 0.01291851, 0.0069199 , ..., 0.02320109, 0.01453453,
       0.00310945])

In [28]:
# calculate accuracy of class predictions
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred_class))

# print the confusion matrix
print("=======Confision Matrix===========")
print(metrics.confusion_matrix(y_test, y_pred_class))

# calculate AUC
print("=======ROC AUC Score===========")
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.9706109822119103
[[1127    1]
 [  37  128]]
0.983991510853213
