## Text Classification



#### Importing modules and data pre-processing

In [1]:
import nltk
import numpy as np
import pandas as pd
import os
import re
import string 
from nltk.corpus import stopwords

In [2]:
table = str.maketrans({key: None for key in string.punctuation})
table

{33: None,
 34: None,
 35: None,
 36: None,
 37: None,
 38: None,
 39: None,
 40: None,
 41: None,
 42: None,
 43: None,
 44: None,
 45: None,
 46: None,
 47: None,
 58: None,
 59: None,
 60: None,
 61: None,
 62: None,
 63: None,
 64: None,
 91: None,
 92: None,
 93: None,
 94: None,
 95: None,
 96: None,
 123: None,
 124: None,
 125: None,
 126: None}

In [3]:
PATH = os.getcwd()
os.chdir(PATH)

In [6]:
data = pd.read_csv("SMSSpamCollection.csv",delimiter="\t") #Reading data
data.columns = ['Type','Text'] #Changing Column names 

In [7]:
data.head(5) #Displays first five records

Unnamed: 0,Type,Text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [9]:
def preprocess_text(text):
    text = re.sub(r'\s+',' ',text)
    text = re.sub('[\d]','',text)
    text = text.translate(table)
    return text

In [10]:
data['Text'] = data['Text'].apply(lambda x : preprocess_text(x))

In [11]:
data.head()

Unnamed: 0,Type,Text
0,ham,Ok lar Joking wif u oni
1,spam,Free entry in a wkly comp to win FA Cup final...
2,ham,U dun say so early hor U c already then say
3,ham,Nah I dont think he goes to usf he lives aroun...
4,spam,FreeMsg Hey there darling its been weeks now ...


##### Constructing tf-idf matrix

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [13]:
tfidf_transformer = TfidfVectorizer(ngram_range=(1,1), stop_words='english')
X_train_tfidf = tfidf_transformer.fit_transform(data['Text'])

In [18]:
type(tfidf_transformer)

sklearn.feature_extraction.text.TfidfVectorizer

In [19]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [20]:
X_train_tfidf.shape

(5571, 8339)

In [28]:
X_train_tfidf

<5571x8339 sparse matrix of type '<class 'numpy.float64'>'
	with 41245 stored elements in Compressed Sparse Row format>

In [21]:
Dense_mat = X_train_tfidf.todense()

type(Dense_mat)

numpy.matrixlib.defmatrix.matrix

In [29]:
Dense_mat

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
len(tfidf_transformer.get_feature_names())

8339

In [27]:
len(tfidf_transformer.vocabulary_)

8339

In [23]:
tfidf_transformer.get_feature_names()

['aa',
 'aah',
 'aaniye',
 'aaooooright',
 'aathilove',
 'aathiwhere',
 'ab',
 'abbey',
 'abdomen',
 'abeg',
 'abelu',
 'aberdeen',
 'abi',
 'ability',
 'abiola',
 'abj',
 'able',
 'abnormally',
 'aboutas',
 'abroad',
 'absence',
 'absolutely',
 'abstract',
 'abt',
 'abta',
 'aburo',
 'abuse',
 'abusers',
 'ac',
 'academic',
 'acc',
 'accent',
 'accenture',
 'accept',
 'access',
 'accessible',
 'accidant',
 'accident',
 'accidentally',
 'accommodation',
 'accommodationvouchers',
 'accomodate',
 'accomodations',
 'accordin',
 'accordingly',
 'accordinglyor',
 'account',
 'accounting',
 'accounts',
 'accumulation',
 'achanammarakheshqatar',
 'ache',
 'achieve',
 'acid',
 'acknowledgement',
 'aclpm',
 'acnt',
 'acoentry',
 'acsmsrewards',
 'act',
 'acted',
 'actin',
 'acting',
 'action',
 'activ',
 'activate',
 'active',
 'activities',
 'actor',
 'actual',
 'actually',
 'acwicmbcktzr',
 'ad',
 'adam',
 'add',
 'addamsfa',
 'added',
 'addicted',
 'addie',
 'adding',
 'address',
 'addressul

In [30]:
stop_words = stopwords.words('english')

In [31]:
stop_words.extend(['aa','aah','aaniye','abj','ag'])

In [45]:
tfidf_transformer = TfidfVectorizer(ngram_range=(1,1), stop_words=stop_words)
X_train_tfidf = tfidf_transformer.fit_transform(data['Text'])
X_train_tfidf

<5571x8472 sparse matrix of type '<class 'numpy.float64'>'
	with 45591 stored elements in Compressed Sparse Row format>

In [33]:
#Lets consider TF-idf
Dense_mat = X_train_tfidf.todense()
A = pd.DataFrame(Dense_mat, columns=tfidf_transformer.get_feature_names())
A.shape
A.head()

Unnamed: 0,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,abelu,aberdeen,abi,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zs,zyada,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
X_train_tfidf.shape

(5571, 8472)

##### Perform the train test split on the data and then build the model

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train, X_test, y_train, y_test=train_test_split(Dense_mat, data['Type'],test_size=0.3,random_state=1234)

In [38]:
X_train.shape

(3899, 8472)

In [39]:
X_test.shape

(1672, 8472)

In [40]:
y_train.shape

(3899,)

In [41]:
y_train.value_counts()

ham     3378
spam     521
Name: Type, dtype: int64

In [42]:
y_test.value_counts()

ham     1446
spam     226
Name: Type, dtype: int64

##### Implementing Naive Bayes

In [49]:
# Importing the required Libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

# Building the Naive Bayes Model
clf_train = MultinomialNB()

# Modelling Up on Train & Predicting up on Test
clf_train.fit(X_train, y_train)
pred = clf_train.predict(X_test)



In [50]:
prediction = confusion_matrix(y_test,pred)
prediction

array([[1446,    0],
       [  67,  159]])

In [51]:
from sklearn.metrics import recall_score, precision_score, accuracy_score

acc = accuracy_score(y_test, pred)
rec = recall_score(y_test, pred,pos_label='spam')
prec = precision_score(y_test, pred,pos_label='spam')

# Printing the Metrics
print(acc)
print(rec)
print(prec)

0.9599282296650717
0.7035398230088495
1.0


##### Implmenting lositic regression

In [52]:
# Importing the Logistic-Regression Model from SKLearn Package
from sklearn.linear_model import LogisticRegression

# Building the Logistic Regression Model
logreg_train = LogisticRegression()
clf_lr = logreg_train.fit(X_train, y_train)



In [53]:
# Test Metrics
pred_lr_test = clf_lr.predict(X_test)
acc_lr_test = accuracy_score(y_test, pred_lr_test)
rec_lr_test = recall_score(y_test, pred_lr_test, pos_label='spam')
prec_lr_test = precision_score(y_test, pred_lr_test, pos_label='spam')

# Printing the Metrics
print(acc_lr_test)
print(rec_lr_test)
print(prec_lr_test)

0.9521531100478469
0.6592920353982301
0.9802631578947368
