# NLP with Spacy

In [1]:
# Importing required module
import pandas as pd
import numpy as np
import warnings;warnings.filterwarnings('ignore');

import spacy
# python -m spacy download en_core_web_sm
# Load the en_core_web_sm model

In [2]:
# Reading Data
# Determining weather message is spam or not
data = pd.read_csv("D:\Ravi_Data\spam.csv", encoding = 'latin-1')
data = data[['label','text']]
print(data.info())

data['label'] = data['label'].apply(lambda x: 1 if x == "spam" else 0)
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label    5572 non-null object
text     5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None


Unnamed: 0,label,text
0,0,"Couldn't Go until jurong point, crazy.. Availa..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Text cleaning

In [3]:
# Inansiating Spacy and tokonizing text variable
nlp = spacy.load("en_core_web_sm")
# stop word list in spacy for english words
stopwords = spacy.lang.en.stop_words.STOP_WORDS

def processing(text):
    
  	# Create Doc object
    global doc 
    doc = nlp(text, disable=['ner', 'parser'])
    # Generate lemmas
    lemmas = [token.lemma_ for token in doc]
    # Only if word doesn't start with capital letter. remove stopwords and non-alphabetic characters
    clean = []
    for lemma in lemmas:
        if lemma[0].isupper():
            clean.append(lemma)
        else:
            if lemma.isalpha() and lemma not in stopwords:
                clean.append(lemma)
    cleaned = [x.lower() for x in clean]
    return cleaned

data['text'] = data['text'].apply(processing)
data.head()

Unnamed: 0,label,text
0,0,"[could, jurong, point, crazy, available, bugis..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, early, hor, u, c]"
4,0,"[nah, think, usf, live]"


In [4]:
# Removing words which appeared only once in all document combined
from collections import defaultdict

frequency = defaultdict(int)
for text in data['text']:
    for token in text:
        frequency[token] += 1

data['text'] = [[token for token in text if frequency[token] > 1]for text in data['text']]

# joining all token again in sentences
data['text'] = data['text'].apply(lambda x : ' '.join(x))
print(data[['text']].head())    

                                                text
0  could point crazy available bugis n great worl...
1                                   ok lar wif u oni
2  free entry wkly comp win fa cup final tkts may...
3                                u dun early hor u c
4                                 nah think usf live


# Creating additional features from POS & NER

In [5]:
#pos= ["PROPN","NOUN","NUM","ADJ"]
#tag =['NNP','PERSON','MONEY','ORG','GPE','ORDINAL','WORK_OF_ART','EVENT','FAC']
    
file = data['text'].tolist()


def pos_tag_features(list):
    
    n = len(list)
    pnoun,noun,num,adj,nnp,pers,money,org,gpe,ordi,war,event,fac = (np.zeros(n) for k in range(13))

    for i,text in enumerate(list):
        doc = nlp(text,disable=['ner', 'parser'])
        pos = [token.pos_ for token in doc]
        pnoun[i] = pos.count("PROPN")
        noun[i] = pos.count("NOUN")
        num[i] = pos.count("NUM")
        adj[i] = pos.count("ADJ")
        
        tag = [token.tag_ for token in doc]
        nnp[i] = tag.count("NNP")
        pers[i] = tag.count("PERSON")
        money[i] = tag.count("MONEY")
        org[i] = tag.count("ORG")
        gpe[i] = tag.count("GPE")
        ordi[i]= tag.count("ORDINAL")
        war[i] = tag.count("WORK_OF_ART")
        event[i] = tag.count("EVENT")
        fac[i] = tag.count("FAC") 
        
    return pd.DataFrame({"pnoun":pnoun,"noun":noun,"num":num,"adj":adj,"nnp":nnp,"pers":pers,
                         "money":money,"org":org,"gpe":gpe,"ordi":ordi,"war":war,"event":event,"fac":fac})
    
        
pos_tag_DF = pos_tag_features(file)
pos_tag_DF.head() 

Unnamed: 0,pnoun,noun,num,adj,nnp,pers,money,org,gpe,ordi,war,event,fac
0,6.0,1.0,0.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,3.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Feature Selection from newly created features ( Fisher score)

In [6]:
# Works for binary classification
# work better if independent features are numeric.
# Standardization is not needed
# It finds importance of each feature independently based on its binary claasification variance
# Input of function must be DataFrame and Target varible name
# Higher Fisher score higher importance

def fisherScore(df, target):
    fisher_score = []
    y = np.unique(df[target])
    df_col = df.columns.tolist()
    df_col.remove(target)
    for col in df_col:
        abs_mean_diff = abs(df[df[target] == y[0]][col].mean() - df[df[target] == y[1]][col].mean())
        sqrt_var = np.sqrt(df[df[target] == y[0]][col].var() + df[df[target] == y[1]][col].var())
        fisher = abs_mean_diff / sqrt_var
        fisher_score.append(fisher.round(3))
        
    scoreDF = pd.DataFrame({'Column Name':df_col,'Fisher Score':fisher_score}).sort_values('Fisher Score', ascending = False)
    return scoreDF


In [7]:
# combining pos_tag_DF with label df 
df = pd.concat([pos_tag_DF,data[['label']]], axis=1)

# Creating Fisher score
result = fisherScore(df=df,target="label")

# only 4 newly created features are relevent so selecting only those and will be combined with base table.
reduced = result["Column Name"].head(4).tolist()
pos_tag_DF_fisher = pos_tag_DF[reduced]

# Higher Fisher score higher importance
result

Unnamed: 0,Column Name,Fisher Score
1,noun,0.698
0,pnoun,0.65
4,nnp,0.646
3,adj,0.504
2,num,0.039
5,pers,
6,money,
7,org,
8,gpe,
9,ordi,


In [8]:
# # Combning with base data dataframe and then do train test split

# loading sciki learn libraries and spliting data into test and train
from sklearn.model_selection import train_test_split
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(data['text'],data['label'],test_size=0.3,
                                                    random_state = 111,stratify =data['label'],shuffle=True)


# Bag of words creation and vectorization


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english',max_df=0.7)

# Tfidf vectorizer fit and transform
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_train_array = tfidf_train.toarray() 
# only transform X test data set
tfidf_test = tfidf_vectorizer.transform(X_test)
tfidf_test_array = tfidf_test.toarray() 

In [10]:
# selecting and Combining POS_Tag dataframe with vectorized data fame 
# indices of X train and test
train_index =  X_train.index
test_index = X_test.index

pos_tag_train_array = pos_tag_DF_fisher.iloc[train_index,:].values
pos_tag_test_array = pos_tag_DF_fisher.iloc[test_index,:].values


final_train = np.concatenate((tfidf_train_array,pos_tag_train_array ), axis=1)
final_test = np.concatenate((tfidf_test_array,pos_tag_test_array ), axis=1)
final_test

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ..., 11., 11.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ...,  9.,  9.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

# Multinomial Naive Bayes classifier

In [11]:
# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(final_train,y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(final_test)

In [12]:
# Evaluating Model
score = metrics.accuracy_score(y_test,pred)
print("Score :",score.round(2))
print("Fbeta :",metrics.fbeta_score(y_test,pred, beta = 0.5).round(2))

Score : 0.96
Fbeta : 0.91
