# Text Preprocessing in Python using the spaCy library





**Here are the topics which we are going to cover:**

Tokenization

Lemmatization

Removing Punctuation and Stop Words

Part of Speech Tagging

Entity Recognition

## Tokenization

In [1]:
#importing libraries
import spacy

# Load spaCy's small English pipeline
nlp = spacy.load("en_core_web_sm")

# Sample sentence to tokenize
x = "Embracing and analyzing self failures (of however multitude) is a virtue of nobelmen."

# Running the pipeline over the text yields a Doc we can iterate token by token
doc = nlp(x)

# Collect the raw text of every token, in document order
tokens = [tok.text for tok in doc]
print(tokens)

['Embracing', 'and', 'analyzing', 'self', 'failures', '(', 'of', 'however', 'multitude', ')', 'is', 'a', 'virtue', 'of', 'nobelmen', '.']


### Below is sample code for sentence-tokenizing our text.



In [2]:
# Reload the pipeline so the sentencizer is added to a fresh Language object
nlp = spacy.load('en_core_web_sm')

# Add the rule-based 'sentencizer' component to the end of the pipeline.
# spaCy 3+ registers built-in components by name and rejects a component
# object passed to add_pipe; spaCy 2 requires the object built by create_pipe.
# Branching on the major version keeps this cell runnable on both.
if int(spacy.__version__.split(".")[0]) >= 3:
    sbd = nlp.add_pipe("sentencizer")
else:
    sbd = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sbd)

text = "Embracing and analyzing self failures (of however multitude) is a virtue of nobelmen. And nobility is a treasure few possess."

# Creating the doc object carrying our sentence boundaries
doc = nlp(text)

# doc.sents yields one span per sentence; note that `tokens` therefore holds
# whole sentences here, not individual word tokens
tokens = list(doc.sents)
print(tokens)


[Embracing and analyzing self failures (of however multitude) is a virtue of nobelmen., And nobility is a treasure few possess.]


## Lemmatization

In [3]:
# Reload the pipeline before lemmatizing a sample sentence
nlp = spacy.load('en_core_web_sm')

x = "Running down the street with my best buddy."

# Process the text into a Doc
doc = nlp(x)

# Pair each token's surface form with its lemma
tokens = [(tok.text, tok.lemma_) for tok in doc]
print(tokens)


[('Running', 'run'), ('down', 'down'), ('the', 'the'), ('street', 'street'), ('with', 'with'), ('my', '-PRON-'), ('best', 'good'), ('buddy', 'buddy'), ('.', '.')]


## Stop Words

In [4]:
from spacy.lang.en.stop_words import STOP_WORDS
# Alias spaCy's English stop-word collection and print it in full.
# The output renders as a set, so the order of the words carries no meaning.
stop = STOP_WORDS
print(stop)

{'whole', 'made', 'herself', 'on', 'neither', 'became', 'see', 'wherever', "'d", '’m', 'will', 'whom', 'anyhow', 'even', 'mine', 'five', 'yourselves', 'should', 'than', 'moreover', 'therein', 'around', 'once', 'after', 'both', 'in', 'other', 'whether', 'already', 'three', 'here', 'for', 'eleven', "'ll", 'keep', 'anyone', 'rather', 'has', 'at', 'put', 'throughout', 'further', 'just', 'beforehand', 'have', 'somewhere', '‘re', 'get', 'nothing', 'due', 'between', "'ve", 'still', 'least', 'indeed', '‘m', 'some', 'may', 'less', '‘ve', 'against', 'us', 'doing', 'is', 'down', 'hers', 'before', 'whoever', 'across', 'either', 'your', 'back', 'none', 'be', 'regarding', 'their', 'them', 'never', 'yours', 'my', 'else', 'also', '’re', 'anywhere', 'n’t', 'nor', 'few', 'although', 'over', 'beside', 'now', 'with', 'it', 'onto', 'she', 'nowhere', '‘ll', 'of', 'much', 'always', 'many', 'former', 'because', 'hereafter', 'about', 'every', 'twelve', 'whose', 'amount', 'out', 'however', 'could', 'therefore',

In [5]:
# Sample sentence
x = "Running down the street with my best buddy."

# Build the Doc that carries the per-token attributes
doc = nlp(x)

# Every token, untouched
tokens = [tok.text for tok in doc]
print(tokens)

# The same tokens with everything spaCy flags as a stop word dropped
filtered = [tok.text for tok in doc if not tok.is_stop]
print(filtered)


['Running', 'down', 'the', 'street', 'with', 'my', 'best', 'buddy', '.']
['Running', 'street', 'best', 'buddy', '.']


### Removing Punctuation

Comparing the two lists above, you can see that words such as "down", "the", "with" and "my" have been removed. Similarly, we can also remove punctuation from our text by using the `isalpha` method of string objects inside a list comprehension.



In [6]:
# Sample text containing punctuation and a contraction
x = "BLIMEY!! Such an exhausting day, I can't even describe."

# Build the Doc that carries the per-token attributes
doc = nlp(x)

# All tokens as produced by the tokenizer
tokens = [tok.text for tok in doc]
print(tokens)

# Keep only tokens that are neither stop words nor contain non-letter characters
filtered = [
    tok.text
    for tok in doc
    if not tok.is_stop and tok.text.isalpha()
]

print(filtered, "\n")
print(" ".join(filtered))


['BLIMEY', '!', '!', 'Such', 'an', 'exhausting', 'day', ',', 'I', 'ca', "n't", 'even', 'describe', '.']
['BLIMEY', 'exhausting', 'day', 'describe'] 

BLIMEY exhausting day describe


## Part-of-Speech Tagging (POS)


In [7]:
# Sample sentence
x = "Robin is an astute programmer"

# Run the pipeline to obtain the tagged Doc
doc = nlp(x)

# Pair each token's text with its coarse part-of-speech tag
pos = [(tok.text, tok.pos_) for tok in doc]
print(pos)


[('Robin', 'PROPN'), ('is', 'AUX'), ('an', 'DET'), ('astute', 'NOUN'), ('programmer', 'NOUN')]


## Entity Recognition


In [8]:
# Sample news excerpt; the line breaks and spacing inside the literal are part of the input text
x = u""" India is considering a proposal to guarantee as much as 3  
trillion rupees ($39 billion) of loans to small businesses as part of a plan to 
restart Asia's third-largest economy, which is reeling under the impact of a 40-
day lockdown, people with knowledge of the matter said."""

# Run the pipeline over the article
bloomberg = nlp(x)

# For every detected entity keep the span itself, its string label and the label's integer ID
entities = [(ent, ent.label_, ent.label) for ent in bloomberg.ents]
print(entities)

[(India, 'GPE', 384), (as much as 3, 'CARDINAL', 397), ($39 billion, 'MONEY', 394), (Asia, 'LOC', 385), (third, 'ORDINAL', 396)]


## Testing all these techniques on our SMS spam detection data set

In [9]:
import pandas as pd
import numpy as np

In [10]:
import os

# Location of the tab-separated SMS Spam Collection file. It can be overridden
# through the SMS_SPAM_PATH environment variable so the notebook is not tied to
# one machine; the default is the path the notebook was originally run with.
SMS_SPAM_PATH = os.environ.get(
    "SMS_SPAM_PATH",
    "/Users/sudhanshukumar/Documents/Development/Machine Learning/0 csv files/SMSSpamCollection.csv",
)

# names= supplies the two column labels explicitly instead of reading them from the file
df = pd.read_csv(SMS_SPAM_PATH, sep="\t", names=["label", "message"])

In [11]:
# Preview the first rows to confirm the label/message columns were parsed as expected
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [32]:
def text_process(message, join=False):
    """Lemmatize a message, dropping stop words and non-alphabetic tokens.

    Parameters
    ----------
    message : str
        Raw text, processed with the module-level spaCy pipeline ``nlp``.
    join : bool, default False
        If False, return the lemmas as a list (the form needed when this
        function is passed as a vectorizer ``analyzer``). If True, return
        the lemmas joined by single spaces, so the function no longer has
        to be edited by hand to obtain a string.

    Returns
    -------
    list of str or str
        The surviving lemmas, as a list or as one space-separated string.
    """
    doc = nlp(message)
    # isalpha() rejects punctuation, digits and mixed tokens such as "n't"
    final = [token.lemma_ for token in doc if not token.is_stop and token.text.isalpha()]
    return " ".join(final) if join else final

In [13]:
# Try text_process on a two-sentence sample and show both the list and the joined form
text = "Embracing AND analyzing self failures (of however multitude) is a virtue of nobelmen. And nobility is a treasure few possess."
lemmas = text_process(text)
print(lemmas, "\n")
print(" ".join(lemmas))


['embrace', 'analyze', 'self', 'failure', 'multitude', 'virtue', 'nobelman', 'nobility', 'treasure', 'possess'] 

embrace analyze self failure multitude virtue nobelman nobility treasure possess


In [14]:
df["message"].head(3).apply(text_process)  # Series.apply: runs text_process on each of the first three messages

0    [jurong, point, crazy, available, bugis, n, gr...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, wkly, comp, win, FA, Cup, final,...
Name: message, dtype: object

### Detect & remove empty strings
Technically, we're dealing with "whitespace only" strings. If the original .tsv file had contained empty strings, pandas **.read_csv()** would have assigned NaN values to those cells by default.

In order to detect these strings we need to iterate over each row in the DataFrame. The **.itertuples()** pandas method is a good tool for this as it provides access to every field. For brevity we'll assign the names `i`, `lb` and `msg` to the `index`, `label` and `message` columns.

In [15]:
# Collect the index of every row whose message is a whitespace-only string.
# The type check skips non-string cells such as NaN before calling isspace().
blanks = [
    idx
    for idx, _label, message in df.itertuples()
    if type(message) == str and message.isspace()
]

print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

In [17]:
# TF-IDF features with tokenisation delegated to text_process (spaCy lemmas,
# stop words and non-alphabetic tokens removed).
# NOTE(review): the vectorizer is fitted on every message before the train/test
# split in a later cell, so test-set text influences the learned vocabulary and
# IDF weights - consider fitting on the training split only.
tfidf = TfidfVectorizer(analyzer=text_process)
X = tfidf.fit_transform(df["message"])

In [18]:
# Target vector: the label column of the data set
y=df["label"]

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Train a linear SVM on the training split (fit returns the estimator itself),
# then report its mean accuracy on the held-out split
clf = LinearSVC().fit(X_train, y_train)
clf.score(X_test, y_test)

0.9802690582959641

In [21]:
# Baseline: TF-IDF with scikit-learn's default analyzer instead of text_process.
# NOTE(review): the vectorizer is fitted on all messages before the split, so
# test-set text influences the vocabulary/IDF - consider fitting on the
# training split only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df["message"])
y = df["label"]

# Same 80/20 split and seed as the previous experiment
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear SVM and report held-out accuracy
clf = LinearSVC().fit(X_train, y_train)
clf.score(X_test, y_test)

0.9838565022421525

In [22]:
# Baseline: raw term counts with scikit-learn's default analyzer.
# NOTE(review): the vectorizer is fitted on all messages before the split, so
# the vocabulary includes test-set terms - consider fitting on the training
# split only.
cv = CountVectorizer()
X = cv.fit_transform(df["message"])
y = df["label"]

# Same 80/20 split and seed as the previous experiments
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear SVM and report held-out accuracy
clf = LinearSVC().fit(X_train, y_train)
clf.score(X_test, y_test)

0.9901345291479821

In [30]:
# text_process returns a list of lemmas; join them here so the new column holds
# one space-separated string per message. Doing the join in this cell means the
# function does not have to be edited and re-run by hand to get this result.
df["processed"] = df["message"].apply(lambda message: " ".join(text_process(message)))

In [31]:
# Preview the data set again, now including the new 'processed' column
df.head() 

Unnamed: 0,label,message,processed
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win FA Cup final tkts Tex...
3,ham,U dun say so early hor... U c already then say...,U dun early hor U c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf live
