Various approaches to converting text into vectors

1. Label Encoding
2. One hot encoding
3. Bag of words
4. TF-IDF
5. Word Embeddings

# Text Representation: Stop Words

In [1]:
from collections import Counter

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Number of entries in spaCy's built-in English stop-word collection.
len(STOP_WORDS)

326

In [2]:
# Load the small English pipeline and print every token of a sample sentence
# that spaCy flags as a stop word.
nlp = spacy.load("en_core_web_sm")

sentence = "We just opened our wings, the flying part is coming soon"
doc = nlp(sentence)

for token in doc:
    # Skip anything that is not flagged as a stop word.
    if not token.is_stop:
        continue
    print(token)

We
just
our
the
part
is


In [3]:
def preprocess(text):
    """Return `text` with stop-word tokens removed.

    The text is tokenized with the module-level `nlp` pipeline. The texts of
    all tokens not flagged `is_stop` are joined with single spaces, so
    punctuation tokens end up space-separated in the result.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return " ".join(kept)

In [4]:
# Quick check of preprocess() on a single sentence.
preprocess("Musk wants time to prepare for a trial over his")

'Musk wants time prepare trial'

Remove stop words from pandas dataframe text column

In [5]:
import pandas as pd

# Read doj_press.json (one JSON record per line) and keep the first 100 rows
# as a small working sample.
df = pd.read_json("doj_press.json", lines=True)
df_sample = df.iloc[:100]
df_sample.shape

(100, 6)

In [6]:
# Keep only the rows whose `topics` value has a non-zero length, then preview.
has_topics = df_sample["topics"].str.len().ne(0)
df_sample = df_sample.loc[has_topics]
df_sample.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [7]:
# Add the stop-word-free text as a new column. assign() returns a fresh
# DataFrame, which avoids writing into a frame derived from another one
# (direct column assignment there raises pandas' SettingWithCopyWarning).
df_sample = df_sample.assign(contents_new=df_sample["contents"].apply(preprocess))
df_sample

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],"U.S. Department Justice , U.S. Environmental P..."
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 - count criminal indictment unsealed today...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],"21st Century Oncology LLC , agreed pay $ 19.75..."
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiarie...
26,18-961,24 Defendants Sentenced in Multimillion Dolla...,Twenty-one members of a massive India-based fr...,2018-07-20T00:00:00-04:00,"[Consumer Protection, Elder Justice]","[Criminal Division, USAO - Texas, Southern]",- members massive India - based fraud money la...
27,12-306,$25 Billion Mortgage Servicing Agreement Filed...,View the court documents. WASHINGTON – The Ju...,2012-03-12T00:00:00-04:00,"[Consumer Protection, StopFraud]",[Office of the Associate Attorney General],View court documents . WASHINGTON – Justice ...
29,17-1182,"30 Members and Associates of The ""Nine Trey Ga...",Federal agents have arrested 17 members and as...,2017-10-23T00:00:00-04:00,[Opioids],"[Criminal Division, USAO - Georgia, Northern]",Federal agents arrested 17 members associates ...
30,15-1560,32 Hospitals to Pay U.S. More Than $28 Million...,Thirty-two hospitals located throughout 15 sta...,2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],Thirty - hospitals located 15 states agreed pa...
32,,34 Individuals Facing Federal and Tribal Charg...,Thirty-four individuals are facing federal and...,2015-12-14T00:00:00-05:00,"[Drug Trafficking, Indian Country Law and Just...",[USAO - New Mexico],Thirty - individuals facing federal tribal dru...


In [8]:
# Original press-release text for the row labelled 4.
df_sample.loc[4, "contents"]

"The U.S. Department of Justice, the U.S. Environmental Protection Agency (EPA), and the Rhode Island Department of Environmental Management (RIDEM) announced today that two subsidiaries of Stanley Black & Decker Inc.—Emhart Industries Inc. and Black & Decker Inc.—have agreed to clean up dioxin contaminated sediment and soil at the Centredale Manor Restoration Project Superfund Site in North Providence and Johnston, Rhode Island.\xa0 “We are pleased to reach a resolution through collaborative work with the responsible parties, EPA, and other stakeholders,” said\xa0Acting Assistant Attorney General Jeffrey H. Wood for the Justice Department's\xa0Environment and Natural Resources Division . “Today’s settlement ends protracted litigation and allows for important work to get underway to restore a healthy environment for citizens living in and around the Centredale Manor Site and the Woonasquatucket River.” “This settlement demonstrates the tremendous progress we are achieving working with 

In [9]:
# Cleaned text for the row labelled 4; preview only the first 1000 characters.
# (The original empty subscript `[]` was a syntax error.)
df_sample.contents_new[4][:1000]

'U.S. Department Justice , U.S. Environmental Protection Agency ( EPA ) , Rhode Island Department Environmental Management ( RIDEM ) announced today subsidiaries Stanley Black & Decker Inc.—Emhart Industries Inc. Black & Decker Inc.—have agreed clean dioxin contaminated sediment soil Centredale Manor Restoration Project Superfund Site North Providence Johnston , Rhode Island . \xa0  “ pleased reach resolution collaborative work responsible parties , EPA , stakeholders , ” said \xa0 Acting Assistant Attorney General Jeffrey H. Wood Justice Department \xa0 Environment Natural Resources Division . “ Today settlement ends protracted litigation allows important work underway restore healthy environment citizens living Centredale Manor Site Woonasquatucket River . ” “ settlement demonstrates tremendous progress achieving working responsible parties , states , federal partners expedite sites entire Superfund remediation process , ” said EPA Acting Administrator Andrew Wheeler . “ Centredale M

**Examples where removing stop words can create a problem**

(1) Sentiment detection: Not always, but in some cases (depending on your dataset), removing stop words can change the sentiment of a sentence.

In [10]:
# Baseline: a positive sentence.
preprocess("this is a good movie")

'good movie'

In [11]:
# The negation is removed as a stop word, so the result is identical to the
# one for the positive sentence.
preprocess("this is not a good movie")

'good movie'

(2) Language translation: Say you want to translate the following sentence from English to Telugu. If you remove stop words before the actual translation, the result will be horrible.

In [12]:
# Everything except the name and the question mark is dropped.
preprocess("how are you doing dhaval?")

'dhaval ?'

(3) Chat bot or any Q&A system


In [13]:
# "don't" is dropped, so the cleaned text no longer says the mat was NOT found.
preprocess("I don't find yoga mat on your website. Can you help?")

'find yoga mat website . help ?'

**Exercises**

In [15]:
# Import spaCy and load the small English pipeline for the exercises.
# NOTE(review): this repeats the setup from the first cells, presumably so the
# exercise section can be run on its own; confirm before removing.

import spacy
nlp = spacy.load("en_core_web_sm")

In [16]:
text = '''
Thor: Love and Thunder is a 2022 American superhero film based on Marvel Comics featuring the character Thor, produced by Marvel Studios and 
distributed by Walt Disney Studios Motion Pictures. It is the sequel to Thor: Ragnarok (2017) and the 29th film in the Marvel Cinematic Universe (MCU).
The film is directed by Taika Waititi, who co-wrote the script with Jennifer Kaytin Robinson, and stars Chris Hemsworth as Thor alongside Christian Bale, Tessa Thompson,
Jaimie Alexander, Waititi, Russell Crowe, and Natalie Portman. In the film, Thor attempts to find inner peace, but must return to action and recruit Valkyrie (Thompson),
Korg (Waititi), and Jane Foster (Portman)—who is now the Mighty Thor—to stop Gorr the God Butcher (Bale) from eliminating all gods.
'''

#step1: run the text through the pipeline and materialize its tokens
doc = nlp(text)
tokens = list(doc)


#step2 + step3: count the stop-word tokens and the total number of tokens
# (every token counts towards the total, including punctuation and whitespace tokens)
stop_words_count = sum(token.is_stop for token in tokens)
total_words_count = len(tokens)


#step4: report how many stop words were found
print(f"Total Stop words presented in the given text: {stop_words_count}")


#step5: report the share of stop words among all tokens, as a percentage
percentage_stop_words = (stop_words_count / total_words_count) * 100
print(f"Percentage of Stop words presented in the given text: {percentage_stop_words} %")

Total Stop words presented in the given text: 40
Percentage of Stop words presented in the given text: 25.0 %


In [18]:
#helper for this exercise: strips the stop words from a text and returns the cleaned string
def preprocess(text):
    """Tokenize `text` with `nlp` and return the non-stop-word token texts joined by spaces."""
    return " ".join(token.text for token in nlp(text) if not token.is_stop)


#Step1: tell spaCy to stop treating 'not' as a stop word
nlp.vocab["not"].is_stop = False


#step2: clean both sentences with preprocess() and keep the results
positive_text, negative_text = [
    preprocess(sentence)
    for sentence in ("He is good doctor", "He is not good surgeon")
]


#step3: show the two cleaned texts
print(f"Text1: {positive_text}")
print(f"Text2: {negative_text}")

Text1: good doctor
Text2: not good surgeon


In [41]:
text = ''' The India men's national cricket team, also known as Team India or the Men in Blue, represents India in men's international cricket.
It is governed by the Board of Control for Cricket in India (BCCI), and is a Full Member of the International Cricket Council (ICC) with Test,
One Day International (ODI) and Twenty20 International (T20I) status. Cricket was introduced to India by British sailors in the 18th century, and the 
first cricket club was established in 1792. India's national cricket team played its first Test match on 25 June 1932 at Lord's, becoming the sixth team to be
granted test cricket status.
'''


#step1: Create the object 'doc' for the given text using nlp()
doc = nlp(text)


#step2: keep only the tokens that are neither stop words, punctuation nor whitespace
def cleanup(doc):
    """Return the texts of the tokens in `doc` that are not stop words, punctuation, or whitespace.

    `doc` is any iterable of token-like objects exposing `text`, `is_stop`,
    `is_punct` and `is_space`. The result is a list of strings in document order.
    """
    kept_texts = []
    for token in doc:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        kept_texts.append(token.text)
    return kept_texts

remaining_tokens = cleanup(doc)

#step3: count how often each remaining token occurs
# cleanup() has already dropped whitespace tokens (newlines, spaces), so no
# extra filtering is needed here. Counting is case-sensitive: 'Cricket' and
# 'cricket' are tallied separately.
frequency_tokens = Counter(remaining_tokens)

#step4: get the maximum frequency word
# most_common(1) returns [(token, count)]; on ties the token seen first wins.
max_freq_word = frequency_tokens.most_common(1)[0][0]

#step5: finally print the result
print(f"Maximum frequency word: {max_freq_word}")

Maximum frequency word: India
