In [1]:
import spacy

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS

In [3]:
# Load the 'en_core_web_sm' spaCy pipeline once; later cells call `nlp(...)`
# and read the `is_stop` / `is_punct` flags on the tokens it produces.
nlp = spacy.load('en_core_web_sm')

In [4]:
text = ' Stop words are words that are commonly used in a language but are typically filtered out or ignored when processing natural language text. '

In [5]:
doc = nlp(text)

In [6]:
# Walk the parsed document and print only the tokens flagged as stop words,
# one per line.
for token in doc:
    if not token.is_stop:
        continue
    print(token.text)

are
that
are
used
in
a
but
are
out
or
when


In [7]:
def preprocessing(text):
    """Return the content-word tokens of ``text`` as a list of strings.

    The text is run through the notebook-level ``nlp`` pipeline; tokens that
    are flagged as stop words or as punctuation are skipped, and the text of
    every remaining token is collected in document order.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(tok.text)
    return kept
    

In [8]:
preprocessing('These words are often common and do not carry much meaningful information about the content of the text.')

['words', 'common', 'carry', 'meaningful', 'information', 'content', 'text']

In [9]:
def preprocess(text):
    """Return ``text`` with its stop-word tokens removed, as one string.

    The text is run through the notebook-level ``nlp`` pipeline and every
    token not flagged as a stop word is kept. Punctuation tokens are NOT
    filtered here, and the kept tokens are re-joined with single spaces, so
    the original spacing is not preserved.
    """
    kept_tokens = (tok.text for tok in nlp(text) if not tok.is_stop)
    return ' '.join(kept_tokens)

In [10]:
preprocess('These words are often common and do not carry much meaningful information about the content of the text.')

'words common carry meaningful information content text .'

In [11]:
import pandas as pd

In [12]:
# Read the press-release dataset; lines=True means the file is parsed as
# one JSON object per line rather than a single JSON document.
df = pd.read_json('doj_press.json',lines=True)

In [13]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [14]:
df.shape


(13087, 6)

In [15]:
# Keep only the rows whose `topics` entry has non-zero length (i.e. drop the
# rows shown above with an empty `[]` topics list). Note: this rebinds `df`,
# so the unfiltered frame is no longer available after this cell.
df = df[df['topics'].str.len() != 0]

In [16]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [17]:
df.shape


(4688, 6)

In [18]:
# Keep only the first 100 rows of the filtered frame for the steps that
# follow. This rebinds `df` again; re-run from the load cell to get the
# full data back.
df = df.head(100)

In [20]:
df.shape

(100, 6)

In [29]:
# Add the stop-word-free version of each press release as `contents_new`.
# `df` at this point is a filtered and truncated slice of the loaded frame,
# so writing a column in place (df['contents_new'] = ...) is the chained
# assignment pattern that raises pandas' SettingWithCopyWarning. `assign`
# returns a new DataFrame with the extra column instead, which avoids the
# warning and keeps the cell safe to re-run.
df = df.assign(contents_new=df.contents.apply(preprocess))

In [30]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components,conents_new,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],"U.S. Department Justice , U.S. Environmental P...","U.S. Department Justice , U.S. Environmental P..."
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 - count criminal indictment unsealed today...,131 - count criminal indictment unsealed today...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...,United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],"21st Century Oncology LLC , agreed pay $ 19.75...","21st Century Oncology LLC , agreed pay $ 19.75..."
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiarie...,21st Century Oncology Inc. certain subsidiarie...


In [31]:
df['contents'].iloc[4][:200]

'21st Century Oncology Inc. and certain of its subsidiaries and affiliates have agreed to pay $26 million to the government to resolve a self-disclosure relating to the submission of false attestations'

In [32]:
df['contents_new'].iloc[4][:200]

'21st Century Oncology Inc. certain subsidiaries affiliates agreed pay $ 26 million government resolve self - disclosure relating submission false attestations company use electronic health records sof'

### Do not use stop word removal in certain NLP tasks

#### 1. Sentiment detection: Not always, but in some cases (depending on your dataset), removing stop words can change the sentiment of a sentence

In [33]:
preprocess('This is a good place')

'good place'

In [34]:
preprocess('this is not a good place ')

'good place'

#### 2. Language translation: Say you want to translate the following sentence from English to Telugu. If you remove stop words before translating, the result will be horrible

In [35]:
preprocess("how are you doing Hazel?")


'Hazel ?'

#### 3. Chatbot or any Q&A system: Removing stop words can strip out the words that carry the user's actual request

In [37]:
preprocess('I did not recieve the product. Could you please help me? ')

'recieve product . help ?'