In [1]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Load spaCy's small English pipeline once; the cells below tokenize text through `nlp`.
nlp = spacy.load("en_core_web_sm")

In [3]:
len(STOP_WORDS)

326

In [4]:
# Tokenize a sample sentence and echo only the tokens flagged as stop words.
doc = nlp('there are many ways to guard a palace without a king')
for token in doc:
    if not token.is_stop:
        continue
    print(token.text)

there
are
many
to
a
without
a


In [5]:
def preprocess(text):
    """Tokenize ``text`` with spaCy and return its non-stop-word tokens.

    Returns a list of token strings in their original order. Only tokens
    flagged ``is_stop`` are dropped, so punctuation tokens are kept.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return kept

In [6]:
preprocess('there are many ways to guard a palace without a king.')

['ways', 'guard', 'palace', 'king', '.']

In [7]:
#exclude stop words and punctuation, returning a single string
def preprocess(text):
    """Return ``text`` with stop words and punctuation tokens removed.

    The text is tokenized with spaCy; every token flagged ``is_stop`` or
    ``is_punct`` is discarded and the remaining token strings are joined with
    single spaces. No other filtering is applied.
    """
    survivors = (
        tok.text
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    )
    return " ".join(survivors)


In [8]:
preprocess('there are many ways to guard a palace without a king.')

'ways guard palace king'

In [9]:
# Load the dataset; lines=True tells pandas the file holds one JSON record per line.
df = pd.read_json("doj_press.json", lines=True)
df.shape

(13087, 6)

In [10]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [11]:
#drop every row whose `topics` list is empty
df = df.loc[df['topics'].str.len().ne(0)]

In [12]:
df['topics'].value_counts()


topics
[Tax]                                                                        706
[Consumer Protection]                                                        335
[Civil Rights]                                                               305
[Antitrust]                                                                  292
[Hate Crimes]                                                                246
                                                                            ... 
[Tax, Health Care Fraud]                                                       1
[Prescription Drugs, Consumer Protection, Health Care Fraud]                   1
[Civil Rights, Firearms Offenses]                                              1
[Antitrust, Financial Fraud, Securities, Commodities, & Investment Fraud]      1
[Hate Crimes, Civil Rights]                                                    1
Name: count, Length: 252, dtype: int64

In [13]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [14]:
df.shape

(4688, 6)

In [15]:
#take the first 150 rows to build a simple application
df = df.head(150)
df.shape

(150, 6)

In [16]:
#remove stop words from contents
df['contents'].head()

4     The U.S. Department of Justice, the U.S. Envir...
7     A 131-count criminal indictment was unsealed t...
19    The United States Attorney’s Office for the Mi...
22    21st Century Oncology LLC, has agreed to pay $...
23    21st Century Oncology Inc. and certain of its ...
Name: contents, dtype: object

In [17]:
df['contents'].iloc[4]

'21st Century Oncology Inc. and certain of its subsidiaries and affiliates have agreed to pay $26 million to the government to resolve a self-disclosure relating to the submission of false attestations regarding the company’s use of electronic health records software and separate allegations that they violated the False Claims Act by submitting, or causing the submission of, claims for certain services provided pursuant to referrals from physicians with whom they had improper financial relationships. \xa0 “The Justice Department is committed to zealously investigating improper financial relationships that have the potential to compromise physicians’ medical judgment,” said Acting Assistant Attorney General Chad A. Readler of the Justice Department’s Civil Division.\xa0 “However, we will work with companies that accept responsibility for their past compliance failures and promptly take corrective action.”  \xa0 21st Century Oncology, which is headquartered in Fort Myers, Florida, owns a

In [18]:
len(df['contents'].iloc[4])

5504

In [19]:
#remove stop words from every entry of the `contents` column
# `assign` returns a new frame instead of writing a column into the slice that
# `head(150)` produced, so pandas does not emit SettingWithCopyWarning. Passing
# `preprocess` directly also drops the redundant lambda wrapper.
df = df.assign(contents=df['contents'].apply(preprocess))

In [20]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,U.S. Department Justice U.S. Environmental Pro...,2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,131 count criminal indictment unsealed today B...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,United States Attorney Office Middle District ...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,21st Century Oncology LLC agreed pay $ 19.75 m...,2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. certain subsidiarie...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [21]:
len(df['contents'].iloc[4])

4217

In [22]:
text = '''
Thor: Love and Thunder is a 2022 American superhero film based on Marvel Comics featuring the character Thor, produced by Marvel Studios and 
distributed by Walt Disney Studios Motion Pictures. It is the sequel to Thor: Ragnarok (2017) and the 29th film in the Marvel Cinematic Universe (MCU).
The film is directed by Taika Waititi, who co-wrote the script with Jennifer Kaytin Robinson, and stars Chris Hemsworth as Thor alongside Christian Bale, Tessa Thompson,
Jaimie Alexander, Waititi, Russell Crowe, and Natalie Portman. In the film, Thor attempts to find inner peace, but must return to action and recruit Valkyrie (Thompson),
Korg (Waititi), and Jane Foster (Portman)—who is now the Mighty Thor—to stop Gorr the God Butcher (Bale) from eliminating all gods.
'''

In [23]:
def remove_stopwords(text):
    """Report how many of the tokens in ``text`` are stop words.

    Despite its name, this function removes nothing. It tokenizes ``text``
    with spaCy, counts the tokens flagged ``is_stop`` against the total number
    of tokens (every token counts, not only alphabetic words), and prints a
    one-line summary.

    Args:
        text: The string to analyse.

    Returns:
        None. The summary is written to stdout.
    """
    doc = nlp(text)
    stop_words = [token.text for token in doc if token.is_stop]
    word_count = len(doc)
    # Guard the division: a text that yields no tokens used to raise
    # ZeroDivisionError here.
    percent_stopwords = (len(stop_words) / word_count) * 100 if word_count else 0.0
    print("There are", len(stop_words), "stop words out of a text with", word_count, "words"
          ". Hence the percentage of stop words is", percent_stopwords, "%")
    

In [24]:
remove_stopwords(text)

There are 40 stop words out of a text with 160 words. Hence the percentage of stop words is 25.0 %


In [31]:
#removing stopwords while still maintaining sentiments
line1 = "This is a good movie"
line2 = 'This is not a good movie'


def transform(text):
    """Strip stop words and punctuation from ``text`` while keeping "not".

    "not" is itself a stop word, but dropping it flips the meaning of a
    sentence, so it is kept explicitly.

    Args:
        text: The string to clean.

    Returns:
        The surviving token strings joined with single spaces.
    """
    doc = nlp(text)
    no_stop = []
    for token in doc:
        # Split on purpose: `and` binds tighter than `or`, so the original
        # one-line condition applied the punctuation test only to "not" and
        # let every non-stop punctuation mark through.
        keep_word = not token.is_stop or token.text == 'not'
        if keep_word and not token.is_punct:
            no_stop.append(token.text)
    return " ".join(no_stop)
            

In [32]:
transform(line1)

'good movie'

In [33]:
transform(line2)

'not good movie'

In [64]:
text = '''The India men's national cricket team, also known as Team India or the Men in Blue, represents India in men's international cricket.
It is governed by the Board of Control for Cricket in India (BCCI), and is a Full Member of the International Cricket Council (ICC) with Test,
One Day International (ODI) and Twenty20 International (T20I) status. Cricket was introduced to India by British sailors in the 18th century, and the 
first cricket club was established in 1792. India's national cricket team played its first Test match on 25 June 1932 at Lord's, becoming the sixth team to be
granted test cricket status.
'''

In [81]:
#getting the most frequently occurring non-stop word from a text
from collections import Counter
word_frequency = Counter()
def process(text):
    """Return the most frequent token that is neither a stop word nor punctuation.

    Args:
        text: The string to analyse.

    Returns:
        The token text with the highest count, or ``None`` when no token
        survives the filtering (for example an empty string). Ties go to the
        token encountered first, which is how ``Counter.most_common`` orders
        equal counts.
    """
    doc = nlp(text)
    no_stop = [token.text for token in doc if not token.is_stop and not token.is_punct]
    # Nothing left to rank. Previously this path raised UnboundLocalError,
    # because the counter was only ever assigned inside the loop body.
    if not no_stop:
        return None
    # Count once after filtering instead of rebuilding the Counter for every
    # kept token. The local name avoids shadowing the module-level
    # `word_frequency`.
    counts = Counter(no_stop)
    return counts.most_common(1)[0][0]
       

In [82]:
process(text)

'India'