In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# lines=True reads combined.json as JSON Lines: one JSON record per line of the file.
df=pd.read_json('combined.json',lines=True)

In [3]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [4]:
df.shape

(13087, 6)

In [5]:
df['topics']

0                                                       []
1                                                       []
2                                                       []
3                                                       []
4                                            [Environment]
                               ...                        
13082                                        [Environment]
13083                                                   []
13084                                 [Foreign Corruption]
13085    [Asset Forfeiture, Counterintelligence and Exp...
13086             [Counterintelligence and Export Control]
Name: topics, Length: 13087, dtype: object

In [6]:
# Some rows carry an empty topics list; keep only the rows whose topics length is non-zero.
df = df.loc[df['topics'].str.len().ne(0)]
df['topics']

4                                            [Environment]
7                                    [Consumer Protection]
19                                           [Environment]
22                   [False Claims Act, Health Care Fraud]
23                   [Health Care Fraud, False Claims Act]
                               ...                        
13081                                        [Hate Crimes]
13082                                        [Environment]
13084                                 [Foreign Corruption]
13085    [Asset Forfeiture, Counterintelligence and Exp...
13086             [Counterintelligence and Export Control]
Name: topics, Length: 4688, dtype: object

In [7]:
df['components']

4             [Environment and Natural Resources Division]
7                                         [Civil Division]
19       [Environment and Natural Resources Division, U...
22                                        [Civil Division]
23                [Civil Division, USAO - Florida, Middle]
                               ...                        
13081    [Civil Rights Division, Civil Rights - Crimina...
13082         [Environment and Natural Resources Division]
13084    [Criminal Division, Criminal - Criminal Fraud ...
13085    [National Security Division (NSD), USAO - Texa...
13086    [National Security Division (NSD), USAO - Texa...
Name: components, Length: 4688, dtype: object

In [8]:
# id, date and components are not needed for the text search, so drop them.
# errors='ignore' keeps the cell re-runnable: without it, a second execution
# raises KeyError because the columns have already been removed.
df=df.drop(columns=['id','date','components'],errors='ignore')

In [9]:
df.head()

Unnamed: 0,title,contents,topics
4,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",[Environment]
7,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,[Consumer Protection]
19,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,[Environment]
22,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...","[False Claims Act, Health Care Fraud]"
23,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,"[Health Care Fraud, False Claims Act]"


In [10]:
df.shape

(4688, 3)

In [11]:
# Flatten each row's list of topics into one space-separated string.
# Values that are already strings are passed through untouched so the cell can be
# re-run safely; joining a string would insert a space between every character.
df['topics']=df['topics'].apply(lambda x: x if isinstance(x, str) else ' '.join(x))
df['topics'].head()

4                            Environment
7                    Consumer Protection
19                           Environment
22    False Claims Act Health Care Fraud
23    Health Care Fraud False Claims Act
Name: topics, dtype: object

In [12]:
# Materialise the three text columns as plain Python lists (same row order as df).
titles, contents, topics = (
    df[column].tolist() for column in ('title', 'contents', 'topics')
)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer configured with the built-in 'english' stop-word list.
v=TfidfVectorizer(stop_words='english')

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Fit the vectorizer on the article bodies and encode each one as a TF-IDF row vector.
# The displayed shape is (number of documents, number of vocabulary terms).
tfidf_matrix=v.fit_transform(contents)
tfidf_matrix.shape

(4688, 37899)

In [16]:
def search(query, top_n=10):
    """Rank the indexed documents against a free-text query.

    The query is encoded with the fitted vectorizer ``v`` and compared to every
    row of ``tfidf_matrix`` using cosine similarity.

    Parameters
    ----------
    query : str
        Free-text search string.
    top_n : int, default 10
        Maximum number of results to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Topic`` and ``Similarity Score``, ordered from the
        highest score to the lowest. Documents with a similarity of zero are
        left out, so fewer than ``top_n`` rows (possibly none) may be returned.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    # Guard explicitly: a slice of [-0:] would select *every* document.
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    # Encode the query in the same TF-IDF space as the documents.
    query_vector = v.transform([query])

    # cosine_similarity returns a (1, n_documents) array; flatten it once.
    scores = cosine_similarity(query_vector, tfidf_matrix).ravel()

    # argsort is ascending, so take the tail and reverse it for best-first order.
    ranked = np.argsort(scores)[-top_n:][::-1]

    # A score of zero means the document shares no weighted term with the query;
    # listing such documents would present arbitrary rows as search hits.
    top_results = [(titles[i], topics[i], scores[i]) for i in ranked if scores[i] > 0]
    return pd.DataFrame(top_results, columns=['Title', 'Topic', 'Similarity Score'])

In [17]:
df['contents'].iloc[0]

"The U.S. Department of Justice, the U.S. Environmental Protection Agency (EPA), and the Rhode Island Department of Environmental Management (RIDEM) announced today that two subsidiaries of Stanley Black & Decker Inc.—Emhart Industries Inc. and Black & Decker Inc.—have agreed to clean up dioxin contaminated sediment and soil at the Centredale Manor Restoration Project Superfund Site in North Providence and Johnston, Rhode Island.\xa0 “We are pleased to reach a resolution through collaborative work with the responsible parties, EPA, and other stakeholders,” said\xa0Acting Assistant Attorney General Jeffrey H. Wood for the Justice Department's\xa0Environment and Natural Resources Division . “Today’s settlement ends protracted litigation and allows for important work to get underway to restore a healthy environment for citizens living in and around the Centredale Manor Site and the Woonasquatucket River.” “This settlement demonstrates the tremendous progress we are achieving working with 

In [18]:
df['title'].iloc[0] #the title of the above case

'$100 Million Settlement Will Speed Cleanup Work at Centredale Manor Superfund Site in North Providence, R.I.'

From the case above, suppose a person knows only that some cleanup work was done in the Woonasquatucket River and wants to find more details about the case.

In [24]:
search('what was the case where cleanup work was done in Woonasquatucket river?')

Unnamed: 0,Title,Topic,Similarity Score
0,$100 Million Settlement Will Speed Cleanup Wor...,Environment,0.375127
1,NCR Corporation Agrees to End Litigation and C...,Environment,0.296898
2,"Justice Department, EPA and the State of New M...",Environment,0.249354
3,Charles River Laboratories International Inc. ...,Health Care Fraud,0.18594
4,"Department Of Justice, EPA Reach Agreement Wit...",Environment,0.148875
5,Department of Justice and EPA Announce $50 Mil...,Environment,0.146511
6,Department of Justice and EPA Announce $29 Mil...,Environment,0.141136
7,Federal and State Trustees Reach $4.5 Million ...,Environment,0.138079
8,Public and Environment to Benefit from Propose...,Environment,0.130727
9,United States Files Suit Against Savannah Rive...,False Claims Act,0.12509
