In [8]:
import json
import requests
import pandas as pd
from tqdm import tqdm
from collections import Counter
import re
import Levenshtein
from cleanco import cleanco

In [2]:
import spacy
from spacy import displacy
# Load the spaCy 'en_core_web_lg' pipeline once; later cells call `nlp(text)`
# and read the resulting `.ents` to pick out ORG entities.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Read 'combined.json' as line-delimited JSON (lines=True) into a DataFrame.
# Later cells read its `contents` and `title` columns.
doj_data = pd.read_json('combined.json', lines=True)

In [4]:
# Download the symbol-map CSV as raw bytes; a later cell decodes and parses it.
# The timeout keeps this cell from hanging forever on an unresponsive server,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as ticker data further down.
ticker_response = requests.get(
    'https://quantquote.com/docs/symbol_map_comnam.csv',
    timeout=30,
)
ticker_response.raise_for_status()
stock_ticker_data = ticker_response.content

In [5]:
# Turn the downloaded bytes into a DataFrame by hand:
#   1. decode as UTF-8 and split on CRLF, skipping the first three lines;
#   2. split each remaining line on commas (no CSV quoting is interpreted);
#   3. use the first remaining line as the column headers;
#   4. drop rows that have no value under 'COMPANY NAME'.
ticker_lines = stock_ticker_data.decode('utf-8').split('\r\n')
ticker_fields = [line.split(',') for line in ticker_lines[3:]]
stock_ticker_df = pd.DataFrame(ticker_fields)
stock_ticker_df.columns = stock_ticker_df.iloc[0]
stock_ticker_df = stock_ticker_df.iloc[1:].dropna(subset=['COMPANY NAME'])

## Tagging Organizations with spaCy

In [7]:
# Run spaCy NER over every document body and keep the distinct ORG entity
# strings found in each one (one set per element of doj_data.contents).
# Uses `tqdm`, the progress-bar helper this notebook actually imports;
# `tqdm_notebook` was never imported, so this cell raised NameError on a
# fresh kernel.
parsed_doj_contents = [
    {ent.text for ent in nlp(body).ents if ent.label_ == 'ORG'}
    for body in tqdm(doj_data.contents.values)
]

HBox(children=(IntProgress(value=0, max=13087), HTML(value='')))




In [9]:
# Same ORG extraction as for the document bodies, applied to each title:
# one set of distinct ORG entity strings per element of doj_data.title.
parsed_doj_titles = [
    {entity.text for entity in nlp(title).ents if entity.label_ == 'ORG'}
    for title in tqdm(doj_data.title.values)
]

100%|██████████| 13087/13087 [02:18<00:00, 94.66it/s] 


In [10]:
# Attach the per-document ORG sets extracted from `contents` as a new column.
doj_data['organizations'] = parsed_doj_contents

In [11]:
# Attach the per-document ORG sets extracted from `title` as a new column.
doj_data['organizations_titles'] = parsed_doj_titles

In [12]:
# Convert each row's two ORG sets to lists and concatenate them row by row
# (body organisations first, then title organisations). Names that occur in
# both appear twice in the combined list.
content_org_lists = doj_data['organizations'].apply(list)
title_org_lists = doj_data['organizations_titles'].apply(list)
doj_data['all_orgs'] = content_org_lists + title_org_lists

In [13]:
# Flatten every row's organisation list into one lower-cased list
# (duplicates are kept).
all_orgs = []
for org_list in doj_data.all_orgs:
    for org_name in org_list:
        all_orgs.append(org_name.lower())

In [14]:
# Lower-cased copy of every value in the 'COMPANY NAME' column, in table order.
all_companies = list(map(str.lower, stock_ticker_df['COMPANY NAME']))

In [15]:
# doj_data.to_json('doj_data_with_orgs.json')

## Simpler Tagging :(

In [16]:
def process_name(nm):
    """Normalise a company/organisation name into a comparison key.

    The name is passed through ``cleanco(nm).clean_name()``, ASCII punctuation
    is removed, runs of whitespace are collapsed to single spaces, and the
    result is lower-cased.

    Parameters
    ----------
    nm : str
        Raw name as found in the text or in the ticker table.

    Returns
    -------
    str
        The normalised name (may be empty if nothing but punctuation remains).
    """
    name = cleanco(nm).clean_name()
    # Python's `re` has no POSIX bracket classes: the previous pattern
    # "[[:punct:]]+" was parsed as the set "[[:punct:]" followed by "]+", so
    # punctuation was never stripped (and `re` warned about a nested set).
    # The ranges below spell out ASCII punctuation: !-/  :-@  [-`  {-~
    name = re.sub(r"[!-/:-@\[-`{-~]+", "", name)
    # Dropping a free-standing symbol such as "&" leaves doubled spaces behind;
    # collapse them so "a & b" and "a b" produce the same key.
    name = " ".join(name.split())
    return name.lower()

In [17]:
# Distinct normalised names over every organisation tagged in any document.
clean_org_set_v2 = {
    process_name(org_name)
    for org_list in tqdm(doj_data.all_orgs)
    for org_name in org_list
}

100%|██████████| 13087/13087 [00:41<00:00, 318.97it/s]


In [18]:
# Distinct normalised names over the ticker table's 'COMPANY NAME' column.
clean_co_set_v2 = {
    process_name(company_name)
    for company_name in tqdm(stock_ticker_df['COMPANY NAME'])
}

100%|██████████| 21190/21190 [00:02<00:00, 7236.37it/s]


In [19]:
# Build two lookups from the ticker table:
#   normalised company name -> symbol, and symbol -> original company name.
# Rows whose 'QUANTQUOTE PERMTICK' value contains a digit are left out.
# When several rows share a key, the last one seen wins.
clean_co_to_symbol_dict = {}
symbol_to_full_nm_dict = {}
permtick_has_no_digit = ~stock_ticker_df['QUANTQUOTE PERMTICK'].str.contains(r'\d')
# Columns are unpacked by position: the first is used as the symbol and the
# third as the company name; the middle one is ignored.
for symbol, _, name in stock_ticker_df[permtick_has_no_digit].itertuples(index=False):
    if not name.strip():
        continue  # blank company name: nothing to key on
    clean_co_to_symbol_dict[process_name(name)] = symbol
    symbol_to_full_nm_dict[symbol] = name

In [20]:
# Normalise every organisation name of every row with process_name,
# keeping the order (and any duplicates) of `all_orgs`.
doj_data['clean_orgs'] = doj_data.all_orgs.apply(
    lambda org_list: list(map(process_name, org_list))
)

In [21]:
def _symbols_for(clean_names):
    """Return the symbol of each normalised name that has an exact entry in
    clean_co_to_symbol_dict, in input order; names without an entry are skipped."""
    matched = []
    for clean_name in clean_names:
        if clean_name in clean_co_to_symbol_dict:
            matched.append(clean_co_to_symbol_dict[clean_name])
    return matched


doj_data['tagged_symbols'] = doj_data.clean_orgs.apply(_symbols_for)

In [129]:
# Keep only the rows with at least one tagged symbol; .copy() gives an
# independent frame so the columns added below do not touch doj_data.
has_tagged_symbol = doj_data.tagged_symbols.apply(lambda symbols: len(symbols) > 0)
doj_data_final = doj_data[has_tagged_symbol].copy()

In [130]:
def _full_names_for(symbols):
    """Map each tagged symbol back to the original company name recorded in
    symbol_to_full_nm_dict (raises KeyError for a symbol with no entry)."""
    return [symbol_to_full_nm_dict[symbol] for symbol in symbols]


doj_data_final['tagged_companies'] = doj_data_final['tagged_symbols'].apply(_full_names_for)

In [131]:
# doj_data_final.to_json('doj_data_with_tags.json')

## Industry Tagging

In [132]:
# Load the two exchange listing tables from local CSV files.
# Later cells read their `Symbol`, `Sector` and `Industry` columns.
nyse = pd.read_csv('nyse_company_list.csv')
nasdaq = pd.read_csv('nasdaq_company_list.csv')


In [133]:
# Lower-cased ticker symbols listed on each exchange, as sets.
nyse_symbol_set = {symbol.lower() for symbol in nyse.Symbol.values}
nasdaq_symbol_set = {symbol.lower() for symbol in nasdaq.Symbol.values}

In [134]:
# Lower-cased symbol -> Sector lookup per exchange (a repeated symbol keeps
# the last row's value).
nyse_symbol_sector_dict = dict(
    zip((symbol.lower() for symbol in nyse.Symbol), nyse.Sector)
)
nasdaq_symbol_sector_dict = dict(
    zip((symbol.lower() for symbol in nasdaq.Symbol), nasdaq.Sector)
)

In [135]:
# Lower-cased symbol -> Industry lookup per exchange (a repeated symbol keeps
# the last row's value).
nyse_symbol_industry_dict = dict(
    zip((symbol.lower() for symbol in nyse.Symbol), nyse.Industry)
)
nasdaq_symbol_industry_dict = dict(
    zip((symbol.lower() for symbol in nasdaq.Symbol), nasdaq.Industry)
)

In [136]:
def _sectors_for(symbols):
    """Look up the sector of each tagged symbol, preferring the NYSE table and
    falling back to NASDAQ; symbols found in neither table are skipped."""
    sectors = []
    for symbol in symbols:
        if symbol in nyse_symbol_sector_dict:
            sectors.append(nyse_symbol_sector_dict[symbol])
        elif symbol in nasdaq_symbol_sector_dict:
            sectors.append(nasdaq_symbol_sector_dict[symbol])
    return sectors


doj_data_final['sectors'] = doj_data_final.tagged_symbols.apply(_sectors_for)

In [137]:
def _industries_for(symbols):
    """Look up the industry of each tagged symbol, preferring the NYSE table
    and falling back to NASDAQ; symbols found in neither table are skipped."""
    industries = []
    for symbol in symbols:
        if symbol in nyse_symbol_industry_dict:
            industries.append(nyse_symbol_industry_dict[symbol])
        elif symbol in nasdaq_symbol_industry_dict:
            industries.append(nasdaq_symbol_industry_dict[symbol])
    return industries


doj_data_final['industries'] = doj_data_final.tagged_symbols.apply(_industries_for)

In [138]:
# Persist the tagged rows, including the sector/industry columns, to JSON.
# NOTE(review): this uses to_json's defaults, not lines=True, so the output
# layout differs from the line-delimited input file read at the top — confirm
# that downstream readers expect this.
doj_data_final.to_json('doj_data_with_tags_and_industries.json')