## A Financial NER Using spaCy

##### The client wants to use spaCy to find all company mentions and stock-symbol mentions in a text.

In [169]:
import spacy
import pandas as pd
from spacy import displacy
from spacy.language import Language
import re
from spacy.tokens import Span

In [170]:
# The stock-exchange listing supplies the ticker symbol and the company name of
# every stock traded on that exchange.
df = pd.read_csv("stocks.tsv", sep="\t")
print(df)

# Additional tickers and company names appended to the ones taken from the listing.
extra_symbols = ["TCPC", "STS", "ENCO", "CES", "RPI", "BGC", "PRI"]
extra_companies = [
    "TechCorp Inc.",
    "SmartTech Solutions",
    "EnergyCo Inc.",
    "CleanEnergy Solutions",
    "RetailPro Inc.",
    "BigBank Corp.",
    "Prime Investments Inc.",
]

symbols = df["Symbol"].tolist() + extra_symbols
companies = df["CompanyName"].tolist() + extra_companies
print(symbols[:15])


# for symbol in symbols:
#     if symbol[0]=="F":
#         print(symbol)

     Symbol              CompanyName                          Industry  \
0         A     Agilent Technologies    Life Sciences Tools & Services   
1        AA                    Alcoa                   Metals & Mining   
2       AAC         Ares Acquisition                   Shell Companies   
3      AACG    ATA Creativity Global     Diversified Consumer Services   
4      AADI          Aadi Bioscience                   Pharmaceuticals   
...     ...                      ...                               ...   
5874   ZWRK       Z-Work Acquisition                   Shell Companies   
5875     ZY                 Zymergen                         Chemicals   
5876   ZYME                Zymeworks                     Biotechnology   
5877   ZYNE  Zynerba Pharmaceuticals                   Pharmaceuticals   
5878   ZYXI                    Zynex  Health Care Equipment & Supplies   

     MarketCap  
0       53.65B  
1        9.25B  
2        1.22B  
3       90.35M  
4      104.85M  
...      

In [171]:
# Register every ticker symbol and every company name as an exact-match
# pattern of an EntityRuler on a blank English pipeline.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

symbol_patterns = [{"label": "STOCKSYMBOL", "pattern": ticker} for ticker in symbols]
company_patterns = [{"label": "COMPANY", "pattern": name} for name in companies]

# Symbols first, then companies: the same order in which the patterns were appended before.
patterns = symbol_patterns + company_patterns
ruler.add_patterns(patterns)

In [172]:
# Read the sample article and run it through the ruler-only pipeline.
with open("TaskValidator1.txt", "r") as article_file:
    text = article_file.read()

doc = nlp(text)
print(doc)
for entity in doc.ents:
    print(entity.text, entity.label_)
displacy.render(doc, style="ent")

# Problem: the ticker 'A' is also a common stand-alone English word, so the
# ruler tags ordinary uses of "A" as a stock symbol.
# A custom component that checks the surrounding text with a regular
# expression can prevent this.

Wall Street experienced significant volatility on Thursday as stocks across multiple sectors tumbled, driven by a combination of disappointing earnings reports, fears of rising inflation, and ongoing concerns about global economic growth. The Dow Jones Industrial Average dropped 3.1%, while the S&P 500 and Nasdaq Composite fell 2.8% and 3.4%, respectively, marking their worst daily losses in months.

Tech Stocks Hit Hard by Rising Costs Tech stocks, which had been among the best performers earlier this year, were hit particularly hard. TechCorp Inc. (NASDAQ: TCPC), a leader in cloud computing, saw its shares drop by 5.6%, following a weaker-than-expected earnings report. The company posted revenue of $12.9 billion for the quarter, falling short of analysts’ projections of $13.4 billion, due to increasing costs related to data center expansion and the global chip shortage. CEO Mark Stevens mentioned in the earnings call that the company was adjusting its growth outlook for the next quar

In [173]:
# A pretrained "sm" spaCy model could be used to ignore a symbol when it is
# followed by another noun, but since this notebook uses a blank pipeline a
# faster rule-based check is used instead.
print(symbols[:50])
# In the article the stock symbols are always closed by ')', so a symbol is only
# accepted when it is directly followed by ')'; otherwise it is ignored.
@Language.component("FinancialNER")
def FinancialNER(doc):
    """Add STOCKSYMBOL entities for known symbols that are followed by ')'.

    Reads the module-level ``symbols`` list, searches ``doc.text`` for any of
    those symbols followed by an optional whitespace character and a closing
    parenthesis, and appends each hit to the entities already on ``doc``.

    Args:
        doc: The spaCy ``Doc`` being processed; entities set by earlier pipes
            are kept.

    Returns:
        The same ``doc`` with ``doc.ents`` extended by the symbol matches.
    """
    # Keep only usable strings: re.escape rejects non-strings (e.g. NaN from a
    # missing cell), and an empty alternative would match before every ')'.
    escaped = [re.escape(s) for s in symbols if isinstance(s, str) and s]
    if not escaped:
        return doc

    # A symbol from the list, not part of a larger word, followed by an
    # optional whitespace character and ')', which itself is not followed by a
    # word character.
    pattern = r'(?<!\w)(' + '|'.join(escaped) + r')\s?\)(?!\w)'

    entities = list(doc.ents)
    # Token indices already covered by an entity; doc.ents rejects overlaps.
    taken = {i for ent in entities for i in range(ent.start, ent.end)}

    for match in re.finditer(pattern, doc.text):
        # Use the captured symbol only. Taking the whole match minus one
        # character keeps the optional whitespace, which does not align with a
        # token boundary and makes char_span return None.
        start, end = match.span(1)
        word = doc.char_span(start, end, label="STOCKSYMBOL")
        if word is None:
            # The characters do not line up with token boundaries.
            continue
        token_ids = range(word.start, word.end)
        if any(i in taken for i in token_ids):
            # An earlier pipe already labelled these tokens; keep its entity.
            continue
        taken.update(token_ids)
        entities.append(word)  # must be a Span for doc.ents

    doc.ents = entities
    return doc


['A', 'AA', 'AAC', 'AACG', 'AADI', 'AAIC', 'AAL', 'AAMC', 'AAME', 'AAN', 'AAOI', 'AAON', 'AAP', 'AAPL', 'AAQC', 'AAT', 'AATC', 'AAU', 'AAWW', 'AB', 'ABB', 'ABBV', 'ABC', 'ABCB', 'ABCL', 'ABCM', 'ABEO', 'ABEV', 'ABG', 'ABGI', 'ABIO', 'ABM', 'ABMD', 'ABNB', 'ABOS', 'ABR', 'ABSI', 'ABST', 'ABT', 'ABTX', 'ABUS', 'ABVC', 'AC', 'ACA', 'ACAD', 'ACAH', 'ACB', 'ACBA', 'ACBI', 'ACC']


In [174]:
# Build a second pipeline: the EntityRuler matches company names only, and the
# custom FinancialNER component is responsible for the stock symbols.
nlp_Optimized = spacy.blank("en")

# Get-or-add, so that `ruler_optimized` is always bound to this pipeline's
# ruler instead of being left undefined (or stale) when the pipe already exists.
if nlp_Optimized.has_pipe("entity_ruler"):
    ruler_optimized = nlp_Optimized.get_pipe("entity_ruler")
else:
    ruler_optimized = nlp_Optimized.add_pipe("entity_ruler")

# Company names are matched exactly as before.
patterns = [{'label': 'COMPANY', 'pattern': company} for company in companies]
ruler_optimized.add_patterns(patterns)

# Stock symbols: add the custom component after the ruler (only once).
if not nlp_Optimized.has_pipe("FinancialNER"):
    nlp_Optimized.add_pipe("FinancialNER")

doc_optimized = nlp_Optimized(text)
for ent in doc_optimized.ents:
    print(ent.text, ent.label_)
displacy.render(doc_optimized, style="ent")

nlp_Optimized.analyze_pipes()

Nasdaq COMPANY
TechCorp Inc. COMPANY
TCPC STOCKSYMBOL
Morgan Stanley COMPANY
SmartTech Solutions COMPANY
STS STOCKSYMBOL
EnergyCo Inc. COMPANY
ENCO STOCKSYMBOL
Goldman Sachs COMPANY
CleanEnergy Solutions COMPANY
CES STOCKSYMBOL
RetailPro Inc. COMPANY
RPI STOCKSYMBOL
BigBank Corp. COMPANY
BGC STOCKSYMBOL
Prime Investments Inc. COMPANY
PRI STOCKSYMBOL


{'summary': {'entity_ruler': {'assigns': ['doc.ents',
    'token.ent_type',
    'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'FinancialNER': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'entity_ruler': [], 'FinancialNER': []},
 'attrs': {'token.ent_type': {'assigns': ['entity_ruler'], 'requires': []},
  'doc.ents': {'assigns': ['entity_ruler'], 'requires': []},
  'token.ent_iob': {'assigns': ['entity_ruler'], 'requires': []}}}