# Named Entity Recognition with spacy library

In [9]:
import spacy

In [10]:
txt = "Mark Zuckerberg is one of the founders of Facebook, a company from the United States"

In [11]:
nlp = spacy.load('en_core_web_sm')

In [12]:
doc = nlp(txt)

In [13]:
spacy.displacy.render(doc, style='ent')

## Entity exploration

Entity explanation

In [13]:
print(spacy.explain('PERSON'), spacy.explain('CARDINAL'), spacy.explain('GPE'), sep="\n")

People, including fictional
Numerals that do not fall under another type
Countries, cities, states


Entity extraction

In [18]:
print(doc.ents)
print(doc.ents[0].label_)

(Mark Zuckerberg, one, the United States)
PERSON


In [20]:
for entity in doc.ents:
    print(f"{entity.label_} : {entity.text}")

PERSON : Mark Zuckerberg
CARDINAL : one
GPE : the United States


Example : Extract organisation

In [25]:
new_txt = "Apple reached an all-time high stock price of 143 dollars this January." 
new_doc = nlp(new_txt)

org_list =  []
for entity in new_doc.ents:
    if (entity.label_ == "ORG"):
        org_list.append(entity.text)
print(org_list)

['Apple']


## Using Redddit

## Reddit API

### Authentification

In [50]:
client_id = '2plnnHDx2GYoubbtpIRzbA'
secret = 'ijC-vOjKy9HhfhSYf9wZfIKOZ1M57g'

In [51]:
import requests

In [75]:
auth = requests.auth.HTTPBasicAuth(client_id, secret)

In [76]:
data = {'grant_type' : 'password', 
        'username' : 'Automatic-Act-3799',
         'password' : pwd}

In [94]:
headers = {'User_Agent': 'UdemyTutorial/0.0.1'}

In [95]:
res = requests.post('https://www.reddit.com/api/v1/access_token', 
                   auth=auth, data = data, headers=headers)

Authentification problem !!
To be continued ...

In [96]:
res

<Response [429]>

In [97]:
token = res.json()['access_token']

KeyError: 'access_token'

In [63]:
headers['Authorization'] = f'bearer {token}'

In [40]:
headers

{'User_Agent': 'UdemyTutorial/0.0.1',
 'Authorization': 'bearer 1074800735230-y3Ed6PqT68ZgeULwDIHVXhiP3-e2LA'}

In [44]:
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [429]>

### Pulling data

In [14]:
import pandas as pd

In [45]:
api = 'https://oauth.reddit.com'

In [47]:
res = requests.get (f' (api}/r/investing/new', headers=headers, params={'limit' : '100'})

In [49]:
res.json()['data']['children'][0]['data']

<Response [429]>

In [72]:
df = pd.DataFrame ({
    'name' : [],
    'created_utc': [],
    'subreddit': [],
    'title': [],
    'selftext': [],
    'upvote_ratio' : [],
    'ups' : [],
    'downs': [],
    'score': [],
})

In [None]:
for post in res.json()['data']['children']:
    df = df.append({
    'name' : post['data']['name'],
    'created_utc': post['data']['created_utc'],
    'subreddit': post['data']['subreddit'],
    'title': post['data']['title'],
    'selftext': post['data']['selftext'],
    'upvote_ratio' : post['data']['upvote_ratio'],
    'ups' : post['data']['ups'],
    'downs': post['data']['downs'],
    'score': post['data']['score'],
})

In [None]:
while True:
    res = requests.get (f' (api}/r/investing/new', headers=headers, 
                        params={'limit' : '100', 'after' : df['name'].iloc[len(df)-1]})
    for post in res.json()['data']['children']:
        df = df.append({
        'name' : post['data']['name'],
        'created_utc': post['data']['created_utc'],
        'subreddit': post['data']['subreddit'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'upvote_ratio' : post['data']['upvote_ratio'],
        'ups' : post['data']['ups'],
        'downs': post['data']['downs'],
        'score': post['data']['score'],
        }; ignore_index=True)

In [None]:
df = df.replace({'|': ''}; regex=True)

In [None]:
df.to_csv('reddit_investing.csv', sep='|', index=False)

## NER tagging

In [17]:
from collections import Counter

In [18]:
def get_orgs(text):
    doc = nlp(text)
    org_list =  []
    for entity in doc.ents:
        if (entity.label_ == "ORG"):
            org_list.append(entity.text)
    org_list = list(set(org_list))
    return org_list

### With Reddit API

In [19]:
df = pd.read_csv('reddit_investing.csv', sep='|')

In [20]:
df['organizations'] = df['selftext'].apply(get_orgs)
df.head()

Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio,organizations
0,1614290000.0,0.0,t3_lshtjn,10.0,Bloomberg article: [https://www.bloomberg.com/...,investing,Fed Views Rising Yields as Bullish Sign Reflec...,10.0,0.86,"[Federal Reserve, Treasury, Bostic, St. Louis ..."
1,1614286000.0,0.0,t3_lsgahw,56.0,Given the recent downturn in stocks especially...,investing,ARK ETFs implosion risk ------------------------,56.0,0.83,[The Bear Cave](https://thebearcave.substack.c...
2,1614283000.0,0.0,t3_lsf8td,1.0,[https://twitter.com/desogames/status/13649710...,investing,The Counter-Party Risk Bubble,1.0,0.53,"[OWN, ITM]"
3,1614282000.0,0.0,t3_lsf3nh,6.0,"When you think of futures, what comes to your ...",investing,Futures were made for days like these,6.0,0.62,[]
4,1614278000.0,0.0,t3_lsdcib,3.0,I've been on this sub for quite some time and ...,investing,Let's talk about liquidity premiums,3.0,0.67,[]


In [21]:
orgs = [org for sublist in df['organizations'].to_list() for org in sublist]

### With another dataset : https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset?select=Reddit_Data.csv

In [104]:
df = pd.read_csv('Reddit_Data.csv', sep=',')

In [105]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [134]:
df = df.dropna()

In [135]:
df

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37244,jesus,0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [133]:
df['clean_comment'][414]

' htm happy reading '

In [136]:
df['organizations'] = df['clean_comment'].apply(get_orgs)
df.head()

Unnamed: 0,clean_comment,category,organizations
0,family mormon have never tried explain them t...,1,[]
1,buddhism has very much lot compatible with chr...,1,[]
2,seriously don say thing first all they won get...,-1,[]
3,what you have learned yours and only yours wha...,0,[]
4,for your own benefit you may want read living ...,1,[]


In [137]:
orgs = [org for sublist in df['organizations'].to_list() for org in sublist]

### Common analysis

In [22]:
org_freq = Counter(orgs)
org_freq.most_common(10)

[('ETF', 37),
 ('Tesla', 34),
 ('Amazon', 28),
 ('COVID', 27),
 ('EPS', 26),
 ('SEC', 24),
 ('ARK', 23),
 ('NYSE', 23),
 ('EBITDA', 22),
 ('GME', 21)]

In [23]:
df.to_csv('reddit_investing_ner.csv', sep='|', index=False)

### NER with Sentiment

In [1]:
import flair

In [47]:
model = flair.models.TextClassifier.load('en-sentiment')

2022-06-11 16:32:42,624 loading file C:\Users\thoma\.flair\models\sentiment-en-mix-distillbert_4.pt


In [68]:
def get_sentiment(text):
    # tokenize input text
    sentence = flair.data.Sentence(text)
    # make sentiment prediction
    model.predict(sentence)
    sentiment = list(sentence.labels[0].to_dict().values())
    return sentiment

### Sentiment prediction with flair from text

In [69]:
df['sentiment'] = df['selftext'].apply(get_sentiment)
df.head()

Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio,organizations,sentiment
0,1614290000.0,0.0,t3_lshtjn,10.0,Bloomberg article: [https://www.bloomberg.com/...,investing,Fed Views Rising Yields as Bullish Sign Reflec...,10.0,0.86,"[Federal Reserve, Treasury, Bostic, St. Louis ...","[NEGATIVE, 0.9916453957557678]"
1,1614286000.0,0.0,t3_lsgahw,56.0,Given the recent downturn in stocks especially...,investing,ARK ETFs implosion risk ------------------------,56.0,0.83,[The Bear Cave](https://thebearcave.substack.c...,"[NEGATIVE, 0.9974728226661682]"
2,1614283000.0,0.0,t3_lsf8td,1.0,[https://twitter.com/desogames/status/13649710...,investing,The Counter-Party Risk Bubble,1.0,0.53,"[OWN, ITM]","[NEGATIVE, 0.9996134638786316]"
3,1614282000.0,0.0,t3_lsf3nh,6.0,"When you think of futures, what comes to your ...",investing,Futures were made for days like these,6.0,0.62,[],"[NEGATIVE, 0.9999294281005859]"
4,1614278000.0,0.0,t3_lsdcib,3.0,I've been on this sub for quite some time and ...,investing,Let's talk about liquidity premiums,3.0,0.67,[],"[NEGATIVE, 0.9893319606781006]"


In [70]:
import ast

In [91]:
sentiment = {}

# loop through dataframe and extract org labels and sentiment scores into sentiment dictionary
for i, row in df.iterrows():
    # extract sentiment direction and score
    direction = row['sentiment'][0]
    score = row['sentiment'][1]
    # loop through each label in organizations column
    for org in row['organizations']:
        # check if org label exists in sentiment dictionary already
        if org not in sentiment.keys():
            # if it doesn't, initialize new entry in dictionary
            sentiment[org] = {'POSITIVE': [], 'NEGATIVE': []}
        # append positive/negative score to respective dictionary entry
        sentiment[org][direction].append(score)

### Calculating scores for user sentiment mean(positive sentiment) - mean(negative sentiment)

In [92]:
avg_sentiment = []

# loop through each organization
for org in sentiment.keys():
    # get number of positive and negative ratings
    freq = len(sentiment[org]['POSITIVE']) + len(sentiment[org]['NEGATIVE'])
    for direction in ['POSITIVE', 'NEGATIVE']:
        # assign to variable for cleaner code
        score = sentiment[org][direction]
        # if there are no entries, set to 0
        if len(score) == 0:
            sentiment[org][direction] = 0.0
        else:
            # otherwise calculate total
            sentiment[org][direction] = sum(score)
    # now calculate total amount
    total = sentiment[org]['POSITIVE'] - sentiment[org]['NEGATIVE']
    # and the average score
    avg = total/freq
    # add to sentiment list
    avg_sentiment.append({
        'entity': org,
        'positive': sentiment[org]['POSITIVE'],
        'negative': sentiment[org]['NEGATIVE'],
        'frequency': freq,
        'score': avg
    })

In [93]:
sentiment_df = pd.DataFrame(avg_sentiment)
sentiment_df.head()

Unnamed: 0,entity,positive,negative,frequency,score
0,Federal Reserve,0.981379,1.935663,3,-0.318095
1,Treasury,1.596512,3.990731,6,-0.399036
2,Bostic,0.0,0.991645,1,-0.991645
3,St. Louis Fed,0.0,0.991645,1,-0.991645
4,Powell,0.0,0.991645,1,-0.991645


In [94]:
sentiment_df = sentiment_df[sentiment_df['frequency'] > 3]
sentiment_df

Unnamed: 0,entity,positive,negative,frequency,score
1,Treasury,1.596512,3.990731,6,-0.399036
5,Fed,2.326643,9.933153,13,-0.585116
8,ARK,8.405831,13.094553,23,-0.203857
11,COVID,2.972623,23.019012,27,-0.742459
14,eBay,1.879811,2.979899,5,-0.220018
...,...,...,...,...,...
1521,Tilray,0.944136,4.664325,6,-0.620032
1559,PLTR,1.624317,1.911521,4,-0.071801
1627,LMND,0.000000,4.746248,5,-0.949250
2087,SaaS,0.989073,2.852114,4,-0.465760


### Sort entities from goood to bad images

In [95]:
sentiment_df.sort_values('score', ascending=False).head(10)

Unnamed: 0,entity,positive,negative,frequency,score
1387,IBM,2.965104,0.88397,4,0.520283
269,ROIC,2.962342,0.996891,4,0.491363
363,TAM,6.161799,1.880311,9,0.475721
403,Google,3.283157,0.972579,5,0.462116
115,marijuana,2.812573,0.991353,4,0.455305
2225,Sony,4.888051,1.970413,7,0.416805
311,Tencent,2.45179,0.99819,4,0.3634
530,Intel,3.268426,1.95394,6,0.219081
372,YouTube,2.936317,2.180854,6,0.125911
469,Company,7.915094,6.876054,16,0.06494


### NER with Transformers

In [4]:
import spacy

In [5]:
txt = "Apple reached an all-time high stock price of 143 dollars this January." 

In [9]:
trf = spacy.load('en_core_web_trf')
lg = spacy.load('en_core_web_lg')

In [10]:
doc = trf(txt)
spacy.displacy.render(doc, style='ent')



In [11]:
doc = lg(txt)
spacy.displacy.render(doc, style='ent')