# Named Entity Recognition with spacy library

In [1]:
import spacy

In [2]:
txt = "Mark Zuckerberg is one of the founders of Facebook, a company from the United States"

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
doc = nlp(txt)

In [5]:
spacy.displacy.render(doc, style='ent')

## Entity exploration

Entity explanation

In [13]:
print(spacy.explain('PERSON'), spacy.explain('CARDINAL'), spacy.explain('GPE'), sep="\n")

People, including fictional
Numerals that do not fall under another type
Countries, cities, states


Entity extraction

In [18]:
print(doc.ents)
print(doc.ents[0].label_)

(Mark Zuckerberg, one, the United States)
PERSON


In [20]:
for entity in doc.ents:
    print(f"{entity.label_} : {entity.text}")

PERSON : Mark Zuckerberg
CARDINAL : one
GPE : the United States


Example : Extract organisation

In [25]:
new_txt = "Apple reached an all-time high stock price of 143 dollars this January." 
new_doc = nlp(new_txt)

org_list =  []
for entity in new_doc.ents:
    if (entity.label_ == "ORG"):
        org_list.append(entity.text)
print(org_list)

['Apple']


## Using Redddit

## Reddit API

### Authentification

In [50]:
client_id = '2plnnHDx2GYoubbtpIRzbA'
secret = 'ijC-vOjKy9HhfhSYf9wZfIKOZ1M57g'

In [51]:
import requests

In [75]:
auth = requests.auth.HTTPBasicAuth(client_id, secret)

In [76]:
data = {'grant_type' : 'password', 
        'username' : 'Automatic-Act-3799',
         'password' : pwd}

In [94]:
headers = {'User_Agent': 'UdemyTutorial/0.0.1'}

In [95]:
res = requests.post('https://www.reddit.com/api/v1/access_token', 
                   auth=auth, data = data, headers=headers)

Authentification problem !!
To be continued ...

In [96]:
res

<Response [429]>

In [97]:
token = res.json()['access_token']

KeyError: 'access_token'

In [63]:
headers['Authorization'] = f'bearer {token}'

In [40]:
headers

{'User_Agent': 'UdemyTutorial/0.0.1',
 'Authorization': 'bearer 1074800735230-y3Ed6PqT68ZgeULwDIHVXhiP3-e2LA'}

In [44]:
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [429]>

### Pulling data

In [7]:
import pandas as pd

In [45]:
api = 'https://oauth.reddit.com'

In [47]:
res = requests.get (f' (api}/r/investing/new', headers=headers, params={'limit' : '100'})

In [49]:
res.json()['data']['children'][0]['data']

<Response [429]>

In [72]:
df = pd.DataFrame ({
    'name' : [],
    'created_utc': [],
    'subreddit': [],
    'title': [],
    'selftext': [],
    'upvote_ratio' : [],
    'ups' : [],
    'downs': [],
    'score': [],
})

In [None]:
for post in res.json()['data']['children']:
    df = df.append({
    'name' : post['data']['name'],
    'created_utc': post['data']['created_utc'],
    'subreddit': post['data']['subreddit'],
    'title': post['data']['title'],
    'selftext': post['data']['selftext'],
    'upvote_ratio' : post['data']['upvote_ratio'],
    'ups' : post['data']['ups'],
    'downs': post['data']['downs'],
    'score': post['data']['score'],
})

In [None]:
while True:
    res = requests.get (f' (api}/r/investing/new', headers=headers, 
                        params={'limit' : '100', 'after' : df['name'].iloc[len(df)-1]})
    for post in res.json()['data']['children']:
        df = df.append({
        'name' : post['data']['name'],
        'created_utc': post['data']['created_utc'],
        'subreddit': post['data']['subreddit'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'upvote_ratio' : post['data']['upvote_ratio'],
        'ups' : post['data']['ups'],
        'downs': post['data']['downs'],
        'score': post['data']['score'],
        }; ignore_index=True)

In [None]:
df = df.replace({'|': ''}; regex=True)

In [None]:
df.to_csv('reddit_investing.csv', sep='|', index=False)

## NER tagging

In [8]:
from collections import Counter

In [9]:
def get_orgs(text):
    doc = nlp(text)
    org_list =  []
    for entity in doc.ents:
        if (entity.label_ == "ORG"):
            org_list.append(entity.text)
    org_list = list(set(org_list))
    return org_list

### With Reddit API

In [10]:
df = pd.read_csv('reddit_investing.csv', sep='|')

In [11]:
df['organizations'] = df['selftext'].apply(get_orgs)
df.head()

Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio,organizations
0,1614290000.0,0.0,t3_lshtjn,10.0,Bloomberg article: [https://www.bloomberg.com/...,investing,Fed Views Rising Yields as Bullish Sign Reflec...,10.0,0.86,"[Powell, Fed, Bostic, the Atlanta Fed’s, Feder..."
1,1614286000.0,0.0,t3_lsgahw,56.0,Given the recent downturn in stocks especially...,investing,ARK ETFs implosion risk ------------------------,56.0,0.83,"[ARK, The Bear Cave](https://thebearcave.subst..."
2,1614283000.0,0.0,t3_lsf8td,1.0,[https://twitter.com/desogames/status/13649710...,investing,The Counter-Party Risk Bubble,1.0,0.53,"[OWN, ITM]"
3,1614282000.0,0.0,t3_lsf3nh,6.0,"When you think of futures, what comes to your ...",investing,Futures were made for days like these,6.0,0.62,[]
4,1614278000.0,0.0,t3_lsdcib,3.0,I've been on this sub for quite some time and ...,investing,Let's talk about liquidity premiums,3.0,0.67,[]


In [12]:
orgs = [org for sublist in df['organizations'].to_list() for org in sublist]

### With another dataset : https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset?select=Reddit_Data.csv

In [104]:
df = pd.read_csv('Reddit_Data.csv', sep=',')

In [105]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [134]:
df = df.dropna()

In [135]:
df

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37244,jesus,0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [133]:
df['clean_comment'][414]

' htm happy reading '

In [136]:
df['organizations'] = df['clean_comment'].apply(get_orgs)
df.head()

Unnamed: 0,clean_comment,category,organizations
0,family mormon have never tried explain them t...,1,[]
1,buddhism has very much lot compatible with chr...,1,[]
2,seriously don say thing first all they won get...,-1,[]
3,what you have learned yours and only yours wha...,0,[]
4,for your own benefit you may want read living ...,1,[]


In [137]:
orgs = [org for sublist in df['organizations'].to_list() for org in sublist]

### Common analysis

In [13]:
org_freq = Counter(orgs)
org_freq.most_common(10)

[('ETF', 37),
 ('Tesla', 34),
 ('Amazon', 28),
 ('COVID', 27),
 ('EPS', 26),
 ('SEC', 24),
 ('ARK', 23),
 ('NYSE', 23),
 ('EBITDA', 22),
 ('GME', 21)]

In [14]:
df.to_csv('reddit_investing_ner.csv', sep='|', index=False)

### NER with Sentiment

In [17]:
import flair

ModuleNotFoundError: No module named 'flair'

In [144]:
model = flair.models.TextClassifier.load('en_sentiment')

2022-06-11 15:41:39,362 loading file en_sentiment


FileNotFoundError: [Errno 2] No such file or directory: 'en_sentiment'

In [None]:
def ger_sentiment(text):
    sentence = flair.data.Sentence(text)
    model.predict(sentence)
    sentiment = sentiment.labels[0]
    return sentiment

In [None]:
df['sentiment'] = df['selftext'].apply(get_sentiment)
df.head()

In [103]:
import ast

In [None]:
df['organizations'] = df['organizations'].apply(lambda x: ast.literal_eval(x))

In [None]:
sentiment = {}

for i, row in df.iterrows():
    direction = row['sentiment'].value
    score = row['sentiment'].score
    for org in row['organizations']:
        if org not in sentiment.keys():
            sentiment[org] = {'POSITIVE' : []; 'NEGATIVE' : []}
        sentiment[org][direction].append(score)