NER_sentiment_analysis.py
import pandas as pd
import feedparser
import requests
import json
from time import sleep
from functions import get_headlines, get_sentiment, get_summaries
# Dict of RSS feeds
newsurls = {
    'globenewswire-us': 'http://www.globenewswire.com/RssFeed/country/United%20States/feedTitle/GlobeNewswire%20-%20News%20from%20United%20States',
}
# Fetch an RSS feed and return the parsed result (not called directly below;
# the imported helpers handle feed parsing)
def parse_rss(rss_url):
    return feedparser.parse(rss_url)
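# The helpers imported from functions.py are not shown in this file; a
# minimal sketch of how they might be built on feedparser (an assumption
# based on how they are used below, not the repo's actual code):
# def get_headlines(rss_url):
#     return [entry.title for entry in parse_rss(rss_url).entries]
# def get_summaries(rss_url):
#     return [entry.summary for entry in parse_rss(rss_url).entries]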
# Lists to hold information about all articles
allheadlines = []
summaries = []
entities = []
RICs = []
topics = []
headline_sentiments = []
summary_sentiments = []
# Iterate over the feed URLs
for key, url in newsurls.items():
    # Call get_headlines()/get_summaries() and append the results
    allheadlines.extend(get_headlines(url))
    summaries.extend(get_summaries(url))
# Tag each headline using the Thomson Reuters Intelligent Tagging (Open Calais) API.
# The daily limit is 5,000 requests, and the concurrent limit varies by API from 1-4 calls per second.
headType = "text/raw"
token = 'oSyQfYcRShExGJmJPXRgr4kOFAsIHqoJ'  # replace with your own access token
url = "https://api-eit.refinitiv.com/permid/calais"
for i in range(len(allheadlines)):
    curr_entities = []
    curr_RICs = []
    curr_topics = []
    contentText = allheadlines[i]
    sleep(2)  # throttle so we stay under the API rate limit
    payload = contentText.encode('utf8')
    headers = {
        'Content-Type': headType,
        'X-AG-Access-Token': token,
        'outputformat': "application/json"
    }
    TRITResponse = requests.request("POST", url, data=payload, headers=headers)
    # Load the response body into a JSON object (a dict keyed by item URI)
    JSONResponse = json.loads(TRITResponse.text)
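    # For orientation: exact payloads vary, but judging from the fields
    # accessed below, each tagged item in the response looks roughly like
    # this (illustrative shape, not verbatim API output):
    #   "http://d.opencalais.com/...": {
    #       "_typeGroup": "entities",            # or "topics", ...
    #       "_type": "Company",
    #       "name": "Acme Corp",
    #       "resolutions": [{"primaryric": "ACME.N", ...}]
    #   }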
    # Entities: items whose _typeGroup is 'entities'
    for key in JSONResponse:
        if '_typeGroup' in JSONResponse[key]:
            if JSONResponse[key]['_typeGroup'] == 'entities':
                curr_entities.append(JSONResponse[key]['_type'] +
                                     ", " + JSONResponse[key]['name'])
    # RICs: primary Reuters Instrument Codes from resolved company entities
    for entity in JSONResponse:
        for info in JSONResponse[entity]:
            if info == 'resolutions':
                for companyinfo in JSONResponse[entity][info]:
                    if 'primaryric' in companyinfo:
                        symbol = companyinfo['primaryric']
                        curr_RICs.append(symbol)
    # Topics: items whose _typeGroup is 'topics', with relevance scores
    for key in JSONResponse:
        if '_typeGroup' in JSONResponse[key]:
            if JSONResponse[key]['_typeGroup'] == 'topics':
                curr_topics.append(JSONResponse[key]['name'] + ", " +
                                   str(JSONResponse[key]['score']))
    entities.append(curr_entities)
    RICs.append(curr_RICs)
    topics.append(curr_topics)
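# After the loop, each row lines up with one headline; e.g. entities[i] might
# hold ["Company, Acme Corp", ...], RICs[i] ["ACME.N"] and topics[i]
# ["Business_Finance, 0.95"] (illustrative values, not real output).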
# Analyse sentiment of each headline and its summary
for i in range(len(allheadlines)):
    headline_sentiments.append(get_sentiment(allheadlines[i]))
    summary_sentiments.append(get_sentiment(summaries[i]))
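# get_sentiment also lives in functions.py, which is not shown here. A
# minimal sketch (an assumption, not the repo's actual implementation)
# could use TextBlob polarity:
# from textblob import TextBlob
# def get_sentiment(text):
#     return TextBlob(text).sentiment.polarity  # -1.0 (negative) to 1.0 (positive)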
# Assemble a dataframe with headlines, summaries, entities, RICs, topics and the two sentiment columns
headlines_dataframe = pd.DataFrame({"Headlines": allheadlines, "Summaries": summaries, "entities": entities,
                                    "RICs": RICs, "topics": topics,
                                    "headline_sentiments": headline_sentiments,
                                    "summary_sentiments": summary_sentiments})
print(headlines_dataframe)
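# Optionally persist the results for downstream analysis (hypothetical
# output filename, not part of the original script):
# headlines_dataframe.to_csv("ner_sentiment_results.csv", index=False)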