In [1]:
import datetime
import pandas as pd
import nltk
import spacy

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from collections import defaultdict

# Fetch the WordNet packages; WordNetLemmatizer is used throughout this notebook.
# NOTE(review): word_tokenize and stopwords.words are used below as well — presumably
# their NLTK resources are already installed on this machine; confirm for fresh environments.
nltk.download('wordnet')
nltk.download('omw-1.4')
# Load the en_core_web_lg spaCy pipeline; `nlp` is what check_similarity calls.
nlp = spacy.load("en_core_web_lg")

2022-06-26 11:39:57.408812: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[nltk_data] Downloading package wordnet to /home/sajith/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/sajith/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Build Data

In [2]:
# Read both scraped news dumps the same way: first row is the header and the
# column at position 1 is parsed as dates.
investing, fxstreet = (
    pd.read_csv(csv_name, header=0, parse_dates=[1])
    for csv_name in ("investing.csv", "fxstreet.csv")
)

In [3]:
# Stack the two sources and renumber the rows; reset_index() keeps each
# frame's previous row label in a new "index" column.
scraped_news_df = pd.concat([investing, fxstreet]).reset_index()
scraped_news_df

Unnamed: 0,index,page,date,title,body,source
0,0,1,2022-05-25,Dollar rises off of 1-month low ahead of Fed m...,By John McCrank NEW YORK (Reuters) - The U.S....,investing
1,1,1,2022-05-25,Analysis-As U.S. economy's exceptionalism fade...,By Tommy Wilkes and Saikat Chatterjee LONDON ...,investing
2,2,1,2022-05-25,Russia will start a pilot project for 'digital...,\n(Reuters) - Russia will start a pilot projec...,investing
3,3,1,2022-05-25,Rouble firms to 4-yr high vs dollar as Russian...,(Reuters) - The Russian rouble firmed past 56...,investing
4,4,1,2022-05-25,"Dollar Stabilizes, but Hikes Elsewhere Point t...",By Geoffrey Smith\nInvesting.com -- The dollar...,investing
...,...,...,...,...,...,...
217,165,9,2022-05-24,PBOC: Will keep stable credit growth in proper...,The People’s Bank of China (PBOC) said in a st...,fxstreet
218,166,9,2022-05-24,"GBP/USD recovers modest intraday losses, flirt...",The GBP/USD pair reversed modest intraday loss...,fxstreet
219,167,9,2022-05-24,USD/CNH could face some initial consolidation ...,"In light of the ongoing price action, USD/CNH ...",fxstreet
220,168,9,2022-05-24,NZD/USD must pop over the 0.6713/0.6821 resist...,NZD/USD has seen a 4.4% recovery from its mid-...,fxstreet


# Tokenizing

In [4]:
# English stop-word list from NLTK's stopwords corpus (assigned again, identically, in the next cell).
eng_stopwords = stopwords.words('english')         

In [5]:
def _read_text(path):
    """Return the full content of the text file at `path` as one string."""
    with open(path, 'r') as handle:
        return handle.read()


# Raw content of the sentiment / duration term files; they are tokenized
# further down in this cell.
bear_words = _read_text('fx-bear-terms.csv')
bull_words = _read_text('fx-bull-terms.csv')
fivemin_words = _read_text('5min.csv')
onehr_words = _read_text('1hr.csv')
oneday_words = _read_text('1day.csv')

# Known trading pairs; the 'pairs' column is what the matching code reads.
pair_list = pd.read_csv('fx-pairs-collection.csv', header=0)

# One column per currency, each holding that currency's slang terms.
# No parse_dates here: every column is consumed as a list of words by
# pair_to_dictionary, so none of them should be converted to dates.
pair_vocabulary = pd.read_csv('fx-pairs-slang.csv', header=0)

def pair_to_dictionary(pair_vocabulary):
    """Convert a vocabulary DataFrame into a ``defaultdict`` keyed by column name.

    Each column label maps to the plain Python list of that column's values.
    Looking up a label that is not a column yields (and stores) an empty dict,
    because the mapping is a ``defaultdict(dict)``.
    """
    vocab_by_column = defaultdict(dict)
    vocab_by_column.update(
        (column, pair_vocabulary[column].to_list())
        for column in pair_vocabulary.columns
    )
    return vocab_by_column

def pairs_lematizer(pair_vocabulary):
    """Lemmatize every vocabulary entry as a noun and store the result.

    Parameters
    ----------
    pair_vocabulary : dict
        Maps a key to a list of vocabulary entries. The lists are replaced
        in place with their lemmatized counterparts.

    Returns
    -------
    dict
        The same mapping object that was passed in.

    Notes
    -----
    Uses the module-level ``lem`` lemmatizer. Entries that are not strings
    (e.g. NaN from empty CSV cells) are kept unchanged instead of being
    passed to the lemmatizer.
    """
    # Snapshot the keys so re-assigning values never interferes with iteration.
    for key in list(pair_vocabulary.keys()):
        # Build a new list: re-binding the loop variable alone would throw the lemma away.
        pair_vocabulary[key] = [
            lem.lemmatize(word, 'n') if isinstance(word, str) else word
            for word in pair_vocabulary[key]
        ]

    return pair_vocabulary

def pairs_tokenize(pair_vocabulary):
    """Run ``word_tokenize`` over every vocabulary entry and return the mapping.

    The token lists are not stored anywhere, so the mapping that comes back
    holds exactly the entries that went in.
    """
    for entries in pair_vocabulary.values():
        for entry in entries:
            word_tokenize(entry)  # result intentionally left unused, as before

    return pair_vocabulary

pair_vocabulary = pair_to_dictionary(pair_vocabulary)
lem = WordNetLemmatizer()
stem = PorterStemmer()

eng_stopwords = stopwords.words('english')


def _prepare_terms(raw_text):
    """Turn the raw content of a term file into a list of verb lemmas.

    Steps, in the same order setup_and_token applies to titles: tokenize,
    drop English stop words, lemmatize each remaining token as a verb.
    Lemmatizing has to happen per token: the lemmatizer works on a single
    word, so handing it the whole file content leaves the text unchanged.
    """
    tokens = word_tokenize(raw_text)
    return [lem.lemmatize(token, 'v') for token in tokens if token not in eng_stopwords]


bear_words = _prepare_terms(bear_words)
bull_words = _prepare_terms(bull_words)
fivemin_words = _prepare_terms(fivemin_words)
onehr_words = _prepare_terms(onehr_words)
oneday_words = _prepare_terms(oneday_words)

pair_vocabulary = pairs_lematizer(pair_vocabulary)
pair_vocabulary = pairs_tokenize(pair_vocabulary)

In [6]:
def lemmatizer(text):
    """Return the verb lemma of every token in `text`.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    # Build the lemmatizer once per call rather than once per token.
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(word, 'v') for word in text]

In [7]:
def setup_and_token(dataframe):
    """Clean the title/body text and add a ``tokenized`` column of title lemmas.

    The frame is modified in place and also returned:

    * ``title``     - "- " sequences and the characters ``( ) " { : , ; %`` removed.
    * ``body``      - same cleaning, then cast to ``str`` and lower-cased.
    * ``tokenized`` - the cleaned title, lower-cased, tokenized, stripped of
      English stop words and lemmatized via ``lemmatizer``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain string columns ``title`` and ``body``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # A set gives constant-time membership tests for the per-token filter below.
    eng_stopwords = set(stopwords.words('english'))

    for column in ('title', 'body'):
        # Both patterns are regular expressions. regex=True is spelled out because
        # the default of str.replace differs between pandas versions; treated as a
        # literal, the character class would never match anything.
        # As a regex, "- ()" is "- " followed by an empty group, i.e. it strips "- ".
        cleaned = dataframe[column].str.replace("- ()", "", regex=True)
        dataframe[column] = cleaned.str.replace("[()\"{:,;%]", "", regex=True)

    news_title = dataframe['title'].astype(str).str.lower()
    news_body = dataframe['body'].astype(str).str.lower()

    # Only the title is tokenized; the body is stored as lower-cased text.
    title_tokens = news_title.apply(word_tokenize)
    title_tokens = title_tokens.apply(
        lambda words: [item for item in words if item not in eng_stopwords]
    )
    title_tokens = title_tokens.apply(lemmatizer)

    dataframe['tokenized'] = title_tokens
    dataframe['body'] = news_body
    return dataframe

In [8]:
def found_char(char, lst):
    """Return the last item of `lst` that contains `char`, or '' if none does."""
    matches = [candidate for candidate in lst if char in candidate]
    return matches[-1] if matches else ''

In [9]:
def search_pairs (lst):
    """Return the last entry of ``pair_list['pairs']`` that contains both
    ``lst[0]`` and ``lst[1]`` as substrings, or '' when no entry does."""
    candidates = [
        trade_pair
        for trade_pair in pair_list['pairs']
        if lst[0] in trade_pair and lst[1] in trade_pair
    ]
    return candidates[-1] if candidates else ''

In [10]:
def _collect_currency_mentions(tokens):
    """Return the trading pairs / currency codes mentioned in `tokens`, first-seen order, no duplicates."""
    mentions = []
    for word in tokens:
        if '/' in word:
            # Explicit pair notation: keep every known pair that contains the token.
            for trade_pair in pair_list['pairs']:
                if word in trade_pair and trade_pair not in mentions:
                    mentions.append(trade_pair)
        else:
            for code in pair_vocabulary.keys():
                code_lower = code.lower()
                # A currency counts as mentioned when the token starts or ends with
                # its code, or equals one of its vocabulary entries.
                mentioned = (
                    word.startswith(code_lower)
                    or word.endswith(code_lower)
                    or any(word == entry for entry in pair_vocabulary[code])
                )
                if mentioned and code_lower not in mentions:
                    mentions.append(code_lower)
    return mentions


def _resolve_pair(mentions):
    """Reduce collected mentions to one pair where possible, else return them unchanged."""
    explicit_pair = found_char('/', mentions)
    if explicit_pair != '':
        return explicit_pair
    if len(mentions) == 2:
        # Exactly two currency codes: look for a known pair containing both.
        return search_pairs(mentions)
    return mentions


def sentimental_calculator(news):
    """Tag every article with a bull/bear action, a currency pair and a duration.

    Parameters
    ----------
    news : pandas.DataFrame
        Needs the columns ``source``, ``page``, ``date``, ``title``, ``body``
        and ``tokenized`` (an iterable of string tokens per row). Rows are
        read by position, so the frame's index labels do not matter.

    Returns
    -------
    pandas.DataFrame
        Columns ``source, page, date, title, body, tokenized, action, pair,
        duration``:

        * ``action``   - 'bull' / 'bear' when bull-term hits outnumber /
          are outnumbered by bear-term hits, otherwise 'na'.
        * ``pair``     - a pair string when one could be resolved, '' when two
          currencies were found but no known pair contains both, otherwise the
          list of collected mentions (possibly empty).
        * ``duration`` - name of the duration counter with the most hits.

    Notes
    -----
    Relies on the module-level ``bull_words``, ``bear_words``, ``fivemin_words``,
    ``onehr_words``, ``oneday_words``, ``pair_list`` and ``pair_vocabulary``.
    """
    # Sets make the per-token membership tests constant time.
    bull_terms = set(bull_words)
    bear_terms = set(bear_words)
    duration_terms = {
        "fivemin_counter": set(fivemin_words),
        "onehr_counter": set(onehr_words),
        "oneday_counter": set(oneday_words),
    }
    columns = ['source', 'page', 'date', 'title', 'body', 'tokenized', 'action', 'pair', 'duration']

    some_news = []
    rows = zip(
        news['source'], news['page'], news['date'],
        news['title'], news['body'], news['tokenized'],
    )
    for source, page, date, title, body, tokens in rows:
        bull_counter = sum(1 for word in tokens if word in bull_terms)
        bear_counter = sum(1 for word in tokens if word in bear_terms)
        sentiment = bull_counter - bear_counter
        if sentiment > 0:
            action = 'bull'
        elif sentiment < 0:
            action = 'bear'
        else:
            action = 'na'

        duration_counter = {
            name: sum(1 for word in tokens if word in terms)
            for name, terms in duration_terms.items()
        }
        # max() keeps the first key among equals, so ties (including "no duration
        # word at all") resolve to "fivemin_counter".
        # NOTE(review): confirm that this is the intended default.
        duration_sentiment = max(duration_counter, key=duration_counter.get)

        currency_pair = _resolve_pair(_collect_currency_mentions(tokens))

        some_news.append(
            [source, page, date, title, body, tokens, action, currency_pair, duration_sentiment]
        )

    return pd.DataFrame(some_news, columns=columns)

In [None]:
# Clean and tokenize the scraped articles, keeping a CSV snapshot of the result.
news_dataframe_processed = setup_and_token(scraped_news_df)
news_dataframe_processed.to_csv("news_dataframe_processed.csv", index=False)

# Score every article, then persist the scored table.
some_news = pd.DataFrame(
    sentimental_calculator(news_dataframe_processed),
    columns=[
        'source', 'page', 'date', 'title', 'body',
        'tokenized', 'action', 'pair', 'duration',
    ],
)
some_news.to_csv('fx-news-tokenized.csv')
print ('tokenizing completed')

In [None]:
def check_similarity(source,text):
    """Return the similarity score between two pieces of text.

    Both arguments are run through the module-level ``nlp`` pipeline; the
    value returned is ``similarity`` of the first document against the second.
    """
    return nlp(source).similarity(nlp(text))
    
# For every article, find the most similar later article from a different
# source. The full body of the current article is compared with the first
# 300 characters of each candidate.
sources = some_news["source"].tolist()
bodies = some_news["body"].tolist()

# Parse each text once up front instead of re-parsing inside the nested loop.
full_docs = [nlp(body) for body in bodies]
head_docs = [nlp(body[:300]) for body in bodies]

for pos, body_doc in enumerate(full_docs):
    score_temp = 0
    sent_temp = ""
    # Only rows after the current one are candidates.
    for other in range(pos + 1, len(bodies)):
        if sources[other] == sources[pos]:
            continue
        score = body_doc.similarity(head_docs[other])
        if score > score_temp:
            score_temp = score
            sent_temp = bodies[other]
    print(score_temp)
    print({bodies[pos]: sent_temp})

In [19]:
# The file was written by to_csv with its index, so position 0 is the unnamed
# index column; the date column is therefore addressed by name, not position.
df = pd.read_csv('fx-news-tokenized.csv', parse_dates=['date'])

# Keep rows that carry a bull/bear action and a resolved "xxx/yyy" pair.
# Missing pairs come back from the CSV as NaN, hence the isinstance check.
has_action = df['action'] != 'na'
has_pair = df['pair'].map(lambda pair: isinstance(pair, str) and '/' in pair).astype(bool)

sorted_orders = (
    df.loc[has_action & has_pair, ['page', 'date', 'title', 'action', 'pair']]
    .reset_index(drop=True)
)

sorted_orders.to_csv('fx-news-sorted.csv')
print ('sorting completed')
sorted_orders

sorting completed


Unnamed: 0,page,date,title,action,pair
0,1,2022-05-19,Dollar drops as yen Swiss franc draw safe-have...,bear,chf/jpy
1,0,2022-05-25,GBP/USD bounces from weekly lows and marches f...,bear,gbp/usd
2,0,2022-05-25,USD/CHF clings to gains amid stronger USD lack...,bull,usd/chf
3,0,2022-05-25,USD/CAD to slide back to the low 1.27 zone – S...,bear,usd/cad
4,0,2022-05-25,USD/CAD eases from multi-day peak still well b...,bull,usd/cad
5,0,2022-05-25,USD/JPY sticks to gains near 127.00 amid stron...,bull,usd/jpy
6,0,2022-05-25,EUR/USD Price Analysis Downside pressure allev...,bear,eur/usd
7,1,2022-05-25,AUD/USD slides below mid-0.7000s closer to wee...,bear,aud/usd
8,1,2022-05-25,GBP/USD slides back below 1.2500 with Fed spea...,bear,gbp/usd
9,1,2022-05-25,EUR/JPY Price Analysis Minor contention comes ...,bear,eur/jpy
