In [1]:
%%capture
import os
import pandas as pd
import regex as re
import numpy as np
import multiprocessing
from datetime import datetime
from pathlib import Path
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.word2vec import Word2Vec

# Import functions from other ipynb files
import importlib
import ipynb
from ipynb.fs.full.utilities import CleanNews, CleanNews_w2v, ToPercent, FilterNews, Dataloader, GetTotalExamples
from tqdm.notebook import tqdm
tqdm().pandas()

# Download the NLP model: https://spacy.io/models/en
#nlp = spacy.load("en_core_web_md")

#path = "C:/Users/Stefa/Documents/Sentiment Analysis Files/FX/"

In [2]:
# Reload the utilities notebook module and re-import its helpers, so that the
# names bound in this notebook refer to the freshly reloaded definitions.
importlib.reload(ipynb.fs.full.utilities)
from ipynb.fs.full.utilities import CleanNews, CleanNews_w2v, ToPercent, FilterNews, Dataloader, GetTotalExamples

In [3]:
class TextualFactors_preprocessing():
    """
    Preprocessing tasks for the Textual Factors class.

    Methods:
    clean_raw_news:        Clean raw news articles for subsequent tasks
    gen_w2v_training_data: Clean raw news articles for training the word2vec model. Output is a text file
                           containing one sentence per line. Optionally also generates bigrams and trigrams
                           with the gensim Phrases model.
    generate_grams:        Generate bigrams or trigrams from unigram / bigram files
    train_word2vec:        Train the word2vec model

    All directory arguments may be given with or without a trailing slash.
    """

    # Sentence boundary pattern used when writing the word2vec training files.
    # Raw string: the escapes (\w, \., \s, \*) are regex escapes, not Python string escapes.
    _SENTENCE_SPLIT_PATTERN = r'(?<=\w\w)\.\s|\s\.\s|\.$|^\.|\*'

    # A file name must contain four consecutive digits to be treated as a yearly file.
    _YEAR_PATTERN = r"\d\d\d\d"

    def __init__(self, root_dir, raw_data_dir, docs_dir, w2v_data_src):
        """
        Store the working directories and create the ones this class writes to.
        root_dir:     Textual Factors root directory (models are stored in <root_dir>/models)
        raw_data_dir: directory that contains the raw news articles
        docs_dir:     directory that contains the cleaned news articles
        w2v_data_src: directory that contains the word2vec training files
        """
        self.root_dir     = root_dir
        self.raw_data_dir = raw_data_dir
        self.docs_dir     = docs_dir
        self.w2v_data_src = w2v_data_src

        Path(self.root_dir, 'models').mkdir(parents=True, exist_ok=True)
        Path(self.docs_dir).mkdir(parents=True, exist_ok=True)
        Path(self.w2v_data_src).mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _subdir(base, name):
        """Return '<base>/<name>/' (with trailing slash), whether or not `base` ends with a separator."""
        return os.path.join(base, name) + '/'

    @staticmethod
    def _yearly_name(filename, year):
        """Substitute the '####' placeholder in `filename` with `year`."""
        return filename.replace('####', str(year))

    def clean_raw_news(self, start_year=2011, end_year=2016, filename='news_data_fx_v2_####'):
        """
        Clean the yearly raw news files and write them to docs_dir as '<name>_clean.csv'.
        start_year: first year to process (inclusive)
        end_year:   last year to process (exclusive)
        filename:   file name (without '.csv') in which '####' is replaced by the year
        """
        print(f"Clean raw news articles ... {datetime.now().strftime('%H:%M:%S')}")

        for year in np.arange(start_year, end_year):
            year = str(year)
            print(year)
            yearly_name = self._yearly_name(filename, year)
            news = pd.read_csv(os.path.join(self.raw_data_dir, yearly_name + '.csv'), index_col=0, encoding='utf-8')

            # Drop all news with duplicate headlines.
            # NOTE(review): keep='first' retains the first occurrence in file order, whereas
            # gen_w2v_training_data uses keep='last'. Which one is "the most recent" depends on
            # how the raw files are sorted - confirm and align.
            news = news[~news.duplicated(subset='Headline', keep='first')]
            news = news.reset_index(drop=True)

            # Clean Headlines and Body
            news.loc[:, 'Headline'] = news['Headline'].progress_apply(CleanNews, to_lowercase=True)
            news.loc[:, 'Body']     = news['Body'].progress_apply(CleanNews, to_lowercase=True)

            # Convert price changes with the pattern 'to XX$ from XY$' to 'by YX percent'
            news.loc[:, 'Headline'] = news['Headline'].progress_apply(ToPercent)
            news.loc[:, 'Body']     = news['Body'].progress_apply(ToPercent)

            # Save to csv
            news.to_csv(os.path.join(self.docs_dir, yearly_name + '_clean.csv'), encoding='utf-8', index=True)

    class tiny_dataloader(object):
        """Re-iterable reader that yields each line of one text file as a list of whitespace-split tokens."""

        def __init__(self, file_dir, fname):
            self.file_dir = file_dir
            self.fname    = fname

        def __iter__(self):
            # The with-block closes the file once iteration finishes or the generator is discarded.
            with open(os.path.join(self.file_dir, self.fname)) as handle:
                for line in handle:
                    yield line.split()

    def generate_grams(self, src_dir, output_dir, model_dir, min_count, threshold, gram_type='bigram'):
        """
        Generate bigrams or trigrams with gensim Phrases.
        src_dir:    directory that contains files with unigrams (to generate bigrams) or files with bigrams (to generate trigrams)
        output_dir: directory to save the generated bigram and trigram files
        model_dir:  directory to save the Phrase model
        min_count:  (float, optional) - Ignore all words and bigrams with total collected count lower than this value.
        threshold:  (float, optional) - Represent a score threshold for forming the phrases (higher means fewer phrases).
                    A phrase of words a followed by b is accepted if the score of the phrase is greater than threshold.
        gram_type:  select either 'bigram' or 'trigram', default='bigram'

        Raises ValueError if src_dir contains no file whose name includes a four-digit year.
        """
        # Only regular files carrying a year in their name are converted. Sorting makes the
        # processing order independent of the OS-specific os.listdir order.
        files = sorted(
            fname for fname in os.listdir(src_dir)
            if os.path.isfile(os.path.join(src_dir, fname)) and re.search(self._YEAR_PATTERN, fname)
        )
        if not files:
            raise ValueError(f"No files containing a four-digit year found in {src_dir!r}")
        years = sorted(re.findall(self._YEAR_PATTERN, fname)[0] for fname in files)

        Path(output_dir).mkdir(parents=True, exist_ok=True)
        Path(model_dir).mkdir(parents=True, exist_ok=True)

        print(f'Train the {gram_type} Phraser model ...')
        # Dataloader comes from the utilities notebook; it is handed the whole source directory.
        documents = Dataloader(path=src_dir, split=True)
        model     = Phrases(documents, min_count=min_count, threshold=threshold, delimiter='_')

        # Export the trained model = use less RAM, faster processing. Model updates no longer possible.
        frozen_model = model.freeze()
        model_name   = gram_type + "_phrase_model_" + years[0] + '-' + years[-1] + ".pkl"
        frozen_model.save(os.path.join(model_dir, model_name))

        print(f'Generate {gram_type} training files ...')
        for fname in files:
            year      = re.findall(self._YEAR_PATTERN, fname)[0]
            sentences = self.tiny_dataloader(file_dir=src_dir, fname=fname)

            out_path = os.path.join(output_dir, "w2v_" + gram_type + "_phrases_" + year + ".txt")
            with open(out_path, 'w') as f:
                for sent in sentences:
                    f.write(' '.join(frozen_model[sent]))
                    f.write('\n')

    def gen_w2v_training_data(self, start_year=2011, end_year=2016, filename='news_data_fx_v2_####',
                               min_sent_len=5, generate_unigrams=True, generate_bigrams=True, generate_trigrams=True,
                               bigram_min_count=50, bigram_threshold=20, trigram_min_count=50,
                               trigram_threshold=200):
        """
        Clean news for word2vec and write them to .txt files (one sentence per line).
        start_year:        first year to process (inclusive); only used when generate_unigrams is True
        end_year:          last year to process (exclusive); only used when generate_unigrams is True
        filename:          raw file name (without '.csv') in which '####' is replaced by the year
        min_sent_len:      a sentence is written only if it has more than this many space-separated tokens
        generate_unigrams: write the unigram files to <w2v_data_src>/unigrams
        generate_bigrams:  build <w2v_data_src>/bigrams from the unigram files
        generate_trigrams: build <w2v_data_src>/trigrams from the bigram files
        bigram_* / trigram_*: min_count and threshold passed on to generate_grams
        """
        unigram_dir = self._subdir(self.w2v_data_src, 'unigrams')
        bigram_dir  = self._subdir(self.w2v_data_src, 'bigrams')
        trigram_dir = self._subdir(self.w2v_data_src, 'trigrams')
        model_dir   = self._subdir(self.root_dir, 'models')

        Path(unigram_dir).mkdir(parents=True, exist_ok=True)

        if generate_unigrams:
            print(f"Clean raw news articles for word2vec ... {datetime.now().strftime('%H:%M:%S')}")
            for year in np.arange(start_year, end_year):
                year = str(year)
                print(year)

                raw_path = os.path.join(self.raw_data_dir, self._yearly_name(filename, year) + '.csv')
                raw_news = pd.read_csv(raw_path, index_col=0, encoding='utf-8')

                # Drop all news that contain no body text
                raw_news = raw_news[raw_news.Body.notna()]

                # Drop all news with duplicate headlines, keeping the last occurrence in file order
                raw_news = raw_news[~raw_news.duplicated(subset='Headline', keep='last')]
                raw_news = raw_news.reset_index(drop=True)

                # Articles for which the cleaner returns a missing value are skipped
                bodies = raw_news.Body.progress_apply(CleanNews_w2v, to_lowercase=True).dropna()

                with open(os.path.join(unigram_dir, 'w2v_train_' + year + '.txt'), 'w') as f:
                    for article in bodies:
                        for sentence in re.split(self._SENTENCE_SPLIT_PATTERN, article):
                            if len(sentence.split(' ')) > min_sent_len:
                                f.write('%s\n' % sentence.strip())

        if generate_bigrams:
            print(f"Generate bigrams ... {datetime.now().strftime('%H:%M:%S')}")
            Path(bigram_dir).mkdir(parents=True, exist_ok=True)
            self.generate_grams(src_dir    = unigram_dir,
                                output_dir = bigram_dir,
                                model_dir  = model_dir,
                                min_count  = bigram_min_count,
                                threshold  = bigram_threshold,
                                gram_type  = 'bigram'
                               )

        if generate_trigrams:
            print(f"Generate trigrams ... {datetime.now().strftime('%H:%M:%S')}")
            Path(trigram_dir).mkdir(parents=True, exist_ok=True)
            self.generate_grams(src_dir    = bigram_dir,
                                output_dir = trigram_dir,
                                model_dir  = model_dir,
                                min_count  = trigram_min_count,
                                threshold  = trigram_threshold,
                                gram_type  = 'trigram'
                               )

    def train_word2vec(self, w2v, model_name, gram_type='trigrams', epochs=10):
        """
        Train a Word2Vec model and save it to <root_dir>/models/<model_name>.word2vec
        w2v:        dictionary containing the word2vec training parameters; the keys 'sg', 'hs', 'size',
                    'negative', 'window', 'min_count', 'alpha' and 'min_alpha' are read
        model_name: file name (without extension) of the saved model
        gram_type:  select either 'unigrams', 'bigrams', 'trigrams' (sub-directory of w2v_data_src)
        epochs:     number of epochs to train word2vec
        """
        print(f"Train the word2vec model ... {datetime.now().strftime('%H:%M:%S')}")
        data_dir  = os.path.join(self.w2v_data_src, gram_type)
        examples  = GetTotalExamples(data_dir)
        sentences = Dataloader(data_dir, print_epoch=True, split=True)

        # Leave one core free, but never request fewer than one worker (single-core machines).
        n_workers = max(1, multiprocessing.cpu_count() - 1)
        model     = Word2Vec(sg=w2v['sg'], hs=w2v['hs'], vector_size=w2v['size'],
                             negative=w2v['negative'],   window=w2v['window'],
                             min_count=w2v['min_count'], alpha=w2v['alpha'],
                             min_alpha=w2v['min_alpha'], workers=n_workers)

        # Train the model
        print('Build vocabulary ...')
        model.build_vocab(sentences)

        print('Train ...')
        model.train(sentences, total_examples=examples, epochs=epochs)  # optimal: 80 epochs

        # Save the model
        print('Save the model ...')
        model.save(os.path.join(self.root_dir, 'models', model_name + '.word2vec'))
                               

In [4]:
# Instantiate the preprocessing helper. The constructor creates <root_dir>/models,
# docs_dir and w2v_data_src if they do not exist yet.
# NOTE: the paths below are absolute and machine-specific; adjust them before running elsewhere.
tf_pre = TextualFactors_preprocessing(
    root_dir     = "C:/Users/Stefa/Documents/Uni/Projektassistenz/FX/Python/Textual_Factors/",
    raw_data_dir = "F:/Sentiment Analysis Files/FX/",
    docs_dir     = "F:/Sentiment Analysis Files/FX/clean_fx_news_v2/",
    w2v_data_src = "F:/Sentiment Analysis Files/FX/train_w2v_data_2016-2019/"
)

In [None]:
# Clean the raw yearly news files and write '<filename>_clean.csv' files to docs_dir.
# end_year is exclusive (the method iterates over np.arange(start_year, end_year)),
# so this call processes the years 2006 through 2010.
tf_pre.clean_raw_news(
    start_year = 2006, 
    end_year   = 2011, 
    filename   = 'news_data_fx_v2_####'
)

In [28]:
# Build the bigram training files from the already existing unigram files.
# With generate_unigrams=False the raw news are not re-read, so start_year, end_year,
# filename and min_sent_len are not used by this call; with generate_trigrams=False the
# trigram_* parameters are not used either.
# NOTE(review): the recorded output below also shows a trigram run, so it appears to stem
# from an earlier execution with generate_trigrams=True - re-run the cell to refresh it.
tf_pre.gen_w2v_training_data(
    start_year = 2016, 
    end_year   = 2019, 
    filename   = 'news_data_fx_v2_####', 
    min_sent_len = 5, 
    generate_unigrams = False, 
    generate_bigrams  = True, 
    generate_trigrams = False,
    bigram_min_count  = 50, 
    bigram_threshold  = 10, 
    trigram_min_count = 50, 
    trigram_threshold = 150
)

Generate bigrams ... 16:32:47
Train the bigram Phraser model ...
Generate bigram training files ...
Generate trigrams ... 16:51:03
Train the trigram Phraser model ...
Generate trigram training files ...


In [None]:
# Train word2vec on the files in <w2v_data_src>/trigrams and save the model as
# <root_dir>/models/<model_name>.word2vec. The entries of `w2v` are forwarded to the
# gensim Word2Vec constructor ('size' is passed as vector_size).
tf_pre.train_word2vec(
    w2v = {'sg':0, 'hs':0, 'size':32, 'negative':5, 'window':8, 'min_count':50, 'alpha':0.03, 'min_alpha':0.005},
    model_name = 'w2v_cbow_32_neg_5_window_8_40_epochs_trigrams_2016_2019',
    gram_type  = 'trigrams',
    epochs     = 40
)

In [25]:
# Sanity check: report for every yearly raw file whether it has a 'CO2' column.
# Only the first five rows are read, which is enough to obtain the header.
raw_data_dir = "F:/Sentiment Analysis Files/FX/"
filename   = 'news_data_fx_v2_####'

for year in range(2006, 2020):
    csv_path = raw_data_dir + filename.replace('####', str(year)) + '.csv'
    news = pd.read_csv(csv_path, index_col=0, encoding='utf-8', nrows=5)
    has_co2 = 'CO2' in news.columns
    print(year, f"CO2 in Columns: {has_co2}")

2006 CO2 in Columns: True
2007 CO2 in Columns: True
2008 CO2 in Columns: True
2009 CO2 in Columns: True
2010 CO2 in Columns: True
2011 CO2 in Columns: False
2012 CO2 in Columns: False
2013 CO2 in Columns: False
2014 CO2 in Columns: False
2015 CO2 in Columns: False
2016 CO2 in Columns: True
2017 CO2 in Columns: True
2018 CO2 in Columns: True
2019 CO2 in Columns: True


In [5]:
class tiny_dataloader(object):
    """
    Re-iterable reader that yields the raw lines of one text file.

    Unlike TextualFactors_preprocessing.tiny_dataloader, the lines are yielded unchanged
    (including the trailing newline) and are not split into tokens.
    file_dir: directory that contains the file
    fname:    name of the file inside file_dir
    """

    def __init__(self, file_dir, fname):
        self.file_dir = file_dir
        self.fname    = fname

    def __iter__(self):
        # The with-block closes the file once iteration finishes or the generator is
        # discarded (e.g. after an early break in the consuming loop).
        with open(os.path.join(self.file_dir, self.fname)) as handle:
            for line in handle:
                yield line
            
# Lazy reader over one generated bigram file; the file is only opened once `f` is iterated.
# NOTE: absolute, machine-specific path.
f = tiny_dataloader("F:/Sentiment Analysis Files/FX/train_w2v_data_1996-2018/bigrams/", "w2v_bigram_phrases_2002.txt")

In [None]:
# Preview the beginning of the file: print lines 0 through 100 (101 lines), then stop.
for line_idx, text in enumerate(f):
    print(text)
    if not line_idx < 100:
        break