# Project 2: Post-crisis Press Releases Analysis

<b>Creator:</b> Congci(Damon) Hao, conghao@iu.edu

<b>Objective:</b> The goal of this project is to extract the informational content of press releases issued following business crises, e.g., oil spills, casualty accidents, or investor class action lawsuits. 

<b>Text to be processed:</b> Each press release starts with a 5-digit company identifier (permno), followed by crisis_date and disclosure_date. Please create one column for each of these three fields. Please follow Marco’s earlier code. Process all text in a press release, but remove the boilerplate safe harbor statement and the business description/contact info.        

<b>Output items:</b>

•	Readability_Index

•	Total_words	

•	Number_Entities	

•	Words_in_Entities	

•	Number_of_Times	

•	Words_in_Times	

•	Number_of_Locations	

•	Words_in_Locations	

•	Number_of_Organizations	

•	Words_in_Organizations	

•	Number_of_Persons	

•	Words_in_Persons	

•	Number_of_Money	

•	Words_in_Money	

•	Number_of_Percentages	

•	Words_in_Percentages	

•	Number_of_Dates	

•	Words_in_Dates

•	Number of forward-looking words (Bozanic Roulstone Buskirk 2016 Appendix A word list)

•	Number of uncertain words (Bozanic et al. 2018 use Loughran and McDonald’s uncertainty measure)

•	Number of positive words (Harvard dictionary)

•	Number of negative words (Harvard dictionary)


In [2]:
# Standard Library
import os
import re
import csv
import time
import codecs
import string
from datetime import datetime
import pandas as pd
import numpy as np

# Third Party Libraries
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import opinion_lexicon

# Named Entity Recognition
# https://juejin.im/post/5971a4b9f265da6c42353332?utm_source=gold_browser_extension%5D
import spacy 
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

# Measure the Readability
# https://pypi.org/project/textstat/
import textstat

# Measure the Sentiment 
# https://www.analyticsvidhya.com/blog/2018/02/natural-language-processing-for-beginners-using-textblob/
from textblob import TextBlob

In [3]:
#Run all pre-requisites

# Preamble pattern: five digits followed by two slash-separated dates,
# e.g. "54827 3/2/2005 3/21/2005".
REGEX = r'\d{5} \d+/\d+/\d+ \d+/\d+/\d+'
# Paired-markup pattern; not referenced anywhere in the visible code.
TAG = r'<(.*?)>(.*?)</(.*?)>'

# One regex fragment per line of expressions.txt. Blank lines are dropped:
# an empty alternative would reduce the joined pattern to a bare word
# boundary, which matches in virtually every sentence.
with open('expressions.txt', 'r') as expressions_file:
    LINES = [temp.strip() for temp in expressions_file if temp.strip()]
if LINES:
    FWD_REGEX = re.compile(r'\b' + r'\b|\b'.join(LINES) + r'\b',
                           re.IGNORECASE)
else:
    # No expressions configured: use a pattern that can never match.
    FWD_REGEX = re.compile(r'(?!)')

# A sentence matching any of these patterns is rejected by is_fwd().
IGNORE = ['call', r'questions?', 'press release', 'slides?', 'webcast',
          r'\?', r'(can|do|will|have) you', r'Q ?:', r'\[Q', r'\[?Operator\]?']
REG_IGNORE = re.compile(r'|'.join(IGNORE), re.IGNORECASE)

# '__file__' is a plain string here, not the module attribute, so this
# resolves to the current working directory.
CURRENT_DIR = os.path.dirname(os.path.abspath('__file__'))
FILE = '218_disclosures.txt'
PATH = os.path.join(CURRENT_DIR, FILE)


In [4]:
#Example: Write a python list into person.csv file

# Demo rows: a header row followed by three records.
csvData = [['Person', 'Age'], ['Peter', '22'], ['Jasmine', '21'], ['Sam', '24']]
# newline='' hands line-ending control to the csv module. The with-block
# closes the file on exit, so no explicit close() is needed.
with open('person.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(csvData)

In [5]:
def get_tokens(text):
    """Return the upper-cased word tokens of ``text``.

    Tokens consisting only of punctuation characters and English stop
    words are left out.

    Args:
        text: The string to tokenize.

    Returns:
        A list of upper-case token strings, in document order.
    """
    stop_words = set(stopwords.words('english'))

    final = []
    for token in word_tokenize(text):
        # Skip tokens made up solely of punctuation ("," "--" "...").
        if all(char in string.punctuation for char in token):
            continue
        # The NLTK stop-word list is lower-case, so fold case before the
        # lookup; otherwise capitalised stop words ("The", "And") slip through.
        if token.lower() in stop_words:
            continue
        final.append(token.upper())

    return final

# get_tokens(text)

In [6]:
def dump_splitter(data, headline=True):
    """Generate one space-joined string per article found in ``data``.

    A new article starts at every line that contains the ``REGEX``
    preamble. Blank lines are skipped and the remaining lines are stripped
    before joining. Any non-blank lines ahead of the first preamble are
    emitted as a chunk of their own.

    Args:
        data: Iterable of text lines.
        headline: When false, the second collected line of every article
            (index 1, presumably the headline) is removed before joining.

    Yields:
        The text of each article as a single string.
    """
    def _flush(chunk):
        # A chunk holding only its first line has nothing at index 1.
        if not headline and len(chunk) > 1:
            del chunk[1]
        return u' '.join(chunk)

    buff = []
    for line in data:
        if re.search(REGEX, line) and buff:
            yield _flush(buff)
            buff = []
        stripped = line.strip()
        if stripped:
            buff.append(stripped)
    # The trailing article gets the same treatment as the others, and no
    # empty string is emitted when nothing was collected.
    if buff:
        yield _flush(buff)


In [7]:
def get_articles(headline=True, path=None):
    """Read the disclosures file and split it into articles.

    NOTE: calls of the form ``get_articles(PATH)`` bind the path string to
    ``headline`` (where it is simply truthy), not to ``path``.

    Args:
        headline: Forwarded to ``dump_splitter``.
        path: File to read. Defaults to ``218_disclosures.txt`` under
            ``CURRENT_DIR``.

    Returns:
        A list with one string per article.
    """
    if path is None:
        path = os.path.join(CURRENT_DIR, '218_disclosures.txt')
    # The former codecs.open(..., 'rU', ...) fails on Python 3.11+, where the
    # 'U' mode flag was removed. newline='' plus str.splitlines(True) keeps
    # the line boundaries codecs' readlines() produced.
    with open(path, 'r', encoding='latin-1', newline='') as handle:
        lines = handle.read().splitlines(True)
    return list(dump_splitter(lines, headline=headline))

# get_articles(PATH)

In [8]:
def _compact_date(date):
    """Turn an ``m/d/yyyy`` string into zero-padded ``mmddyyyy``."""
    month, day, year = date.split('/')
    return month.zfill(2) + day.zfill(2) + year


def preprocess_text(doc):
    """Separate an article into cleaned text, preamble fields and a key.

    Args:
        doc: Article string containing the ``REGEX`` preamble
            (5-digit id, crisis date, disclosure date).

    Returns:
        A ``(text, preamble, identifier)`` tuple. ``text`` is the article
        without its preamble and trailing "more information" pointer, with a
        leading all-caps headline title-cased. ``preamble`` is the list
        ``[id, crisis_date, disclosure_date]`` as written in the file.
        ``identifier`` is the id followed by both dates in ``mmddyyyy``
        form, used to match rows of the Bog-index table.

    Raises:
        ValueError: If ``doc`` contains no preamble.
    """
    match = re.search(REGEX, doc)
    if match is None:
        raise ValueError('no "id m/d/y m/d/y" preamble found in article: %r'
                         % doc[:80])
    preamble = match.group(0).split()

    # Key to match with the Bog-index identifiers.
    identifier = (preamble[0]
                  + _compact_date(preamble[1])
                  + _compact_date(preamble[2]))

    text = re.sub(REGEX, '', doc).strip()

    # Remove the trailing pointer to further information.
    # NOTE(review): the leading '.' is unescaped, so it consumes any single
    # character in front of the phrase, not only a full stop - confirm intent.
    text = re.sub((r'. (More information|For information on|For more '
                   'information) .*?$'), '', text)

    # Title-case an upper-case headline: take the leading run of capitals,
    # punctuation and spaces, and convert it when it is longer than 3 words.
    allowed = string.punctuation + ' '
    end = len(text)
    for pos, char in enumerate(text):
        if not (char.isupper() or char in allowed):
            end = pos
            break
    capital = text[:end]
    if len(capital.split()) > 3:
        text = text.replace(capital, capital.title())

    return text, preamble, identifier

text, preamble,identifier = preprocess_text(get_articles(PATH)[100])
identifier,len(identifier)

('548270302200503212005', 21)

In [9]:
def get_readability(text):
    """Score ``text`` with two readability measures from textstat.

    Returns:
        A ``(read_ease, read_grade)`` pair: the Flesch Reading Ease score
        followed by the Gunning Fog index.
    """
    return textstat.flesch_reading_ease(text), textstat.gunning_fog(text)

get_readability(text)

(22.75, 13.22)

In [10]:
def is_fwd(sentence):
    """Tell whether ``sentence`` is a forward-looking statement.

    All-upper-case sentences and sentences matching ``REG_IGNORE`` are
    rejected; anything else qualifies when ``FWD_REGEX`` finds a match.
    """
    rejected = sentence.isupper() or REG_IGNORE.search(sentence) is not None
    if rejected:
        return False
    return FWD_REGEX.search(sentence) is not None

is_fwd("Now we will move to page 21. And I'm going to ask Drew to go over the pro forma financial impact.")

True

In [11]:
def get_sentences(text):
    """Split ``text`` into a list of sentences with NLTK's English Punkt model."""
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    trimmed = text.strip()
    return punkt.tokenize(trimmed)

get_sentences("Now we will move to page 21. And I'm going to ask Drew to go over the pro forma financial impact.")

['Now we will move to page 21.',
 "And I'm going to ask Drew to go over the pro forma financial impact."]

In [12]:
def get_fwd_statements(text):
    """Count the forward-looking sentences in ``text``.

    Args:
        text: The string to analyse.

    Returns:
        A 4-tuple ``(n_sentences, n_forward, forward_ratio, forward_sentences)``.
        When ``text`` yields no sentences, the first three items are ``None``
        and the last is an empty list, so the result always has four items.
    """
    all_sents = get_sentences(text)
    len_all = len(all_sents)
    if not len_all:
        return None, None, None, []
    fwd_sents = [sent for sent in all_sents if is_fwd(sent)]
    fwd = len(fwd_sents)
    return len_all, fwd, fwd * 1.0 / len_all, fwd_sents

get_fwd_statements("Now we will move to page 21. And I'm going to ask Drew to go over the pro forma financial impact.")

(2, 1, 0.5, ['Now we will move to page 21.'])

In [13]:
def get_results(text):
    """Count named entities, and the words inside them, per entity label.

    Args:
        text: The string to run through the spaCy pipeline ``nlp``.

    Returns:
        A flat 16-tuple::

            (total_entities, e_times, e_locations, e_organizations,
             e_persons, e_money, e_percentages, e_dates,
             total_entity_words, w_times, w_locations, w_organizations,
             w_persons, w_money, w_percentages, w_dates)

        The two totals cover every label found in the document, not only
        the seven reported individually. A label with no entities counts 0.
    """
    doc = nlp(text)

    # Group the entity strings by label. text_with_ws is the replacement for
    # the deprecated Span.string (removed in spaCy 3) and returns the same text.
    by_label = {}
    for ent in doc.ents:
        by_label.setdefault(ent.label_, []).append(ent.text_with_ws)

    entity_results = {label: len(spans) for label, spans in by_label.items()}
    # Words are counted on the joined entity strings of each label.
    word_results = {
        label: len(word_tokenize(" ".join(spans).strip()))
        for label, spans in by_label.items()
    }

    total_entities = sum(entity_results.values())
    total_entity_words = sum(word_results.values())

    # NOTE(review): 'LOC' is only one of spaCy's place-like labels (others,
    # such as 'GPE', are separate) - confirm it is the intended definition
    # of "Locations".
    reported = ('TIME', 'LOC', 'ORG', 'PERSON', 'MONEY', 'PERCENT', 'DATE')
    entity_counts = tuple(entity_results.get(cat, 0) for cat in reported)
    word_counts = tuple(word_results.get(cat, 0) for cat in reported)

    return (total_entities,) + entity_counts + (total_entity_words,) + word_counts


In [16]:
def get_uncertainty(text):
    """Count the tokens of ``text`` that appear in ``LM_uncertainty.txt``.

    Matching is exact, and ``get_tokens`` returns upper-case tokens, so only
    upper-case entries of the word list can match.

    Args:
        text: The string to analyse.

    Returns:
        A ``(total_cnt, cnt)`` pair: the total number of hits and a
        ``Counter`` of hits per word.
    """
    # A set gives constant-time lookups; the with-block closes the file.
    with open('LM_uncertainty.txt') as word_file:
        wanted = set(re.findall(r'\w+', word_file.read()))

    cnt = Counter(word for word in get_tokens(text) if word in wanted)
    total_cnt = sum(cnt.values())

    return total_cnt, cnt

get_uncertainty(text)


(2, Counter({'UNSPECIFIED': 1, 'BELIEVES': 1}))

In [17]:
def get_sentiments(text):
    """Count positive and negative words in ``text``.

    Tokens from ``get_tokens`` are looked up in the word lists stored in
    ``LM_positive.txt`` and ``LM_negative.txt``. A word present in both
    lists is counted as positive only.

    Args:
        text: The string to analyse.

    Returns:
        A ``(pos, neg, cnt_pos, cnt_neg)`` tuple: the two totals followed by
        a ``Counter`` of hits per word for each polarity.
    """
    cnt_pos = Counter()
    cnt_neg = Counter()
    words = get_tokens(text)

    # Sets give constant-time lookups; the with-blocks close the files.
    with open('LM_positive.txt') as pos_file:
        wanted_pos = set(re.findall(r'\w+', pos_file.read()))
    with open('LM_negative.txt') as neg_file:
        wanted_neg = set(re.findall(r'\w+', neg_file.read()))

    for word in words:
        if word in wanted_pos:
            cnt_pos[word] += 1
        elif word in wanted_neg:
            cnt_neg[word] += 1

    pos = sum(cnt_pos.values())
    neg = sum(cnt_neg.values())

    return pos, neg, cnt_pos, cnt_neg

# get_sentiments(text)

## Retrieve Bog Index By StyleWriter

See the companion script, "Python Automation With pyautogui", which automates the analysis of each disclosure in StyleWriter.

In [22]:
#Retrieve Bog Index from "SW4stats.txt"

# Read the tab-separated statistics file. A real tab character is passed as
# the separator: the previous two-character "\\t" was treated as a regular
# expression, which makes pandas fall back to its python engine with a warning.
BOG = pd.read_csv("SW4stats.txt", sep="\t")

# The timestamp column is not needed for the lookup below.
BOG = BOG.drop(columns=['Date & Time'])

# Keep the first row of every document, e.g. '93080_08112010_09012010.docx'.
BOG = BOG.drop_duplicates(subset='Document', keep='first')

# Lookup key: the first 21 digits of the document name, which lines up with
# the identifier built by preprocess_text (5-digit id + two 8-digit dates).
BOG['ID'] = [re.sub('[^0-9]+', '', name)[:21] for name in BOG['Document']]
BOG = BOG.set_index(['ID'])

  This is separate from the ipykernel package so we can avoid doing imports until


## Run the Final Script Combining All Pre-Defined Functions

In [223]:
### RUN THIS ###

#create the header
#create the header
header = [['Id','Crsis_Date','Disclosure_Date', 'Total Words',
           'Bog Index','Flesch Reading Ease Score','Gunning Fog Index',
           'Number of Entities','Words in Entities',
           'Number of Times','Words in Times',
           'Number of Locations','Words in Locations',
           'Number of Organizations','Words in Organizations',
           'Number of Persons','Words in Persons',
           'Number of Money','Words in Money',
           'Number of Percentages','Words in Percentages',
           'Number of Dates','Words in Dates',
           'Total Sentences', 'Total Forward Sentences', 'Forward Ratio',
           'Uncertainty Words',
           'Polarity','Subjectivity',
           'Positive Words','Negative Words'
           ]]

#open a new output csv file
with open('output.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(header)

    #parse document into list of articles
    # get_articles() takes no path as its first argument (that slot is the
    # `headline` flag), so it is called with its defaults.
    docs = get_articles()
    print ('\nScanning %d articles in "%s" \n' % (len(docs), FILE))

    print('%d files are confirmed and ready to be processed \n' % len(BOG.index))

    for doc in docs:

        text, preamble, identifier = preprocess_text(doc)

        ID = preamble[0]
        crsis_date = preamble[1]
        disclosure_date = preamble[2]

        # preprocess_text always returns a string, so test for an empty one
        # rather than for None.
        if not text:
            print ('\tERROR for file %s: 0 length' % ID)
            continue

        # Only articles with a row in the Bog-index table are written out.
        if identifier not in BOG.index:
            continue

        total_entities, e_times, e_locations, e_organizations, e_persons, e_money, e_percentages, e_dates,\
        total_entity_words, w_times, w_locations, w_organizations, w_persons, w_money, w_percentages, w_dates \
        = get_results(text)

        bog_index = BOG.loc[identifier, 'Bog Index']

        read_ease, read_grade = get_readability(text)

        total_words = len(get_tokens(text))

        fwd = get_fwd_statements(text)
        # The ratio is None when no sentence was found; round() rejects None.
        fwd_ratio = round(fwd[2], 2) if fwd[2] is not None else None

        #count the frequencies of uncertain words
        total_cnt, cnt = get_uncertainty(text)

        #use textblob package to analyze the sentiment
        blob = TextBlob(text)
        polarity = round(blob.sentiment.polarity, 2)
        subjectivity = round(blob.sentiment.subjectivity, 2)

        #count the positive and negative words
        pos, neg, cnt_pos, cnt_neg = get_sentiments(text)

        # Column order must follow `header` above.
        row = [ID, crsis_date, disclosure_date, total_words,
               bog_index, read_ease, read_grade,
               total_entities, total_entity_words, e_times, w_times, e_locations, w_locations,
               e_organizations, w_organizations, e_persons, w_persons, e_money, w_money,
               e_percentages, w_percentages, e_dates, w_dates,
               fwd[0], fwd[1], fwd_ratio,
               total_cnt,
               polarity, subjectivity,
               pos, neg]

        writer.writerow(row)

        #list the positive and negative words by documents
#         print('Positive Words： ' + str(cnt_pos) + '\nNegative Words: ' + str(cnt_neg) + '\nUncertainty Words: ' + str(cnt) + '\n\n')

    print('All done.\n')


Scanning 363 articles in "218_disclosures.txt" 

218 files are confirmed and ready to be processed 

All done.

