# Project 1: M&A transcripts 

<b>Creator:</b> Congci(Damon) Hao, conghao@iu.edu

<b>Objective:</b> The goal of this project is to show how much managers know about sources of merger synergies. Based on readings of many cases, Professor Tseng hypothesized that managers who know more about sources of synergies provide longer answers, more specific answers, and more forward-looking answers. 

<b>Input data:</b> Each transcript starts with management presentation, but we need only answers in the Q&A session following the presentation. Each M&A transcript is in a separate .txt file with a unique identifier as the file name. 


<b>Output items:</b>

•	Number of questions

•	Total_words	

•	Number_Entities	

•	Words_in_Entities	

•	Number_of_Times	

•	Words_in_Times	

•	Number_of_Locations	

•	Words_in_Locations	

•	Number_of_Organizations	

•	Words_in_Organizations	

•	Number_of_Persons	

•	Words_in_Persons	

•	Number_of_Money	

•	Words_in_Money	

•	Number_of_Percentages	

•	Words_in_Percentages	

•	Number_of_Dates	

•	Words_in_Dates

•	Number of forward-looking words (Bozanic Roulstone Buskirk 2016 Appendix A word list)

•	Number of uncertain words (Bozanic et al. 2018 use Loughran and McDonald’s uncertainty measure)

•	Number of positive words (Harvard dictionary)

•	Number of negative words (Harvard dictionary)


### Import Libraries/Modules

In [4]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Functions for forward-looking statements extraction."""

# Standard Library
import re
import os
import csv
import time
import glob
import string
import argparse

# Third Party Libraries
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords

# Parse XML
import xml.etree.ElementTree as ET


# Named Entity Recognition
# https://juejin.im/post/5971a4b9f265da6c42353332?utm_source=gold_browser_extension%5D
import spacy 
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

# Measure the Readability
# https://pypi.org/project/textstat/
import textstat

# Measure the Sentiment 
# https://www.analyticsvidhya.com/blog/2018/02/natural-language-processing-for-beginners-using-textblob/
from textblob import TextBlob

# Regexes used to select and measure the forward-looking statements.
# Each non-blank line of expressions.txt is used as a regex alternative wrapped
# in word boundaries. Blank lines are skipped: an empty alternative would
# reduce to a bare word boundary and match virtually every sentence.
with open('expressions.txt', 'r') as expressions_file:
    LINES = [temp.strip() for temp in expressions_file if temp.strip()]
if LINES:
    REGEX = re.compile(r'\b' + r'\b|\b'.join(LINES) + r'\b', re.IGNORECASE)
else:
    # No expressions configured: use a pattern that can never match.
    REGEX = re.compile(r'(?!)')
# A sentence matching any of these patterns is never counted as
# forward-looking (see is_fwd).
IGNORE = ['call', r'questions?', 'press release', 'slides?', 'webcast',
          r'\?', r'(can|do|will|have) you', r'Q ?:', r'\[Q', r'\[?Operator\]?']
REG_IGNORE = re.compile(r'|'.join(IGNORE), re.IGNORECASE)


In [5]:
def get_tokens(text):
    """Return the upper-cased word tokens of *text* without punctuation or stop words.

    The text is split with NLTK's ``word_tokenize``. Tokens made up solely of
    punctuation characters are dropped, as are English stop words. The
    stop-word test is case-insensitive: NLTK's stop-word list is lower-case,
    so comparing raw tokens against it would never filter capitalised or
    upper-cased input.

    Args:
        text: The string to tokenize.

    Returns:
        A list of upper-cased token strings, in their original order.
    """
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    return [
        token.upper()
        for token in word_tokenize(text)
        if not punctuation.issuperset(token) and token.lower() not in stop_words
    ]

# get_tokens(content)

In [6]:
def get_uncertainty(filein):
    """Count how many answer tokens of one transcript are uncertainty words.

    The word list is read from ``uncertainty.txt`` in the working directory.
    The answer text returned by ``get_answers`` is upper-cased and tokenized
    with ``get_tokens``; every token found in the word list counts once per
    occurrence.

    Args:
        filein: Transcript file name, passed through to ``get_answers``.

    Returns:
        The total number of uncertainty-word occurrences as an int.
    """
    # The comparison below is case-sensitive against upper-cased tokens, so
    # the list is presumably stored in upper case -- TODO confirm.
    with open('uncertainty.txt') as wordlist:
        wanted = set(re.findall(r'\w+', wordlist.read()))

    text = get_answers(filein)[0].upper()

    return sum(1 for word in get_tokens(text) if word in wanted)

# get_uncertainty('824351.txt')

In [7]:
def get_sentences(text):
    """Split *text* into a list of sentences with NLTK's English Punkt model.

    Leading and trailing whitespace is removed from *text* before splitting.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    trimmed = text.strip()
    return punkt.tokenize(trimmed)

get_sentences("Now we will move to page 21. And I'm going to ask Drew to go over the pro forma financial impact.")

['Now we will move to page 21.',
 "And I'm going to ask Drew to go over the pro forma financial impact."]

In [8]:
def get_answers(filein):
    """Extract the answer text and the number of questions from one transcript.

    The transcript is read from the ``atseng_MA_transcript`` folder, tokenized,
    stripped of (lower-case) English stop words and upper-cased. Everything
    after the first "QUESTION(S) AND ANSWER(S)"-style heading is split into
    sentences: a sentence whose last or second-to-last character is a period
    is kept as answer text; any other sentence containing "?" is counted as a
    question. The result feeds the forward-looking, entity and sentiment
    measures.

    Args:
        filein: File name of the transcript inside ``atseng_MA_transcript``.

    Returns:
        A tuple ``(answers, questions)``: the answer sentences joined by single
        spaces (``''`` when no Q&A heading is found) and the question count.
    """
    # abspath('__file__') resolves the literal name against the working
    # directory, so CURRENT_DIR is the directory the notebook is run from.
    CURRENT_DIR = os.path.dirname(os.path.abspath('__file__'))
    folder = os.path.join(CURRENT_DIR, 'atseng_MA_transcript')
    path = os.path.join(folder, filein)

    # Close the transcript as soon as it has been read; this function runs
    # once (or more) per file in the corpus.
    with open(path, 'r', encoding='utf-8', errors="surrogateescape") as data:
        mess = data.read().replace('\n', ' ')

    tokens = word_tokenize(mess)
    stop_words = set(stopwords.words('english'))
    # 'a' and 'and' are deliberately kept in the text; the section-heading
    # pattern below may contain AND.
    stop_words.discard('a')
    stop_words.discard('and')

    content = ' '.join(word.upper() for word in tokens if word not in stop_words)

    heading = re.search(r'QUESTIONS? ?-?A?N?D? ?-?ANSWERS?(.+)', content)
    qa = heading.group(1) if heading else ''

    answers = []
    questions = 0
    for sentence in get_sentences(qa):
        if len(sentence) < 2:
            # Too short to inspect the last two characters; ignore it.
            continue
        if sentence[-2] == '.' or sentence[-1] == '.':
            answers.append(sentence)
        elif "?" in sentence:
            questions += 1

    return ' '.join(answers), questions
#1811892.txt does not have a q&a section
# get_answers('1811892.txt')
# get_answers("3734288.txt")

#Standard output desired from the algorithm
# get_answers('1444369.txt')
# get_answers('1002821.txt')
# get_answers('300327.txt')
# get_answers('1080743.txt')

#The function cannot reach 100% accuracy
#This is an example of a misclassification: the "questions and answers" phrase is matched earlier than the actual Q&A section
# get_answers('AAB_882345935_deal_callstreet_8_8_2012.txt')

In [9]:
def is_fwd(sentence):
    """Tell whether *sentence* counts as a forward-looking statement.

    All-upper-case sentences and sentences matching REG_IGNORE are rejected;
    anything else qualifies when REGEX finds a match in it.
    """
    if sentence.isupper() or REG_IGNORE.search(sentence):
        return False
    return REGEX.search(sentence) is not None

#Example of testing the forward-looking statements
is_fwd("Now we will move to page 21. And I'm going to ask Drew to go over the pro forma financial impact.")

True

In [10]:
def get_fwd_statements(text):
    """Count the forward-looking sentences in *text*.

    Args:
        text: The text to split into sentences and classify with ``is_fwd``.

    Returns:
        A 4-tuple ``(total_sentences, fwd_count, fwd_ratio, fwd_sentences)``.
        When *text* contains no sentences the tuple is
        ``(None, None, None, [])``, so the result always has the same length
        regardless of input.
    """
    all_sents = get_sentences(text)
    len_all = len(all_sents)
    if not len_all:
        return None, None, None, []

    fwd_sents = [sent for sent in all_sents if is_fwd(sent)]
    fwd = len(fwd_sents)

    return len_all, fwd, fwd / len_all, fwd_sents

#Extract important measures from forward-looking statements
get_fwd_statements("Now we will move to page 21. And I'm going to ask Drew to go over the pro forma financial impact.")

(2, 1, 0.5, ['Now we will move to page 21.'])

In [11]:
# Example: write a Python list of rows into person.csv

import csv
csvData = [['Person', 'Age'], ['Peter', '22'], ['Jasmine', '21'], ['Sam', '24']]
# The with-statement closes the file on exit, so no explicit close() is needed.
with open('person.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(csvData)

In [12]:
def get_entities(answers):
    """Count the spaCy named entities found in *answers*.

    Returns an 8-tuple: the number of entities of every label, followed by
    the counts for TIME, LOC, ORG, PERSON, MONEY, PERCENT and DATE (0 when a
    label does not occur).
    """
    # NOTE(review): only the LOC label is reported as "locations"; confirm
    # whether geopolitical entities (GPE) should be included as well.
    doc = nlp(answers)
    tally = Counter(ent.label_ for ent in doc.ents)

    reported = ('TIME', 'LOC', 'ORG', 'PERSON', 'MONEY', 'PERCENT', 'DATE')
    # A Counter yields 0 for labels it has not seen.
    per_label = tuple(tally[label] for label in reported)

    return (sum(tally.values()),) + per_label

In [13]:
def get_results(answers):
    """Count named entities, and the words inside them, per entity label.

    *answers* is run through the spaCy pipeline ``nlp``. For every label the
    entity texts are collected; the entity count is the number of texts and
    the word count is the number of ``word_tokenize`` tokens in those texts
    joined by spaces.

    Args:
        answers: The text to analyse.

    Returns:
        A 16-tuple: total entities over all labels, then entity counts for
        TIME, LOC, ORG, PERSON, MONEY, PERCENT and DATE, then total entity
        words over all labels, then word counts for the same seven labels.
        Labels that do not occur contribute 0.
    """
    doc = nlp(answers)

    # Single pass over the entities. Span.text is used instead of the
    # Span.string attribute, which newer spaCy releases no longer provide.
    texts_by_label = {}
    for ent in doc.ents:
        texts_by_label.setdefault(ent.label_, []).append(ent.text)

    entity_results = {
        label: len(texts) for label, texts in texts_by_label.items()
    }
    word_results = {
        label: len(word_tokenize(" ".join(texts)))
        for label, texts in texts_by_label.items()
    }

    reported = ('TIME', 'LOC', 'ORG', 'PERSON', 'MONEY', 'PERCENT', 'DATE')
    entity_counts = tuple(entity_results.get(label, 0) for label in reported)
    word_counts = tuple(word_results.get(label, 0) for label in reported)

    # Totals cover every label spaCy produced, not only the reported ones.
    total_entities = sum(entity_results.values())
    total_entity_words = sum(word_results.values())

    return ((total_entities,) + entity_counts
            + (total_entity_words,) + word_counts)


In [14]:
def get_sentiments(text):
    """Count positive and negative words in *text*.

    *text* is tokenized with ``get_tokens`` and each token is looked up in the
    word lists read from ``LM_positive.txt`` and ``LM_negative.txt`` in the
    working directory. A token present in both lists is counted as positive
    only.

    Args:
        text: The text to score.

    Returns:
        A tuple ``(pos, neg, cnt_pos, cnt_neg)``: the total positive and
        negative occurrences, followed by the per-word Counters.
    """
    words = get_tokens(text)

    # Sets give constant-time lookups; the files are closed once read.
    with open('LM_positive.txt') as pos_file:
        wanted_pos = set(re.findall(r'\w+', pos_file.read()))
    with open('LM_negative.txt') as neg_file:
        wanted_neg = set(re.findall(r'\w+', neg_file.read()))

    # cnt_pos and cnt_neg store every matched word with its frequency.
    cnt_pos = Counter()
    cnt_neg = Counter()
    for word in words:
        if word in wanted_pos:
            cnt_pos[word] += 1
        elif word in wanted_neg:
            cnt_neg[word] += 1

    pos = sum(cnt_pos.values())
    neg = sum(cnt_neg.values())

    return pos, neg, cnt_pos, cnt_neg

# get_sentiments(get_answers('1080743.txt')[0])

In [16]:
# Driver cell: analyse every .txt transcript in the 'atseng_MA_transcript'
# folder and write one row per transcript to MA_Transcripts_Analysis.csv.

# abspath('__file__') resolves the literal name against the working directory,
# so CURRENT_DIR is the directory the notebook is run from.
CURRENT_DIR = os.path.dirname(os.path.abspath('__file__'))
folder = os.path.join(CURRENT_DIR, 'atseng_MA_transcript')
files = sorted([str(i) for i in os.listdir(folder) if i != '.DS_Store'])

# create the header (column order must match `row` below)
header = [['Id', 'Number of Questions', 'Total Words',
           'Number of Entities', 'Words in Entities',
           'Number of Times', 'Words in Times',
           'Number of Locations', 'Words in Locations',
           'Number of Organizations', 'Words in Organizations',
           'Number of Persons', 'Words in Persons',
           'Number of Money', 'Words in Money',
           'Number of Percentages', 'Words in Percentages',
           'Number of Dates', 'Words in Dates',
           'Total Sentences', 'Total Forward Sentences', 'Forward Ratio',
           'Uncertainty Words',
           'Polarity', 'Subjectivity',
           'Positive Words', 'Negative Words'
           ]]

# open a new output csv file
with open('MA_Transcripts_Analysis.csv', 'w', newline='') as fileout:
    writer = csv.writer(fileout)
    writer.writerows(header)
    print('\nScanning %d files in "%s"' % (len(files), folder))

    num_txt = 0
    no_qa = []
    for filein in files:
        basename, extension = os.path.splitext(filein)
        if extension != '.txt':
            continue

        print(filein)
        num_txt += 1

        # Use the name without its extension as the identifier.
        # str.strip('.txt') would strip the *characters* '.', 't' and 'x'
        # from both ends and corrupt names that start or end with them.
        ID = basename

        answers, questions = get_answers(filein)

        if answers == '':
            no_qa.append(filein)
            print('\tERROR for file %s: 0 length. There is no standard '
                  'Question and Answers section.' % filein)
            continue

        words = len(get_tokens(answers))

        (total_entities, e_times, e_locations, e_organizations, e_persons,
         e_money, e_percentages, e_dates,
         total_entity_words, w_times, w_locations, w_organizations, w_persons,
         w_money, w_percentages, w_dates) = get_results(answers)

        # is_fwd rejects all-upper-case sentences, and get_answers returns
        # upper-cased text, so the text is lower-cased first.
        fwd = get_fwd_statements(answers.lower())

        # use textblob package to analyze the sentiment
        blob = TextBlob(answers)
        polarity = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity

        # count the frequencies of uncertain words
        total_cnt = get_uncertainty(filein)

        pos, neg, cnt_pos, cnt_neg = get_sentiments(answers)

        row = [ID, questions, words,
               total_entities, total_entity_words,
               e_times, w_times, e_locations, w_locations,
               e_organizations, w_organizations, e_persons, w_persons,
               e_money, w_money,
               e_percentages, w_percentages, e_dates, w_dates,
               fwd[0], fwd[1], round(fwd[2], 2),
               total_cnt,
               polarity, subjectivity,
               pos, neg]

        writer.writerow(row)

    print('\nAll %d text files are processed.\nThese %d text files do not have QA sections:\n' % (num_txt, len(no_qa)))
    print(no_qa)

    
    