In [1]:
from sec_edgar_downloader import Downloader
import timeit
# import our libraries
import re
import requests
import unicodedata
from bs4 import BeautifulSoup
import pprint
import pandas
import sys
import os
import spacy
import numpy as np
sys.setrecursionlimit(100000)
#%load_ext line_profiler
import urllib.request
import time
import os
nlp = spacy.load("en_core_web_md")
from datetime import datetime as dt
import unidecode
import gender_guesser.detector as gender


In [2]:
# Tab-separated lookup file; judging by its name and the join key below it maps
# tickers to CIK identifiers (TODO confirm the column layout).
cik = pandas.read_csv('ticker_cik.txt', sep='\t', lineterminator='\n')

# S&P 500 table with its ticker column lower-cased, presumably so that it matches
# the casing used in the lookup file.
SP500 = pandas.read_csv('SP500.csv').assign(
    co_tic=lambda frame: frame['co_tic'].str.lower()
)

# Default (inner) join on the ticker column: keeps the rows present in both tables.
cik_SP500 = SP500.merge(cik, on='co_tic')

In [7]:
def find_board_table(data, searched_word):
    """Return the text of the HTML table that mentions ``searched_word`` most often.

    Heuristic: the board members are assumed to be listed inside an HTML
    ``<table>``, and that table is assumed to be the one in which the searched
    word (e.g. "director") appears the most. If that assumption is often wrong,
    the heuristic is unreliable and has to be reworked.

    Parameters
    ----------
    data : str or file object
        HTML markup handed to BeautifulSoup (the notebook passes an open file).
    searched_word : str
        Word to look for. It is matched literally (regex metacharacters are
        escaped) and case-insensitively.

    Returns
    -------
    str
        Text of the ``<td>`` cells of the best table, joined with single spaces
        and transliterated to ASCII with ``unidecode``. An empty string is
        returned when the word does not occur as a whole word in any table.
    """
    soup = BeautifulSoup(data, 'html.parser')

    # Escape the word so that characters such as "." or "(" are not interpreted
    # as regex syntax. BeautifulSoup applies the pattern with ``search``, so no
    # surrounding ".*" is needed to match inside a longer string.
    escaped_word = re.escape(searched_word)
    results = soup.find_all(string=re.compile(escaped_word, re.IGNORECASE))

    print('Found the word "{0}" {1} times'.format(searched_word, len(results)))

    # Whole-word, case-insensitive counter for the word that was actually asked
    # for (it used to be hard-coded to "director").
    word_pattern = re.compile(r'\b{0}\b'.format(escaped_word), re.IGNORECASE)

    best_text = ''          # stays a string so unidecode never receives a list
    best_occurences = 0
    seen_tables = set()     # ids of tables already scored

    # Among all tables containing the word, keep the text (without tags) of the
    # one that contains it the most.
    for content in results:
        table = content.find_parent('table')
        if table is None or id(table) in seen_tables:
            # Either the match is outside any table, or several matches live in
            # the same table and it has already been scored.
            continue
        seen_tables.add(id(table))

        text = " ".join(td.text for td in table.find_all('td'))
        text = re.sub(' +', ' ', text)
        word_occurences = len(word_pattern.findall(text))
        if word_occurences > best_occurences:
            best_text = text
            best_occurences = word_occurences

    return unidecode.unidecode(best_text)

def find_entities(text):
    """Return the distinct PERSON entities found in ``text``, in first-seen order.

    The text is run through the module-level spaCy pipeline ``nlp`` (loaded from
    the pretrained ``en_core_web_md`` model, which must be installed). No
    training happens here: the model is only used for prediction. The returned
    names are expected to contain the board members.
    """
    person_names = (ent.text for ent in nlp(text).ents if ent.label_ == 'PERSON')
    # dict.fromkeys drops repeated names while keeping the order of first appearance.
    return list(dict.fromkeys(person_names))

def _collapse_repeated_name(name):
    """Return the first half of ``name`` when it is the same words written twice."""
    words = name.split()
    half = len(words) // 2
    lowered_words = name.lower().split()
    if lowered_words[:half] == lowered_words[half:]:
        return " ".join(words[:half])
    return name


def clean_results(board_members):
    """Clean a list of raw person-entity strings into a list of board member names.

    Steps, in order:

    1. replace line breaks by spaces, remove the "/s/" signature marker, strip
       and collapse runs of spaces;
    2. drop entries made of a single word (or empty) and entries mentioning
       "signature";
    3. drop case-insensitive duplicates, keeping the first spelling seen;
    4. drop every entry that contains another entry (for example two people
       merged into one entity, or a name followed by a title). The comparison is
       case-insensitive, consistently with step 3;
    5. collapse an entry that is the same name written twice in a row.

    Parameters
    ----------
    board_members : list of str
        Raw entity texts.

    Returns
    -------
    list of str
        The cleaned names, in their original order. The list is also printed.
    """
    # Step 1: normalise whitespace and remove signature markers.
    cleaned = [
        re.sub(' +', ' ', name.replace('\n', ' ').replace('/s/', '').strip())
        for name in board_members
    ]

    # Step 2: a usable name has at least two words and is not a signature label.
    candidates = [
        name for name in cleaned
        if len(name.split()) > 1 and 'signature' not in name.lower()
    ]

    # Step 3: case-insensitive de-duplication, first occurrence wins.
    seen = set()
    distinct = []
    for name in candidates:
        key = name.lower()
        if key not in seen:
            seen.add(key)
            distinct.append(name)

    # Step 4: compare lower-cased forms so that 'Jane Doe Director' is recognised
    # as containing 'JANE DOE'. Keys are unique after step 3, so `other != key`
    # only excludes the entry itself.
    lowered = [name.lower() for name in distinct]
    unique_board_members = [
        name
        for name, key in zip(distinct, lowered)
        if not any(other in key for other in lowered if other != key)
    ]

    # Step 5: 'Jane Doe Jane Doe' -> 'Jane Doe'.
    unique_board_members = [_collapse_repeated_name(name) for name in unique_board_members]

    print('The cleaned board list is :\n', unique_board_members)
    return unique_board_members
    
def get_gender_ratio(board_members):
    """Print the share of male, female and unknown-gender names on a board.

    The gender of each member is predicted by ``gender_guesser`` from the first
    whitespace-separated token of the name. 'male' and 'mostly_male' count as
    male, 'female' and 'mostly_female' count as female, and any other prediction
    counts as unknown.

    Parameters
    ----------
    board_members : list of str
        Names of the board members.

    Returns
    -------
    None
        The ratios are printed, not returned. For an empty list a short notice
        is printed instead of dividing by zero.
    """
    total = len(board_members)
    if total == 0:
        # No members: the ratios are undefined, so report that rather than crash.
        print('On this board :\nno board members to classify')
        return

    d = gender.Detector(case_sensitive=False)
    male, female, unknown = 0, 0, 0
    for member in board_members:
        name_parts = member.split()
        # A blank entry has no first name to look up; count it as unknown.
        pred = d.get_gender(name_parts[0]) if name_parts else 'unknown'
        if pred in ('male', 'mostly_male'):
            male += 1
        elif pred in ('female', 'mostly_female'):
            female += 1
        else:
            unknown += 1
    print('On this board :\nthe ratio of male is {0}\nthe ratio of female is {1}\nthe ratio of unknown gender is {2}'.format(male/total, female/total, unknown/total))

In [None]:
start = time.time()

#board_wrds=pandas.read_csv('WRDS_boardmembers.csv',encoding = "ISO-8859-1")
#board_wrds=pandas.read_csv('AAPL_boardmembers.csv')
#board_wrds.AnnualReportDate=pandas.to_datetime(board_wrds.AnnualReportDate,format='%Y%m%d')
#board_wrds = board_wrds[board_wrds.RowType != 'Disclosed Earner']

counter_notanalyzed=0   # number of filings for which the extraction failed
result_dict=[]
result_dict2=[]
path = 'sec_edgar_filings'

# Collect every HTML filing found under `path`. os.walk yields entries in an
# arbitrary order, so the list is sorted to make the run reproducible.
files = sorted(
    os.path.join(root, filename)
    for root, _dirs, filenames in os.walk(path)
    for filename in filenames
    if '.html' in filename
)

# To try the pipeline on a subset, iterate over files[:100] instead.
for filing_path in files:
    try:
        # The code reads the path as <path>/<ticker>/<name>.html, where <name>
        # is dash-separated and its second field is a two-digit year
        # (presumably an EDGAR accession number -- TODO confirm).
        parts = os.path.splitext(os.path.relpath(filing_path, path))[0].split(os.sep)
        ticker = parts[0]
        # %y resolves the century itself (69-99 -> 19xx, 00-68 -> 20xx), so a
        # filing from the 1990s is no longer dated 209x.
        year = dt.strptime(parts[1].split('-')[1], "%y")
        print(year)

        # The context manager closes the file even when parsing fails.
        with open(filing_path, "r") as data:
            text = find_board_table(data, 'Director')
        board_members = find_entities(text)
        board_members = clean_results(board_members)

    except Exception as err:
        # Best effort: skip the filing but say which one failed and why.
        # KeyboardInterrupt is not caught, so the loop can still be stopped.
        counter_notanalyzed += 1
        print('Text not analyzed ({0}): {1!r}'.format(filing_path, err))

print(counter_notanalyzed, 'of', len(files), 'filings could not be analyzed')
print('processing took : ',time.time() - start, 'seconds')

2014-01-01 00:00:00
Found the word "Director" 48 times
The cleaned board list is :
 ['THOMAS C. FREYMAN', 'ROBERT E. FUNCK', 'ROBERT J. ALPERN', 'ROXANNE S. AUSTIN', 'SALLY E. BLOUNT', 'EDWARD M. LIDDY', 'NANCY MCKINSTRY']
2003-01-01 00:00:00
Found the word "Director" 53 times
The cleaned board list is :
 ['ROXANNE S. AUSTIN', 'RICHARD A. GONZALEZ', 'JEFFREY M. LEIDEN', 'JACK M. GREENBERG', 'THOMAS C. FREYMAN', 'DAVID A. JONES', 'GREG W. LINDER', 'DAVID A. L. OWEN']
2016-01-01 00:00:00
Found the word "Director" 41 times
The cleaned board list is :
 ['BRIAN B. YOOR', 'ROBERT E. FUNCK', 'ROBERT J. ALPERN', 'Roxanne S. Austin', 'SALLY E. BLOUNT', 'EDWARD M. LIDDY', 'NANCY MCKINSTRY']
2013-01-01 00:00:00
Found the word "Director" 46 times
The cleaned board list is :
 ['THOMAS C. FREYMAN', 'GREG W. LINDER', 'ROBERT J. ALPERN', 'ROXANNE S. AUSTIN', 'SALLY E. BLOUNT', 'EDWARD M. LIDDY', 'NANCY MCKINSTRY']
2008-01-01 00:00:00
Found the word "Director" 42 times
The cleaned board list is :
 ['RO