In [None]:
#####################
#We process the raw data retrieved from the providers
####################

In [None]:
#import libraries
import pandas as pd
import os
from bs4 import BeautifulSoup
import glob
import json
from datetime import datetime, timedelta
import hashlib

In [None]:
# Directories holding the raw provider downloads (IB statement XML, AV price JSON)
IB_path = './../IB_API/IB/'
AV_path = './../IB_API/AV/'
# Valuation dates: every 31 December from 2019 back to 2007, newest first
valuation_dates = ['{}-12-31'.format(year) for year in range(2019, 2006, -1)]
# Output files: records with at least one price / records with no price at all
have_data_path = './financials.txt'
missing_data_path = './financials_no_prices.txt'
# Salt for the PBKDF2 hash that replaces company symbols.
# NOTE(review): the salt is hard-coded in the notebook; consider loading it
# from the environment if the hashed symbols are meant to stay anonymous.
secret_salt = b"secret"

In [None]:
#parse file and return data in list of dictionary items
def parse_IB_data(item_path):
    """Parse one IB statement XML file into a list of per-period records.

    Parameters
    ----------
    item_path : str
        Path to the XML file. The file name is taken as the company symbol
        and the name of its parent directory as the currency.

    Returns
    -------
    list of dict
        One dict per ``FiscalPeriod`` element with the keys ``Symbol``
        (salted PBKDF2 hash of the file name, first 10 hex characters),
        ``Country``, ``Currency``, ``PeriodType``, ``PeriodEndDate``,
        ``SourceDate``, ``Source`` and ``AuditorOpinion``, followed by one
        float entry per ``lineItem`` keyed by its ``coaCode``. Empty when the
        file contains no ``FiscalPeriod`` elements.

    Notes
    -----
    Missing optional pieces are replaced by sentinels instead of raising:
    ``Country``/``Source`` become ``'Missing'``, ``AuditorOpinion`` becomes
    ``'None'``, ``SourceDate`` falls back to ``valuation_dates[0]`` and line
    items without text are skipped.
    """
    # Use os.path rather than splitting on "/" so that paths assembled with
    # os.path.join are handled whatever the platform separator is.
    raw_symbol = os.path.basename(item_path)
    ccy = os.path.basename(os.path.dirname(item_path))
    #hash company symbol
    symbol = hashlib.pbkdf2_hmac('sha256', raw_symbol.encode('utf-8'), secret_salt, 100000).hex()[0:10]

    with open(item_path) as fd:
        soup = BeautifulSoup(fd.read(), "xml")

    # find() returns None when the element is absent; do not crash on files
    # that carry no Exchange element or no Country attribute.
    exchange = soup.find('Exchange')
    country = 'Missing' if exchange is None else exchange.get('Country', 'Missing')

    #collect the statement data
    item_data = []
    for period in soup.find_all('FiscalPeriod'):
        #extract the data from within the statements
        entry = {
            'Symbol': symbol,
            'Country': country,
            'Currency': ccy,
            'PeriodType': period['Type'],
            'PeriodEndDate': period['EndDate'],
        }

        #deal with missing item attributes
        source = period.find('Source')
        if source is None or not source.has_attr('Date'):
            entry['SourceDate'] = valuation_dates[0]
            entry['Source'] = 'Missing'
        else:
            entry['SourceDate'] = source['Date']
            entry['Source'] = source.string

        auditoropinion = period.find('AuditorOpinion')
        if auditoropinion is None:
            entry['AuditorOpinion'] = 'None'
        else:
            entry['AuditorOpinion'] = auditoropinion.get('Code', 'None')

        #Add all the line item data
        for line in period.find_all('lineItem'):
            text = line.string
            # .string is None for a tag without a single text child; such an
            # item has no value to convert, so leave the column unset.
            if text is None or not text.strip():
                continue
            entry[line['coaCode']] = float(text)
        item_data.append(entry)

    return item_data

In [None]:
def increment_date(date_in):
    """Return the calendar day following ``date_in``.

    Both the argument and the result are ``'%Y-%m-%d'`` formatted strings.
    """
    date_format = '%Y-%m-%d'
    current_day = datetime.strptime(date_in, date_format)
    following_day = current_day + timedelta(days=1)
    return following_day.strftime(date_format)

#parse the stock prices
def parse_AV_data(AV_path, dates, window_days=30):
    """Return the average adjusted close following each date in ``dates``.

    Parameters
    ----------
    AV_path : str
        Path to a JSON file whose ``'Time Series (Daily)'`` entry maps
        ``'%Y-%m-%d'`` date strings to quotes holding a
        ``'5. adjusted close'`` value.
    dates : list of str
        Reference dates in ``'%Y-%m-%d'`` format.
    window_days : int, optional
        Number of calendar days after each reference date (the reference
        date itself is excluded) that are searched for quotes. Default 30.

    Returns
    -------
    list
        One value per entry of ``dates``: the mean of the adjusted close
        prices found inside the window, or ``-1`` when the file has no
        ``'Time Series (Daily)'`` entry or no quote falls inside the window.
    """
    prices = [-1] * len(dates)
    with open(AV_path) as json_file:
        data = json.load(json_file)
    if 'Time Series (Daily)' not in data:
        return prices

    series = data['Time Series (Daily)']
    date_format = '%Y-%m-%d'
    one_day = timedelta(days=1)
    for idx, reference_date in enumerate(dates):
        day = datetime.strptime(reference_date, date_format)
        # Accumulate a running total; the previous code overwrote the price
        # on every match and then divided only the last one by the count.
        total = 0.0
        price_matched = 0
        for _ in range(window_days):
            day += one_day
            key = day.strftime(date_format)
            if key in series:
                total += float(series[key]['5. adjusted close'])
                price_matched += 1
        if price_matched > 0:
            #take the average
            prices[idx] = total / price_matched
    return prices

In [None]:
# step through each IB stock
# Directory names to prune from the walk. This has to be a collection of
# names: with a plain string, `d not in exclude` is a substring test and
# would also prune directories called e.g. "data" or "no".
exclude = {"nodata"}
have_data = []
missing_data = []
cnt = 0

for (dirpath, dirnames, filenames) in os.walk(IB_path, topdown=True):
    # prune in place so that os.walk does not descend into excluded directories
    dirnames[:] = [d for d in dirnames if d not in exclude]
    # only directories without remaining sub-directories are processed
    if dirnames:
        continue
    # the directory name is used as the currency of every file it holds
    ccy = os.path.basename(dirpath)
    for sym in filenames:
        #check that we have stock data before continuing
        # escape the literal part so glob metacharacters in a file name are
        # not treated as wildcards; sort so the chosen file is reproducible
        av_search_file = glob.escape(os.path.join(AV_path, ccy, sym)) + '_*'
        av_matches = sorted(glob.glob(av_search_file))
        if not av_matches:
            continue
        av_file = av_matches[0]

        item_path = os.path.join(dirpath, sym)
        financial_data = parse_IB_data(item_path)

        #get dates we want for stock prices: one per statement, then the valuation dates
        dates = [entry['SourceDate'] for entry in financial_data]
        dates += valuation_dates
        stock_data = parse_AV_data(av_file, dates)
        # split the prices back into source-date prices and valuation points
        source_prices = stock_data[:len(financial_data)]
        stock_valuations = stock_data[len(financial_data):]

        #add in the stock price at each valuation date
        for entry in financial_data:
            for valuation_date, valuation_price in zip(valuation_dates, stock_valuations):
                entry[valuation_date] = valuation_price

        # -1 marks "no price found", so any positive value means we have prices
        has_price = any(price > 0 for price in source_prices + stock_valuations)
        if has_price:
            #now we have all of the valuation data we include these in the financials data
            have_data += financial_data
        else:
            #we keep the financials without any prices just in case
            missing_data += financial_data
        cnt += 1
        if cnt % 500 == 0:
            print(cnt)

#save the processed data to disk
financials_missing_df = pd.DataFrame(missing_data)
financials_missing_df.to_csv(missing_data_path, sep='\t')
financials_df = pd.DataFrame(have_data)
financials_df.to_csv(have_data_path, sep='\t')