In [9]:
import numpy as np
import pandas as pd
from os import listdir, chdir, scandir
from os.path import isfile, join
import re
# import requests
# import bs4

## Scrape website and download multiple files

In [10]:
## Source: https://employersinfocmp.cma.gov.il/#/publicreports
# The commented-out request below is based on the XHR request the site generates, as seen in Chrome Developer Tools.

# TODO: Scrape the website / get API access

# request_url = "https://employersinfocmp.cma.gov.il/api/PublicReporting/GetPublicReports"
# request_headers = {"Host": "employersinfocmp.cma.gov.il",
# "Connection": "keep-alive",
# "Content-Length": "239",
# "Accept": "application/json, text/plain, */*",
# "sec-ch-ua-mobile": "?1",
# "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36",
# "content-type": "application/json",
# "Origin": "https://employersinfocmp.cma.gov.il",
# "Sec-Fetch-Site": "same-origin",
# "Sec-Fetch-Mode": "cors",
# "Sec-Fetch-Dest": "empty",
# "Referer": "https://employersinfocmp.cma.gov.il/",
# "Accept-Encoding": "gzip, deflate, br",
# "Accept-Language": "en-US,en;q=0.9,he;q=0.8"
# }
# payload = {"corporation": "",
#            "fromQuarter": "",
#             "fromYear": "",
#             "toQuarter": "",
#             "toYear": "",
#             "reportType": "",
#             "systemField": "",
#             "statusReport": 1,
#             "investmentName": "",
#             "reportFromDate": "",
#             "reportToDate": ""
#         };
# response = requests.post(request_url, data=payload, headers=request_headers)

# json_data = response.json()

## Process downloaded files --> Unified file

In [19]:
# Root folder of the reports; every sub-directory of it is processed as one domain.
reports_path = "/Users/urimarom/Downloads/holdings reports/2021q2 reports/"

# Holdings sheets extracted from each report workbook
# (roughly: stocks, corporate bonds, non-marketable corporate bonds, non-marketable stocks).
included_sheets = [
    'מניות',
    'אג"ח קונצרני',
    'לא סחיר - אג"ח קונצרני',
    'לא סחיר - מניות',
]

# Columns kept in the unified output, in output order.
# The last two ('גוף', 'אפיק') are added during processing and hold the
# company name and the domain respectively.
cols_to_keep = [
    'שם המנפיק/שם נייר ערך',
    'מספר ני"ע',
    'מספר מנפיק',
    'ענף מסחר',
    'שווי',
    'שעור מערך נקוב מונפק',
    'שעור מנכסי אפיק ההשקעה',
    'שעור מסך נכסי השקעה',
    'גוף',
    'אפיק',
]

In [20]:
# utility functions
def remove_arrow_symbol(s):
    ''' strip every ◄ character from a string
    Args:
      s: any cell value

    Returns:
      the string without ◄ characters; non-string values are returned unchanged
    '''
    if type(s) is not str:
        return s
    return re.sub(r'[◄]+','', s)

def remove_leading_numbering(s):
    ''' clean a label by dropping its numbering prefix and decorations
    Removes everything up to and including the last '.', any text between the
    first '(' and the last ')', and all ':' characters, trimming whitespace
    after each step.

    Args:
      s: any cell value

    Returns:
      the cleaned string; the original value if it is not a string or if
      cleaning would leave nothing
    '''
    if type(s) is not str:
        return s
    stripped = re.sub(r'^.*\.', '', s).strip()
    stripped = re.sub(r'\(.*\)', '', stripped).strip()
    stripped = stripped.replace(':','').strip()
    # fall back to the untouched input rather than returning an empty label
    return stripped if len(stripped) > 0 else s

def remove_empty_rows_and_cols(sheet):
    ''' remove columns and rows with only missing values
    Arrow symbols are stripped first and blank / whitespace-only strings are
    turned into missing values, so cells that only look empty are treated as
    empty too.

    Args:
      sheet (DataFrame): sheet to be handled

    Returns:
      DataFrame: sheet without empty rows and columns
    '''
    # remove special characters first; Series.map per column is used because
    # DataFrame.applymap is deprecated in recent pandas versions
    sheet = sheet.apply(lambda col: col.map(remove_arrow_symbol))
    # np.nan is passed explicitly: with value=None older pandas versions
    # pad-fill from the previous row instead of inserting a missing value
    sheet = sheet.replace(r'^\s*$', np.nan, regex=True)
    sheet = sheet.dropna(axis=0, how='all')
    sheet = sheet.dropna(axis=1, how='all')

    return sheet

def clean_sheet(sheet, null_pct_thresh=0.5):
    ''' drop sparse columns, then sparse rows
    A column is kept when its number of non-null cells is at least
    len(sheet) * null_pct_thresh; a row is then kept when its number of
    non-null cells is at least (remaining column count) * null_pct_thresh.

    Args:
      sheet (DataFrame): sheet to be handled
      null_pct_thresh (float): required fraction of non-null cells (0.5 = 50%)

    Returns:
      DataFrame: sheet without the sparse columns and rows
    '''
    min_filled_per_col = len(sheet)*null_pct_thresh
    dense_cols = sheet.dropna(axis=1, thresh=min_filled_per_col)
    # the row threshold is based on the columns that survived the first pass
    min_filled_per_row = len(dense_cols.columns)*null_pct_thresh
    return dense_cols.dropna(thresh=min_filled_per_row)

def verify_sheets(df_dict, sheets):
    ''' check that every required sheet is present in a loaded workbook
    Args:
      df_dict (dict): mapping of sheet name -> sheet content
      sheets (list): sheet names that must all be present

    Returns:
      bool: True when every name in `sheets` is a key of `df_dict`, else False
    '''
    # look the names up in the argument that was passed in, not in a
    # notebook-level variable that happens to exist while the loop runs
    missing_sheets = [s for s in sheets if s not in df_dict]
    if missing_sheets:
        print("missing sheets: {}".format(missing_sheets))
        return False
    print("all needed sheets found!")
    return True

In [21]:
def get_summary_sheet_name(report_df_dict):
    '''identify summary sheet by sheet name
    Returns the first key of report_df_dict that starts with "סכום".
    Raises IndexError when no key matches.
    '''
    candidates = [name for name in report_df_dict.keys() if name.startswith("סכום")]
    summary_sheet_name = candidates[0]
    print("Summary sheet: {}".format(summary_sheet_name))
    return summary_sheet_name

In [22]:
def get_company_name_from_summary_sheet(summary_sheet):
    '''get company name
    Finds the first cell (row-major order) whose text contains the word חברה.
    If that cell holds more than two words, the company name is taken from the
    words after the first two; otherwise it is taken from the cell immediately
    to the right. Only the first word of the name is returned.

    Args:
      summary_sheet (DataFrame): summary sheet of a report

    Returns:
      str: first word of the company name, or None when no name can be found
    '''
    keyword = 'חברה'
    # cell-wise test that ignores non-string cells, so numeric-only rows or
    # columns cannot break the search
    hits = summary_sheet.apply(
        lambda col: col.map(lambda cell: isinstance(cell, str) and keyword in cell)
    )
    hit_rows, hit_cols = np.where(hits.to_numpy(dtype=bool))
    if hit_rows.size == 0:
        print("no company found")
        return None

    # np.where yields positions, so index with iloc (labels may differ)
    row_pos, col_pos = hit_rows[0], hit_cols[0]
    company_cell = summary_sheet.iloc[row_pos, col_pos]
    words = company_cell.split()
    # handle reports with company name in the same cell as החברה המדווחת
    if len(words) > 2:
        full_company_name = ' '.join(words[2:])
    elif col_pos + 1 < summary_sheet.shape[1]:
        full_company_name = summary_sheet.iloc[row_pos, col_pos + 1]
    else:
        full_company_name = None

    # the neighbouring cell may be missing or hold a non-text value
    if not isinstance(full_company_name, str):
        print("no company found")
        return None

    company_name = full_company_name.partition(' ')[0]
    print("found company name: {}".format(company_name))
    return company_name
    
def get_asset_allocation_from_summary_sheet(summary_sheet):
    '''get asset allocation data from summary sheet
    The first cell containing the word מזומנים anchors the table. Rows are
    read from the anchor row downwards in the anchor column ("asset") and the
    two columns to its right ("sum", "pct"). Reading stops one row before the
    last filled cell that precedes the first blank cell of the anchor column
    (or before the last row of the sheet when there is no blank cell).

    Args:
      summary_sheet (DataFrame): summary sheet of a report

    returns: a DataFrame with columns "asset", "sum", "pct"; it is empty when
      the anchor is not found
    '''
    anchor = 'מזומנים'
    # cell-wise test that ignores non-string cells
    hits = summary_sheet.apply(
        lambda col: col.map(lambda cell: isinstance(cell, str) and anchor in cell)
    )
    hit_rows, hit_cols = np.where(hits.to_numpy(dtype=bool))
    if hit_rows.size == 0:
        print("No headers found :(((")
        # return an empty table instead of failing on an unbound variable
        return pd.DataFrame(columns=["asset", "sum", "pct"])

    anchor_row_num = hit_rows[0]
    anchor_col_num = hit_cols[0]
    headers = summary_sheet.iloc[anchor_row_num:, anchor_col_num]
    # find the first null in the headers (where to stop parsing), as a position
    null_offsets = np.flatnonzero(headers.isnull().to_numpy())
    if null_offsets.size > 0:
        headers_first_null = anchor_row_num + null_offsets[0]
    else:
        # the table runs to the end of the sheet
        headers_first_null = len(summary_sheet)
    headers_end_index = headers_first_null - 1

    rows = slice(anchor_row_num, headers_end_index)
    asset_alloc = pd.DataFrame({
        "asset": summary_sheet.iloc[rows, anchor_col_num],
        "sum": summary_sheet.iloc[rows, anchor_col_num + 1],
        "pct": summary_sheet.iloc[rows, anchor_col_num + 2],
    })
    return asset_alloc

In [23]:
def fix_col_name(col_name):
    '''Fix column names across files
    Removes asterisks, normalizes the spellings שיעור -> שעור and
    פידיון -> פדיון, trims surrounding whitespace and maps known variations
    of a column name to a single canonical name.

    Args:
      col_name: a header cell value

    Returns:
      the canonical column name; non-string values are returned unchanged
    '''
    if not isinstance(col_name, str):
        return col_name

    # remove * from col_name, fix שיעור to שעור
    col_name = re.sub(r'[*]+','', col_name)
    col_name = col_name.replace('שיעור','שעור')
    col_name = col_name.replace('פידיון','פדיון')
    # removing a trailing "*" can leave whitespace behind (e.g. "name *"),
    # which would otherwise prevent the lookup below from matching
    col_name = col_name.strip()

    # map variations to the same format
    canonical_names = {
        'שם נ"ע': 'שם המנפיק/שם נייר ערך',
        'שם המנפיק / שם נייר ערך': 'שם המנפיק/שם נייר ערך',
        'מספר נ"ע': 'מספר ני"ע',
        'פדיון/ריבית לקבל': 'פדיון/ריבית/דיבידנד לקבל',
        'פידיון/ריבית לקבל': 'פדיון/ריבית/דיבידנד לקבל',
        'פדיון/ ריבית לקבל': 'פדיון/ריבית/דיבידנד לקבל',
        'דיבידנד לקבל': 'פדיון/ריבית/דיבידנד לקבל',
        'שעור מנכסי השקעה': 'שעור מסך נכסי השקעה',
        'ספק המידע': 'ספק מידע',
        'שווי הוגן': 'שווי',
        'שווי שוק': 'שווי',
    }
    return canonical_names.get(col_name, col_name)
    
def fix_asset_name(asset_name):
    '''fix asset names across files
    Applies a fixed sequence of text substitutions to string values;
    non-string values are returned unchanged.
    '''
    if type(asset_name) is not str:
        return asset_name
    # order matters: the quote normalization must run before the ל"ס expansion
    substitutions = (
        ("''", '"'),
        ('ל"ס', 'לא סחיר'),
        ('הקופה', 'המסלול או הקרן'),
    )
    for old, new in substitutions:
        asset_name = asset_name.replace(old, new)
    return asset_name

In [24]:
def process_holdings_sheet(sheet, domain, company_name):
    '''process holdings sheet, preparing it to be merged across companies and types
    Args:
      sheet (DataFrame): raw holdings sheet, read without a header row
      domain: value written to the 'אפיק' column of every row
      company_name: value written to the 'גוף' column of every row

    Returns:
      DataFrame: the cleaned sheet, with its first remaining row used as the
      (normalized) header and the two extra columns added
    '''
    cleaned = clean_sheet(sheet)
    # use first row as header, fix column names
    header = cleaned.iloc[0].str.strip().map(fix_col_name)
    # take an explicit copy of the data rows: adding columns to a slice is
    # what triggers pandas' SettingWithCopyWarning
    holdings_rows = cleaned.iloc[1:].copy()
    holdings_rows.columns = header
    holdings_rows["גוף"] = company_name
    holdings_rows["אפיק"] = domain
    return holdings_rows

def process_asset_alloc(input_df, domain, company_name):
    '''process asset allocation data, preparing it to be merged across companies
    handle סחיר / לא סחיר by location in sheet

    Rows from the first 'סחיר' row up to the first 'לא סחיר' row are flagged
    marketable=True; rows from there up to the first row containing 'הלווא'
    are flagged marketable=False; all other rows keep a missing value.

    Args:
      input_df (DataFrame): table with an "asset" column (not modified)
      domain: value written to the 'אפיק' column of every row
      company_name: value written to the 'גוף' column of every row

    Returns:
      DataFrame: the flagged, cleaned table with normalized asset names

    Raises:
      IndexError: when one of the three section markers is not found
    '''
    df = input_df.copy()
    # object column of missing values, so True / False can be stored later
    df["marketable"] = np.full(len(df), np.nan, dtype=object)
    marketable_loc = np.where(df["asset"].str.contains('סחיר', na=False))[0][0]
    non_marketable_loc = np.where(df["asset"].str.contains('לא סחיר', na=False))[0][0]
    loans_loc = np.where(df["asset"].str.contains('הלווא', na=False))[0][0]
    # the locations are positions, so write through iloc in a single indexing
    # step; chained assignment (df[col][a:b] = ...) may write to a temporary copy
    marketable_col = df.columns.get_loc("marketable")
    df.iloc[marketable_loc:non_marketable_loc, marketable_col] = True
    df.iloc[non_marketable_loc:loans_loc, marketable_col] = False
    df = clean_sheet(df)
    df["asset"] = df["asset"].map(remove_leading_numbering).map(fix_asset_name)
    df["גוף"] = company_name
    df["אפיק"] = domain
    return df

In [25]:
def clean_results(df, cols_to_keep):
    '''clean results DataFrame - keep only necessary columns, remove rows that aren't holdings
    A row is kept when its security-name cell is not null, is not the string
    '0' and is not blank, and its security-number cell is not null.
    The returned columns are exactly cols_to_keep, in that order.
    '''
    name_col = 'שם המנפיק/שם נייר ערך'
    security_num_col = 'מספר ני"ע'

    wanted_labels = df.columns[df.columns.isin(cols_to_keep)]
    result = df[wanted_labels]

    # the filters are applied one after the other, each on the rows that
    # survived the previous one
    has_name = result[name_col].notnull()
    result = result[has_name]
    not_zero = result[name_col] != '0'
    result = result[not_zero]
    not_blank = result[name_col].str.strip() != ''
    result = result[not_blank]
    has_security_num = result[security_num_col].notnull()
    result = result[has_security_num]

    return result[cols_to_keep]

In [26]:
# loop over domains (subdirectories)
with scandir(reports_path) as entries:
    domains = [f.name for f in entries if f.is_dir()]

# Per-file results are collected in lists and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previously collected rows on every iteration.
holdings_parts = {s: [] for s in included_sheets}
asset_alloc_parts = []
column_names = {s: {} for s in included_sheets}

for d in domains:
    print("\nProcessing domain: {}".format(d))
    print("***********************")
    domain_path = reports_path+d
    # kept so the working directory ends up where it did before;
    # the files below are opened by full path and do not depend on it
    chdir(domain_path)
    # loop over files within each domain (subdirectory)
    reports_fn_list = [f for f in listdir(domain_path) if isfile(join(domain_path, f)) and not(f.startswith("."))]
    for fn in reports_fn_list:
        print("\nProcessing file: {}".format(fn))
        report_df_dict = pd.read_excel(join(domain_path, fn), sheet_name=None, header=None)
        summary_sheet_name = get_summary_sheet_name(report_df_dict)
        company_name = get_company_name_from_summary_sheet(report_df_dict[summary_sheet_name])
        asset_alloc = get_asset_allocation_from_summary_sheet(report_df_dict[summary_sheet_name])
        asset_alloc_parts.append(process_asset_alloc(asset_alloc, domain=d, company_name=company_name))
        # holdings are only taken from reports that contain every needed sheet
        if verify_sheets(report_df_dict, included_sheets):
            for sheet_name in included_sheets:
                sheet = process_holdings_sheet(sheet=report_df_dict[sheet_name],
                                               domain=d,
                                               company_name=company_name
                                              )
                holdings_parts[sheet_name].append(sheet)
                column_names[sheet_name][company_name] = sheet.columns

# pd.concat raises on an empty list, so fall back to an empty frame
asset_allocs = pd.concat(asset_alloc_parts) if asset_alloc_parts else pd.DataFrame()
holdings = {s: (pd.concat(parts) if parts else pd.DataFrame())
            for s, parts in holdings_parts.items()}


Processing domain: גמל
***********************

Processing file: Report_9_27_2021 (3).xlsx
Summary sheet: סכום נכסי הקרן
found company name: הפניקס
all needed sheets found!

Processing file: Report_9_27_2021 (1).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: אנליסט
all needed sheets found!

Processing file: Report_9_27_2021 (5).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: כלל
all needed sheets found!

Processing file: Report_9_27_2021 (7).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מור
all needed sheets found!

Processing file: Report_9_27_2021 (2).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מזרחי
all needed sheets found!

Processing file: yalin gemel 2021q2.xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: ילין
all needed sheets found!

Processing file: Report_9_27_2021 (6).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מגדל
all needed sheets found!

Processing file: Report_9_27_2021 (4).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: הראל
all needed sheets found!

Processing file: Report_9_27_2021 (8).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מיטב
all needed sheets found!

Processing file: Report_9_28_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: אלטשולר
all needed sheets found!

Processing file: Report_9_27_2021 (9).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מנורה
all needed sheets found!

Processing file: Report_9_27_2021 (10).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: פסגות
all needed sheets found!

Processing domain: פנסיה
***********************

Processing file: Report_9_27_2021 (3).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: הראל
all needed sheets found!

Processing file: Report_9_27_2021 (1).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: הלמן
all needed sheets found!

Processing file: Report_9_27_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: אלטשולר
all needed sheets found!

Processing file: Report_9_27_2021 (5).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מגדל
all needed sheets found!

Processing file: Report_9_27_2021 (7).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מנורה
all needed sheets found!

Processing file: Report_9_27_2021 (2).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: הפניקס
all needed sheets found!

Processing file: Report_9_27_2021 (6).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מיטב
all needed sheets found!

Processing file: Report_9_27_2021 (4).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: כלל
all needed sheets found!

Processing file: Report_9_27_2021 (8).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: פסגות
all needed sheets found!

Processing domain: ביטוח
***********************

Processing file: Report_9_27_2021 (3).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: הפניקס
all needed sheets found!

Processing file: Report_9_27_2021 (1).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מגדל
all needed sheets found!

Processing file: menora - 2021q2 - bituach.xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: מנורה
all needed sheets found!

Processing file: Report_9_27_2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: כלל
all needed sheets found!

Processing file: Report_9_27_2021 (2).xlsx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary sheet: סכום נכסי הקרן
found company name: הראל
all needed sheets found!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [27]:
# save the combined asset allocation table next to the reports
asset_allocs.to_csv("{}asset_allocs.csv".format(reports_path))

In [28]:
# write results to excel: one sheet per holdings type
output_path = reports_path+'quarterly_holdings_for_classification.xlsx'

# The context manager saves and closes the workbook on exit, also when an
# error occurs; ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter(output_path) as writer:
    for k in holdings:
        df = clean_results(holdings[k], cols_to_keep=cols_to_keep)
        df.to_excel(writer, sheet_name = k, index = False)
print("Unified holdings file written to: {}".format(output_path))

Unified holdings file written to: /Users/urimarom/Downloads/holdings reports/2021q2 reports/quarterly_holdings_for_classification.xlsx
