In [1]:
import requests
import json
import numpy as np
import pandas as pd
import io
import time
import pickle
from datetime import datetime
pd.set_option('display.max_columns', None)

# Refactor (with functions)

### Input
1. holdings
2. previously classified
3. TASE list

### Process
1. prepare Fossil Free Funds list
2. prepare TASE list
3. prepare previously classified
4. classify holdings:
    1. look for previously classified holdings
    2. match holdings with TASE list (exact ISIN, issuer, or corp match)
    3. match holdings with FFF
        1. use open FIGI to add Ticker per ISIN in holdings
        2. match holdings with FFF by Ticker
        3. match holdings with FFF by company name (fuzzy)

### Output
holdings with additional columns:
1. is_fossil
2. source
3. confidence

## 1. prepare Fossil Free Funds list

In [25]:
# 1. fetch Fossil Free Funds list
def fetch_latest_fff_list():
    # fetch newest file from Fossil Free Funds
    # returns Dataframe read from excel file
    from bs4 import BeautifulSoup
    import urllib.request
    site = "https://fossilfreefunds.org/how-it-works"
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
    req = urllib.request.Request(site, headers=hdr)
    html_page = urllib.request.urlopen(req)
    soup = BeautifulSoup(html_page, "html.parser")
    links_in_page = [link.get('href') for link in soup.findAll('a')]
    fff_latest_company_screens_url = [l for l in links_in_page if 'Invest+Your+Values+company+screens' in l][0]
    print("Using "+fff_latest_company_screens_url)
    return pd.read_excel(fff_latest_company_screens_url, sheet_name=1)

def prepare_fff(df):
    # Input: Fossil Free Funds list as dataframe
    # Output: 
    criteria = (df['Fossil Free Funds: Coal screen'] == 'Y') | (df['Fossil Free Funds: Oil / gas screen'] == 'Y') | (df['Fossil Free Funds: Fossil-fired utility screen'] == 'Y')
    df['Fossil Fuels any'] = criteria
    print(df['Fossil Fuels any'].value_counts(dropna=False))
    df['Company'] = df['Company'].str.upper()
    df['Tickers'] = df['Tickers'].str.upper()
    fff = df[criteria]
    # explode lists, to get one row per ticker
    fff = fff.assign(Tickers=fff['Tickers'].str.split(',')).explode('Tickers')
    # remove irrelevant columns
    fff_cols = [c for c in fff.columns if 'Fossil Free' in c]
    id_cols = ["Company","Country","Tickers"]
    fff = fff[id_cols + fff_cols]
    fff = fff[fff['Tickers'].notnull()]
    fff['Tickers'] = fff['Tickers'].str.strip().str.upper()
    return fff

common_words_in_company_name = ['LTD', 'INC', 'CORP', 'CO', 'GROUP', 'PLC', 'HOLDINGS', '&']

In [8]:
fff_all = fetch_latest_fff_list()

Using https://iyv-charts.s3-us-west-2.amazonaws.com/files/Invest+Your+Values+company+screens+20210303.xlsx


In [28]:
fff = prepare_fff(fff_all)

False    3749
True     2642
Name: Fossil Fuels any, dtype: int64


Unnamed: 0,Company,Country,Tickers,Fossil Free Funds: Coal screen,Fossil Free Funds: Oil / gas screen,Fossil Free Funds: Macroclimate30 coal-fired utility screen,Fossil Free Funds: Fossil-fired utility screen,Fossil Free Funds: Clean200 screen
1,11 PLC,Nigeria,MOBIL,,Y,,,
2,1ST NRG CORP,United States,FNRC,,Y,,,
4,3A-BESTGROUP JSC,Kazakhstan,BSGR,,Y,,,
5,3D OIL LTD,Australia,MUE,,Y,,,
5,3D OIL LTD,Australia,TDO,,Y,,,


# Prepare input files
## 1. Fossil Free Funds list

In [None]:
# fetch newest file from Fossil Free Funds
from bs4 import BeautifulSoup
import urllib.request

site = "https://fossilfreefunds.org/how-it-works"
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

req = urllib.request.Request(site, headers=hdr)
html_page = urllib.request.urlopen(req)

soup = BeautifulSoup(html_page, "html.parser")
links_in_page = [link.get('href') for link in soup.findAll('a')]
fff_latest_company_screens_url = [l for l in links_in_page if 'Invest+Your+Values+company+screens' in l][0]


In [None]:
fff_all = pd.read_excel(fff_latest_company_screens_url, sheet_name=1)
criteria = (fff_all['Fossil Free Funds: Coal screen'] == 'Y') | (fff_all['Fossil Free Funds: Oil / gas screen'] == 'Y') | (fff_all['Fossil Free Funds: Fossil-fired utility screen'] == 'Y')
fff_all['Fossil Fuels any'] = criteria
print(fff_all['Fossil Fuels any'].value_counts(dropna=False))

fff_all['Company'] = fff_all['Company'].str.upper()
fff_all['Tickers'] = fff_all['Tickers'].str.upper()
fff_all

In [None]:
fff = fff_all[criteria]
# explode lists, to get one row per ticker
fff = fff.assign(Tickers=fff['Tickers'].str.split(',')).explode('Tickers')
fff
drop_cols = ['Deforestation Free Funds: Producer screen',
       'Deforestation Free Funds: Financier screen',
       'Deforestation Free Funds: Consumer brand screen',
       'Deforestation Free Funds: Palm oil producer screen',
       'Deforestation Free Funds: Palm oil consumer brand screen',
       'Deforestation Free Funds: Paper / pulp producer screen',
       'Deforestation Free Funds: Paper / pulp consumer brand screen',
       'Deforestation Free Funds: Rubber producer screen',
       'Deforestation Free Funds: Rubber consumer brand screen',
       'Deforestation Free Funds: Timber producer screen',
       'Deforestation Free Funds: Timber consumer brand screen',
       'Deforestation Free Funds: Cattle producer screen',
       'Deforestation Free Funds: Cattle consumer brand screen',
       'Deforestation Free Funds: Soy producer screen',
       'Deforestation Free Funds: Soy consumer brand screen',
       'Gender Equality Funds: Has Equileap gender equality score',
       'Gun Free Funds: Gun manufacturers screen',
       'Gun Free Funds: Gun retailers screen',
       'Prison Free Funds: Prison industry screen',
       'Prison Free Funds: Border industry screen',
       'Prison Free Funds: Higher risk screen',
       'Prison Free Funds: Private prison operators screen',
       'Weapons Free Funds: Major military contractor screen',
       'Weapons Free Funds: Cluster munitions / landmines screen',
       'Weapons Free Funds: Nuclear weapons screen',
       'Tobacco Free Funds: Tobacco producers screen',
       'Tobacco Free Funds: Tobacco-promoting entertainment companies screen']

fff.drop(drop_cols, axis=1, inplace=True)
fff = fff[fff['Tickers'].notnull()]
fff['Tickers'] = fff['Tickers'].str.strip()

In [None]:
fff['Tickers'].value_counts(dropna=False)

In [None]:
pd.set_option('display.max_rows', 500)
fff['Tickers'].sort_values().head(500)

In [None]:
fff[fff['Tickers'] == '00006']

### remove common words from Company name (Ltd, Plc, etc.)

In [None]:
fff_all['Company'].str.split(expand=True).stack().value_counts().head(30)

In [None]:
common_words_in_company_name = ['LTD', 'INC', 'CORP', 'CO', 'GROUP', 'PLC', 'HOLDINGS', '&']

## 2. holdings file 

In [None]:
holdings_path = "/Users/urimarom/Downloads/2020Q3 החזקות בדלקים פוסיליים.xlsx"
holdings = pd.read_excel(holdings_path, sheet_name=0)

In [None]:
## TODO: check for previously classified ISINs and corps (see "new Q holdings")

### Automatically identify columns with ISINs

In [None]:
isin_pattern = r"^[A-Z]{2}([A-Z0-9]){9}[0-9]$"
max_isin_cnt = 0
for col in holdings:
    print(col)
    isin_cnt = sum(holdings[col].astype(str).str.contains(isin_pattern, na=False))
    print(isin_cnt)
    if isin_cnt > max_isin_cnt:
        isin_col = col
        max_isin_cnt = isin_cnt

if max_isin_cnt > 0:
    print("ISIN col is: "+isin_col)

In [None]:
# focusing on new assets only here
# TODO: might want to start by looking for fossils in previous reports, and maintain known fossil ISINS
new_assets = holdings[holdings['ברשימת רבעון קודם'].isna()]
new_isins = new_assets[isin_col][new_assets[isin_col].str.contains(isin_pattern, na=False)].unique()
new_isins = pd.DataFrame(new_isins, columns=['ISIN'])
print(len(new_isins))

In [None]:
isins = holdings[isin_col][holdings[isin_col].str.contains(isin_pattern, na=False)].unique()
isins = pd.DataFrame(isins, columns=['ISIN'])
print(len(isins))

In [None]:
# isin2ticker = []
# i = 1
# for isin in isins:
# #     isin='US87612G1013'
#     print(isin)
#     r = requests.get('https://finnhub.io/api/v1/search?q='+isin+'&token=c0jbqgf48v6vejlecf80')
#     if r.status_code == 200:
#         j = r.json()
#         for i in range(j['count']):
#             isin2ticker.append([isin, j['result'][i]['symbol']])
#     else:
#         print("error fetching "+isin+" ,status code: "+r.status_code)


In [None]:
# isin2ticker

In [None]:
# isin='XS2224632971'
# print(isin)
# r = requests.get('https://finnhub.io/api/v1/search?q='+isin+'&token=c0jbqgf48v6vejlecf80')
# r

In [None]:
# if r.status_code == 200:
#     j = r.json()
#     print(j)
#     for i in range(j['count']):
#         isin2ticker.append([isin, j['result'][i]['symbol']])
# else:
#     print("error fetching "+isin+" ,status code: "+str(r.status_code))

Finnhub: 
Multiple ISINs get empty results from the API.
Looking for another solution.

## Try using FIGI API

In [None]:
# Copyright 2017 Bloomberg Finance L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import urllib.request
import urllib.parse

'''
See https://www.openfigi.com/api for more information.
'''
# tried ratelimit, didn't seem to work:
# from ratelimit import limits
# @limits(calls=25, period=60)

def map_jobs(jobs):
    '''
    Send an collection of mapping jobs to the API in order to obtain the
    associated FIGI(s).
    Parameters
    ----------
    jobs : list(dict)
        A list of dicts that conform to the OpenFIGI API request structure. See
        https://www.openfigi.com/api#request-format for more information. Note
        rate-limiting requirements when considering length of `jobs`.
    Returns
    -------
    list(dict)
        One dict per item in `jobs` list that conform to the OpenFIGI API
        response structure.  See https://www.openfigi.com/api#response-fomats
        for more information.
    '''
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    openfigi_url = 'https://api.openfigi.com/v2/mapping'
    request = urllib.request.Request(openfigi_url, data=bytes(json.dumps(jobs), encoding='utf-8'))
    request.add_header('Content-Type','application/json')
    if openfigi_apikey:
        request.add_header('X-OPENFIGI-APIKEY', openfigi_apikey)
    request.get_method = lambda: 'POST'
    connection = opener.open(request)
    if connection.code != 200:
        raise Exception('Bad response code {}'.format(str(response.status_code)))
    return json.loads(connection.read().decode('utf-8'))


def job_results_handler(jobs, job_results):
    '''
    Handle the `map_jobs` results.  See `map_jobs` definition for more info.
    Parameters
    ----------
    jobs : list(dict)
        The original list of mapping jobs to perform.
    job_results : list(dict)
        The results of the mapping job.
    Returns
    -------
        None
    '''
    for job, result in zip(jobs, job_results):
        job_str = '|'.join(job.values())
        figis_str = ','.join([d['figi'] for d in result.get('data', [])])
        result_str = figis_str or result.get('error')
        output = '%s maps to FIGI(s) ->\n%s\n---' % (job_str, result_str)
        print(output)


openfigi_apikey = ''  # Put API Key here
jobs = [
    {'idType': 'ID_ISIN', 'idValue': 'IL0011677668'}
]
job_results = map_jobs(jobs)
job_results_handler(jobs, job_results)

# get ticker
job_results[0]['data'][0]['ticker']

In [None]:
jobs = [{'idType': 'ID_ISIN', 'idValue': isin} for isin in isins['ISIN']]

i = 0
job_results = []
print("ISINs count: {}".format(len(isins)))

sleep_sec=20
print("estimated time: {i} minutes".format(len(isins)/10*sleep_sec/60))

while i < len(jobs):
    curr_jobs = jobs[i:i+10]
    print("processing {} to {}".format(i+1, i+10))
    job_results.append(map_jobs(curr_jobs))
    time.sleep(sleep_sec)
    i += 10
# job_results_handler(jobs, job_results)

In [None]:
sleep_sec=20
print("estimated time: {:.0f} minutes".format(len(isins)/10*sleep_sec/60))

In [None]:
job_results_flat = [item for sublist in job_results for item in sublist]
results = []
for i in range(len(isins)):
    result = {}
    result['isin'] = isins['ISIN'][i]
    if 'data' in job_results_flat[i]:
        result.update(job_results_flat[i]['data'][0])
    results.append(result)
    
results = pd.DataFrame(results)
drop_cols = ['uniqueIDFutOpt']
results.drop(drop_cols, axis=1, inplace=True)
results['ticker_simp'] = results['ticker'].str.split().str[0].str.upper().str.strip()
results['name'] = results['name'].str.upper().str.strip()

pickle_filename = 'figi_results-'+now.strftime("%Y-%m-%d-%H%M%S")+'.pickle'
with open(pickle_filename, 'wb') as f:
    pickle.dump(results, f)

In [None]:
# no FIGI records, handle separately
not_in_figi = results[results['ticker_simp'].isna()]['isin']
len(not_in_figi) / len(results)
# 3.6% missing, it's a go!

In [None]:
tickers_test = results.sort_values(by='ticker_simp')[['ticker', 'ticker_simp']]
tickers_test[tickers_test['ticker_simp'].str.isdigit().fillna(False)]

In [None]:
results['name'].str.split(expand=True).stack().value_counts().head(10)

# join with Fossil Free Funds list
## 1. Exact ticker match filtered by partial company name match

In [None]:
isin_figi_ticker_fff = pd.merge(left=results,
                                right=fff,
                                left_on='ticker_simp', 
                                right_on='Tickers',
                                how='left'
                               )

isin_figi_ticker_fff[isin_figi_ticker_fff['Fossil Fuels any'].notnull()]

### False positives (similar ticker, different company names)

In [None]:
isin_figi_ticker_fff[isin_figi_ticker_fff['isin'] == 'US81180WAZ41']

### False negatives (different ticker, same company)

In [None]:
print(isin_figi_ticker_fff[isin_figi_ticker_fff['isin'] == 'US7594701077'])
fff[fff['Company'].str.lower().str.startswith('reliance ind')]

In [None]:
print(isin_figi_ticker_fff[isin_figi_ticker_fff['isin'] == 'US71647NBE85'])
fff[fff['Company'].str.lower().str.startswith('petrole')]

In [None]:
# Should match by ticker
## TODO: remove leading zeros from tickers in both holdings and fff lists
isin_figi_ticker_fff[
    (isin_figi_ticker_fff['name'].notnull()) &
    (isin_figi_ticker_fff['name'].str.lower().str.contains('rusal'))
]

## Handle False Positives
Using partial fuzzy matching for ticker matches

In [35]:
# handle false positives - maybe by editing distance with company names
from fuzzywuzzy import fuzz
def company_names_match_score(row):
    holdings_company_name = str(row['name'])
    fff_company_name = str(row['Company'])
    if (len(holdings_company_name) >= 3) & (len(fff_company_name) >= 3):
        return fuzz.partial_ratio(holdings_company_name.lower(),fff_company_name.lower())

isin_figi_ticker_fff['company_names_partial_match_score'] = isin_figi_ticker_fff.apply(company_names_match_score, axis='columns')
isin_figi_ticker_fff[
    (isin_figi_ticker_fff['Company'].notnull()) &
    (isin_figi_ticker_fff['company_names_partial_match_score'] > 80)
                    ]

NameError: name 'isin_figi_ticker_fff' is not defined

In [48]:
from fuzzywuzzy import fuzz
def company_names_match_score_test(holdings_company_name, fff_company_name, min_len=3):
    if (not holdings_company_name) | (not fff_company_name):
        return np.nan
    holdings_company_name = str(holdings_company_name).strip().lower()
    fff_company_name = str(fff_company_name).strip().lower()
    if (len(holdings_company_name) >= min_len) & (len(fff_company_name) >= min_len):
        return fuzz.partial_ratio(holdings_company_name,fff_company_name)
    else:
        return np.nan
    
company_names_match_score_test(np.nan,np.nan)

    

100

# Company name fuzzy matching for all holdings vs. fff list

In [None]:
from fuzzywuzzy import process
fff_company_names = fff[fff['Company'].notnull()]['Company'].unique()
holdings_company_names = results[results['name'].notnull()]['name'].unique()
print(len(fff_company_names))
print(len(holdings_company_names))

### clean company names from common words

In [None]:
def remove_common_words(l, common):
    res = []
    for x in l:
        x = str(x)
        new = ' '.join([word for word in x.split() if word not in (common)])
        res.append(new)
    return res

# fff_company_names = remove_common_words(fff_company_names, common_words_in_company_name)
# holdings_company_names = remove_common_words(holdings_company_names, common_words_in_company_name)

## 1. fuzzy matching - partial ratio

In [None]:
partial_matches = {}
for c in holdings_company_names:
    s = str(c)
    partial_matches[c] = process.extractOne(s, fff_company_names, scorer=fuzz.partial_ratio)
pd.DataFrame(partial_matches).transpose().sort_values(by=1)

In [None]:
exact_partial_matches = pd.DataFrame(partial_matches).transpose()

## 2. another fuzzy matching attempt - using the sum of all scorers

In [None]:
def best_match(s, l, first_word_thresh=95):
    s = str(s)
    first_word_matches = process.extract(s.split()[0], l, scorer=fuzz.partial_ratio, limit = 10)
    max_agg_score = 0
    winner = ''
    for m in first_word_matches:
        if m[1] > first_word_thresh:
            agg_score = (
                fuzz.ratio(s, m) +
                fuzz.partial_ratio(s, m) +
                fuzz.token_sort_ratio(s, m) +
                fuzz.token_set_ratio(s, m) +
                fuzz.partial_token_sort_ratio(s, m) +
                fuzz.partial_token_set_ratio(s, m)
            )
            if agg_score > max_agg_score:
                max_agg_score = agg_score
                winner = m[0]
    return winner, max_agg_score

In [None]:
agg_matches = {}
for c in holdings_company_names:
    agg_matches[c] = best_match(c, fff_company_names)

In [None]:
agg_fuzzy_results = pd.DataFrame(agg_matches).transpose()
agg_fuzzy_results.rename({0: 'fff_name', 1: 'fuzzy_agg_score'}, axis=1, inplace=True)
agg_fuzzy_results = agg_fuzzy_results[agg_fuzzy_results['fuzzy_agg_score'] > 0].sort_values(by='fuzzy_agg_score', ascending=False)
agg_fuzzy_results

In [None]:
s = "RELIANCE INDS-SPONS GDR 144A"
l = fff_company_names

first_word_matches = process.extract(s.split()[0], l, scorer=fuzz.partial_ratio, limit = 10)
for m in first_word_matches:
    if m[1] > 80:
        print(
            m,
            fuzz.partial_ratio(s, m),
            fuzz.token_sort_ratio(s, m),
            fuzz.token_set_ratio(s, m),
            fuzz.partial_token_sort_ratio(s, m),
            fuzz.partial_token_set_ratio(s, m),
            fuzz.partial_ratio(s, m) +
            fuzz.token_sort_ratio(s, m) +
            fuzz.token_set_ratio(s, m) +
            fuzz.partial_token_sort_ratio(s, m) +
            fuzz.partial_token_set_ratio(s, m) +
            fuzz.ratio(s, m)
        )
        s2 = s[:len(m[0])]
        print(fuzz.ratio(s2, m))
        print(s2)


In [None]:
s = 'PETROBRAS GLOBAL FINANCE'
l = fff_company_names
# taking only the first word
first_word_matches = process.extract(s.split()[0], l, scorer=fuzz.partial_ratio, limit = 10)
for m in first_word_matches:
    if m[1] > 80:
        print(
            m,
            fuzz.partial_ratio(s, m),
            fuzz.token_sort_ratio(s, m),
            fuzz.token_set_ratio(s, m),
            fuzz.partial_token_sort_ratio(s, m),
            fuzz.partial_token_set_ratio(s, m)
        )

# def get_best_match(s, l):
#     process.extractOne(s, l, scorer=fuzz.partial_ratio)
# def add_best_match(left, right, threshold):
    

In [None]:
# do the same for Israeli holdings? (different ID - numerics only; join with local fossil list)

In [None]:
# maybe later: visualization for results

In [None]:
# bonus: build a website! Dash? streamlit?
# input: holdings csv - must have ISIN, value (same currency) per row 
# output: % of fossil, top fossil holdings, other stats, downloadable CSV

## Try going through LEI

In [None]:
# source: https://www.gleif.org/en/lei-data/lei-mapping/download-isin-to-lei-relationship-files
isin2lei = pd.read_csv('~/Downloads/ISIN_LEI_20210216.csv')
len(isin2lei)

In [None]:
merged = pd.merge(left=isins,
                  right=isin2lei,
                  on='ISIN', 
                  how='left'
                 )

merged

In [None]:
sum(merged['LEI'].isnull())

In [None]:
# about 33% of the ISINs missing in the isin2lei mapping.
# TODO: check for all ISINs in holdings file

In [None]:
# INPUT CSVS
# holdings = csv.read()
# fff = csv.read()

# add symbol per ISIN in holdings using finnhub API
## get a list of unique valid ISINs
## run API, get symbols

# per symbol, check if in fff and mark as fossil

# compare results to g-spreadsheet results

In [None]:
results_old = results