In this notebook, unique entities that should be subjects are extracted from the data
Subjects to be extracted: 
- Countries (exported from pycountry)
- Companies
- Which companies own the mines 
- primary Commodities

In [24]:
import pandas as pd
import geopandas as gpd
import pycountry
import numpy as np

# import data
facilities = gpd.read_file("data/production/facilities.gpkg")
ownership = pd.read_csv("manual_input_data/ownership_cleaned.csv")

## Construct dataframe that links mine to company

In [25]:
# split strings by ","
ownership_df = ownership["owners"].str.split(",", expand=True)

# combine again with ids and years
ownership_df = pd.concat([ownership.loc[:,["facility_id", "year"]], ownership_df], axis = 1)

# pivot to tidy dataframe
ownership_df = pd.melt(ownership_df, id_vars=['facility_id', "year"], var_name='owner_nr', value_name='company')

# delete rows with NULL values
ownership_df = ownership_df[ownership_df.notnull().all(axis=1)]

# extract the ownership percentage and delete from old column
ownership_df['percentage'] = ownership_df['company'].str.extract(r'\((.*?)\)')
ownership_df['company'].replace(r'\((.*?)\)', "", inplace = True, regex = True)
ownership_df["company"] = ownership_df["company"].str.strip() # remove leading and trailing spaces

# add IRIs
ownership_df["company_iri"] = ownership_df["company"].replace(r'[^a-zA-Z\s]', '', regex = True) # remove all non-letters for iri
ownership_df["company_iri"] = ownership_df["company_iri"].str.strip() # remove leading and trailing spaces
ownership_df["company_iri"].replace(r'[\s]', '_', regex = True, inplace = True) #replace spaces with underscores in iri

In [26]:
ownership_df

Unnamed: 0,facility_id,year,owner_nr,company,percentage,company_iri
0,COM00001.00,2018,0,Hudbay Minerals Inc,100%,Hudbay_Minerals_Inc
1,COM00005.00,2019,0,Rusal Plc.,100%,Rusal_Plc
2,COM00006.00,2019,0,ArcelorMittal S.A.,100%,ArcelorMittal_SA
3,COM00007.00,2018,0,AngloGold Ashanti,100%,AngloGold_Ashanti
4,COM00009.00,2017,0,Newmont Mining Corporation,100%,Newmont_Mining_Corporation
...,...,...,...,...,...,...
2277,COM00397.00,2018,3,JX Nippon Mining and Metals,10%,JX_Nippon_Mining_and_Metals
2496,COM00840.00,2019,3,SMM Morenci Inc.,13%,SMM_Morenci_Inc
2523,COM00885.00,2020,3,Daewoo International Corporation and Korea Re...,7.5%,Daewoo_International_Corporation_and__Korea_Re...
2579,COM01010.00,2019,3,POS-Ore,20%,POSOre


In [27]:
# get only the most recent owners 
idx = ownership_df.groupby("facility_id")["year"].transform(max) == ownership_df["year"]
print(len(ownership_df))
len(ownership_df[idx])

819


788

In [28]:
# Tests:
# this company has ownership values for multiple years, so only the most recent one should be displayed (only one row)
assert len(ownership_df[idx][ownership_df[idx]["facility_id"] == "COM00400.00"]) == 1

# this company has multiple ownership values for one year, so all of them should be displayed
assert len(ownership_df[idx][ownership_df[idx]["facility_id"] == "COM00048.00"]) > 1

# display(ownership_df[idx][ownership_df[idx]["facility_id"] == "COM00400.00"])
# display(ownership_df[idx][ownership_df[idx]["facility_id"] == "COM00048.00"])

## Construct dataframe with unique companies

In [29]:
companies_df = ownership_df.loc[:,["company", "company_iri"]].drop_duplicates()

## Construct dataframe with operators

In [30]:
# tbd

## Construct dataframe with all countries

In [31]:
country_alpha_3 = []
country_name = []
for country in list(pycountry.countries):
    country_alpha_3.append(country.alpha_3)
    country_name.append(country.name)

countries_df = pd.DataFrame({"country_alpha_3":  country_alpha_3, "country_name":  country_name})

## Export data

In [32]:
import os

# create the intermediate directory if it does not exist
path = "./intermediate"
isExist = os.path.exists(path)
if not isExist:
    os.makedirs(path)
    print("The new directory is created!")

pd.DataFrame(companies_df).to_csv("intermediate/companies.csv", index = False)
pd.DataFrame(countries_df).to_csv("intermediate/countries.csv", index = False)
pd.DataFrame(ownership_df).to_csv("intermediate/ownership.csv", index = False)