In this notebook, the unique entities that should become subjects are extracted from the data.
Subjects to be extracted:
- Countries (taken from pycountry)
- Companies
- Ownership links (which companies own which mines)
- Primary commodities

In [150]:
import pandas as pd
import geopandas as gpd
import pycountry
import numpy as np

# Load the input tables: the facilities file (read with geopandas) and the
# manually curated ownership table (read with pandas).
facilities_path = "data/production/facilities.gpkg"
ownership_path = "manual_input_data/ownership_cleaned.csv"

facilities = gpd.read_file(facilities_path)
ownership = pd.read_csv(ownership_path)

## Construct dataframe that links mine to company

In [151]:
# Build a tidy table with one row per (facility, year, owner).
#
# Input:  `ownership` with columns "facility_id", "year" and "owners", where
#         "owners" is a comma-separated string of owners, each optionally
#         followed by a parenthesised share, e.g. "Name (50%)".
# Output: `ownership_df` with columns facility_id, year, owner_nr, company,
#         percentage, company_iri.

# Split the comma-separated owners string into one column per owner.
owners_wide = ownership["owners"].str.split(",", expand=True)

# Re-attach the facility id and year so every owner column keeps its keys.
owners_wide = pd.concat([ownership.loc[:, ["facility_id", "year"]], owners_wide], axis=1)

# Reshape from wide (one column per owner) to long (one row per owner).
ownership_df = pd.melt(
    owners_wide,
    id_vars=["facility_id", "year"],
    var_name="owner_nr",
    value_name="company",
)

# Drop rows with any missing value (facilities with fewer owners than the
# maximum leave empty cells after the split). The explicit .copy() makes the
# filtered frame independent, so the column assignments below do not raise
# SettingWithCopyWarning.
ownership_df = ownership_df[ownership_df.notnull().all(axis=1)].copy()

# Pull the text between the first pair of parentheses into its own column
# (kept as a string), then remove every parenthesised part from the name.
# Results are assigned back explicitly: calling replace(..., inplace=True)
# on a selected column is chained assignment, which does not update the
# frame when pandas Copy-on-Write is active.
ownership_df["percentage"] = ownership_df["company"].str.extract(r'\((.*?)\)', expand=False)
ownership_df["company"] = (
    ownership_df["company"]
    .replace(r'\((.*?)\)', "", regex=True)
    .str.strip()  # remove leading and trailing spaces
)

# Derive an IRI-safe identifier: keep only ASCII letters and whitespace,
# trim, then turn each remaining whitespace character into an underscore.
# NOTE(review): digits and non-ASCII letters are dropped, so distinct company
# names can collapse to the same IRI -- confirm this is acceptable.
ownership_df["company_iri"] = (
    ownership_df["company"]
    .replace(r'[^a-zA-Z\s]', '', regex=True)
    .str.strip()
    .replace(r'[\s]', '_', regex=True)
)

## Construct dataframe with unique companies

In [152]:
# Keep one row per distinct (company, company_iri) pair.
companies_df = ownership_df[["company", "company_iri"]].drop_duplicates()

## Construct dataframe with operators

In [153]:
# TODO: construct the operators dataframe (not yet implemented)

## Construct dataframe with all countries

In [154]:
# Gather the alpha-3 code and the name of every country that pycountry lists,
# keeping both lists in the same order so they line up row by row.
all_countries = list(pycountry.countries)
country_alpha_3 = [entry.alpha_3 for entry in all_countries]
country_name = [entry.name for entry in all_countries]

countries_df = pd.DataFrame(
    {
        "country_alpha_3": country_alpha_3,
        "country_name": country_name,
    }
)

## Export data

In [155]:
import os

# Directory that receives the intermediate CSV files. Every output path below
# is derived from this single variable so the directory that is created and
# the directory that is written to cannot drift apart.
path = "./intermediate"

# Create the directory if needed. exist_ok=True avoids the check-then-create
# race of testing for existence first; the message is still only printed when
# the directory was not there beforehand.
newly_created = not os.path.isdir(path)
os.makedirs(path, exist_ok=True)
if newly_created:
    print("The new directory is created!")

# The three tables are already DataFrames, so they are written directly.
companies_df.to_csv(os.path.join(path, "companies.csv"), index=False)
countries_df.to_csv(os.path.join(path, "countries.csv"), index=False)
ownership_df.to_csv(os.path.join(path, "ownership.csv"), index=False)