This part of the code collects a dictionary of all labels and their descriptions.

In [3]:
import os
import json
import pandas as pd


def extract_us_gaap_elements():
    """
    Collect the label and description of every us-gaap element found in the
    JSON files of the current working directory.

    Every ``*.json`` file is parsed and its ``facts -> us-gaap`` mapping
    (element name -> details) is read. One row is produced per file and
    element, so an element present in several files appears several times;
    duplicates are deliberately kept.

    Files that cannot be decoded are reported with ``print`` and skipped.
    Files whose content does not have the expected object structure
    contribute no rows.

    Returns:
        A pandas DataFrame with the columns ``metrics``, ``label`` and
        ``description``. The columns are present even when no row was found.
    """
    columns = ['metrics', 'label', 'description']
    data = []
    current_directory = os.getcwd()

    # Sorted so the row order does not depend on the file system's listing order
    for filename in sorted(os.listdir(current_directory)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(current_directory, filename)
        if not os.path.isfile(file_path):
            # e.g. a directory whose name happens to end in ".json"
            continue

        # Open and parse the JSON file (JSON text is UTF-8, so do not rely on
        # the platform's default encoding)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error decoding JSON in file {filename}: {e}")
            continue

        # Valid JSON is not necessarily an object: guard every level before
        # calling .get() on it
        facts = json_data.get('facts') if isinstance(json_data, dict) else None
        us_gaap_elements = facts.get('us-gaap') if isinstance(facts, dict) else None
        if not isinstance(us_gaap_elements, dict):
            continue

        # Extract relevant fields
        for metric, details in us_gaap_elements.items():
            if not isinstance(details, dict):
                details = {}
            data.append({
                'metrics': metric,
                'label': details.get('label', ''),
                'description': details.get('description', ''),
            })

    # Passing the columns explicitly keeps the schema stable for empty results
    df = pd.DataFrame(data, columns=columns)

    # Duplicates are kept on purpose: there is one row per file and element
    # df = df.drop_duplicates()

    return df


# Example usage: build the element table from the JSON files in the current
# working directory and print it.
df = extract_us_gaap_elements()
print(df)

                                                  metrics  \
0                                  AccountsPayableCurrent   
1                            AccountsReceivableNetCurrent   
2                AccrualForEnvironmentalLossContingencies   
3       AccrualForEnvironmentalLossContingenciesDiscou...   
4           AccrualForEnvironmentalLossContingenciesGross   
...                                                   ...   
275278             RestructuringReserveAccrualAdjustment1   
275279  AssetsOfDisposalGroupIncludingDiscontinuedOper...   
275280  DisposalGroupIncludingDiscontinuedOperationCon...   
275281                      FinanceLeasePrincipalPayments   
275282  LiabilitiesOfDisposalGroupIncludingDiscontinue...   

                                                    label  \
0                               Accounts Payable, Current   
1       Accounts Receivable, after Allowance for Credi...   
2            Accrual for Environmental Loss Contingencies   
3       Accrual for Env

In [4]:
# Save the element table to CSV; index=False leaves the row index out of the file.
df.to_csv('us_gaap_elements.csv', index=False)

In [None]:
# Reload the saved element table from CSV.
metrics_df = pd.read_csv('us_gaap_elements.csv')
# Count the rows per element name. extract_us_gaap_elements adds one row per
# file and element, so each count is the number of files containing the element.
metric_counts = metrics_df['metrics'].value_counts()


print(metric_counts)

metrics
Assets                                                                                          488
LiabilitiesAndStockholdersEquity                                                                488
NetCashProvidedByUsedInFinancingActivities                                                      488
NetCashProvidedByUsedInInvestingActivities                                                      488
NetCashProvidedByUsedInOperatingActivities                                                      485
                                                                                               ... 
DepositLiabilitiesReclassifiedAsLoansReceivable                                                   1
SupplierFinanceProgramObligationPeriodIncreaseDecrease                                            1
SalesTypeLeaseNetInvestmentInLeaseExcludingAccruedInterestAfterAllowanceForCreditLossCurrent      1
DepreciationExcludingLessorAssetUnderOperatingLease                                         

Items to collect 
Income statement
    Ops
        AccumulatedOtherComprehensiveIncomeLossNetOfTax
    WeightedAverageNumberOfDilutedSharesOutstanding	
    EarningsPerShareDiluted	
Balance Sheet
    Assets
    LiabilitiesAndStockholdersEquity 
Cash flow statement
    NetCashProvidedByUsedInFinancingActivities       
    NetCashProvidedByUsedInInvestingActivities   
    NetCashProvidedByUsedInOperatingActivities




Code to extract diluted EPS and the related diluted share-count and net-income metrics from all JSON files in the current directory.

In [None]:
import os
import json
import pandas as pd


def extract_metrics(metrics_and_units):
    """
    Extracts specified metrics and their units from JSON files in the current directory.

    For every ``*.json`` file, the ``facts -> us-gaap -> <metric> -> units ->
    <unit>`` list is read for each requested metric, and one row is produced
    per entry in that list. The file name without its extension is stored as
    the ``ticker``.

    Files that cannot be decoded are reported with ``print`` and skipped.
    Files, metrics or unit lists that are missing, ``null`` or not shaped as
    expected contribute no rows.

    Args:
        metrics_and_units: A dictionary where keys are metric names 
                           and values are their corresponding units.

    Returns:
        A pandas DataFrame containing the extracted data: the columns
        ``ticker`` and ``metric`` followed by whatever keys the entries carry.
    """
    data = []
    current_directory = os.getcwd()

    # Sorted so the row order does not depend on the file system's listing order
    for filename in sorted(os.listdir(current_directory)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(current_directory, filename)
        if not os.path.isfile(file_path):
            # e.g. a directory whose name happens to end in ".json"
            continue
        ticker = os.path.splitext(filename)[0]

        # JSON text is UTF-8, so do not rely on the platform's default encoding
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error decoding JSON in file {filename}: {e}")
            continue

        # Valid JSON is not necessarily an object: guard every level before
        # calling .get() on it
        facts = json_data.get('facts') if isinstance(json_data, dict) else None
        us_gaap_elements = facts.get('us-gaap') if isinstance(facts, dict) else None
        if not isinstance(us_gaap_elements, dict):
            continue

        for metric, unit in metrics_and_units.items():
            metric_details = us_gaap_elements.get(metric)
            metric_data = (metric_details.get('units')
                           if isinstance(metric_details, dict) else None)
            if not isinstance(metric_data, dict):
                continue

            # A missing or null unit list simply yields no rows
            for entry in metric_data.get(unit) or []:
                if not isinstance(entry, dict):
                    continue
                entry_data = {'ticker': ticker, 'metric': metric}
                entry_data.update(entry)
                data.append(entry_data)

    return pd.DataFrame(data)


# Metrics to extract, each mapped to the unit key whose entries should be read
# (extract_metrics only collects the entries listed under that unit).
metrics_and_units = {
    "EarningsPerShareDiluted": "USD/shares",
    "IncomeLossFromContinuingOperationsPerDilutedShare": "USD/shares",
    "IncomeLossFromDiscontinuedOperationsNetOfTaxPerDilutedShare": "USD/shares",
    "WeightedAverageNumberOfDilutedSharesOutstanding": "shares",
    "NetIncomeLoss": "USD",
    "NetIncomeLossFromContinuingOperationsAvailableToCommonShareholdersDiluted": "USD",
    "NetIncomeLossFromDiscontinuedOperationsAvailableToCommonShareholdersDiluted": "USD"

}

# Extract data using the defined metrics.
# NOTE: this rebinds `df`, which an earlier cell used for the us-gaap element table.
df = extract_metrics(metrics_and_units)

metric ticker       start         end    form       filed  \
0           A  2006-11-01  2007-10-31    10-K  2009-12-21   
1           A  2006-11-01  2007-10-31     8-K  2010-09-07   
2           A  2007-11-01  2008-07-31  10-Q/A  2009-10-05   
3           A  2007-11-01  2008-10-31    10-K  2009-12-21   
4           A  2007-11-01  2008-10-31    10-K  2010-12-20   

metric  EarningsPerShareDiluted  \
0                          1.57   
1                          1.57   
2                          1.23   
3                          1.87   
4                          1.87   

metric  IncomeLossFromContinuingOperationsPerDilutedShare  \
0                                                     NaN   
1                                                     NaN   
2                                                     NaN   
3                                                     NaN   
4                                                     NaN   

metric  IncomeLossFromDiscontinuedOperationsNetOfTaxPer

In [27]:


# Reshape the long frame into one row per (ticker, start, end, form, filed)
# with one column per metric holding its 'val'. Rows are ordered by 'filed'
# descending before pivoting; aggfunc='first' takes one value per group.
pivot_df = (
    df.sort_values(by='filed', ascending=False)
    .pivot_table(
        index=['ticker', 'start', 'end', 'form', 'filed'],
        columns='metric',
        values='val',
        aggfunc='first',
    )
    .reset_index()
)
print(pivot_df.head())

metric ticker       start         end    form       filed  \
0           A  2006-11-01  2007-10-31    10-K  2009-12-21   
1           A  2006-11-01  2007-10-31     8-K  2010-09-07   
2           A  2007-11-01  2008-07-31  10-Q/A  2009-10-05   
3           A  2007-11-01  2008-10-31    10-K  2009-12-21   
4           A  2007-11-01  2008-10-31    10-K  2010-12-20   

metric  EarningsPerShareDiluted  \
0                          1.57   
1                          1.57   
2                          1.23   
3                          1.87   
4                          1.87   

metric  IncomeLossFromContinuingOperationsPerDilutedShare  \
0                                                     NaN   
1                                                     NaN   
2                                                     NaN   
3                                                     NaN   
4                                                     NaN   

metric  IncomeLossFromDiscontinuedOperationsNetOfTaxPer

In [20]:
# Reshape the long frame into one row per (ticker, start, end, form, filed)
# with one column per metric holding its 'val', then remove rows that are
# identical in every column.
pivot_df = (
    df.pivot_table(
        index=['ticker', 'start', 'end', 'form', 'filed'],
        columns='metric',
        values='val',
        aggfunc='first',
    )
    .reset_index()
    .drop_duplicates()
)

# to do list
# build logic from the parameters by ETL
# get a robust data set where there is only one entry per period

In [3]:
# Attach the per-metric columns of pivot_df to every long-format row of df.
# The join keys are the five columns that form pivot_df's index
# (ticker, start, end, form, filed). 'accn' is not a column of pivot_df, so
# listing it in `on` makes pd.merge raise KeyError.
merged_df = pd.merge(
    df,
    pivot_df,
    on=['ticker', 'start', 'end', 'form', 'filed'],
    how='outer'
)

In [4]:
# Drop unnecessary columns: metric, frame, val
cleaned_df = merged_df.drop(
    columns=['metric', 'frame', 'val'], errors='ignore')

# Deduplicate the rows
deduplicated_df = cleaned_df.drop_duplicates()

In [5]:
# Keep a single row per (ticker, end, form): rows are ordered by 'filed'
# descending and the first row of each group is kept.
# NOTE(review): 'start' is not in the subset; if rows share ticker/end/form
# but differ in 'start', only one of them survives — confirm this is intended.
deduplicated_df1 = (
    cleaned_df.sort_values(by=['filed'], ascending=False)
    .drop_duplicates(subset=['ticker', 'end', 'form'], keep='first')
)