# SimFin Test All Datasets

This Notebook performs automated testing of all the bulk datasets from SimFin. The datasets are first downloaded from the SimFin server and then various tests are performed on the data. An exception is raised if any problems are found.

This Notebook can be run as usual if you have `simfin` installed, by running the following command from the directory where this Notebook is located:

    jupyter notebook

This Notebook can also be run using `pytest` which makes automated testing easier. You need to have the Python packages `simfin` and `nbval` installed. Then execute the following command from the directory where this Notebook is located:

    pytest --nbval-lax -v test_bulk_data.ipynb
    
This runs the entire Notebook and outputs error messages for all the cells that raised an exception.

## IMPORTANT!

- When you make changes to this Notebook, remember to clear all cells before pushing it back to GitHub, because that makes it easier to see the difference from the previous version. Select menu-item "Kernel / Restart & Clear Output".

- If you set `refresh_days=0` then it will force a new download of all the datasets.

In [None]:
# Set this to 0 to force a new download of all datasets.
refresh_days = 30

## Imports

In [None]:
import pandas as pd
import numpy as np
import warnings
import sys
import os
from IPython.display import display

In [None]:
import simfin as sf
from simfin.names import *
from simfin.datasets import *

## Are We Running Pytest?

In [None]:
# True when this Notebook is being executed by pytest, which
# sets the environment variable PYTEST_CURRENT_TEST.
# Some cells use this flag to skip printing examples of errors
# that take a long time to compute, because those examples
# are not needed when running pytest.
running_pytest = os.environ.get('PYTEST_CURRENT_TEST') is not None

## Configure SimFin

In [None]:
sf.set_data_dir('~/simfin_data/')

In [None]:
sf.load_api_key(path='~/simfin_api_key.txt', default_key='free')

## Load All Datasets

In [None]:
%%time
data = AllDatasets(refresh_days=refresh_days)

In [None]:
# Example for annual Income Statements.
data.get(dataset='income', variant='annual', market='us').head()

## Lists of Datasets

These are in addition to the lists of datasets defined in the `simfin.datasets` module.

In [None]:
# Datasets that have a column named TICKER.
# Some tests are probably only necessary for 'companies'
# but we might as well test all datasets that use tickers.
datasets_tickers = ['companies'] + datasets_fundamental() + datasets_shareprices()

## Function for Testing Datasets

In [None]:
def test_datasets(test_name, datasets=None, variants=None,
                  markets=None,
                  test_func=None,
                  test_func_rows=None,
                  test_func_groups=None,
                  group_index=SIMFIN_ID,
                  process_df_none=False, raise_exception=True):
    """
    Helper-function for running tests on many Pandas DataFrames.

    The DataFrames are obtained from the global `data` object.
    Exactly one of `test_func`, `test_func_rows` and
    `test_func_groups` should be given. If several are given then
    `test_func_rows` takes precedence over `test_func_groups`,
    which takes precedence over `test_func`.

    :param test_name:
        String with the name of the test.

    :param datasets:
        By default (datasets=None) all possible datasets
        will be tested. Otherwise datasets is a list of
        strings with dataset names to be tested.

    :param variants:
        By default (variants=None) all possible variants
        for each dataset will be tested, as defined in
        simfin.datasets.valid_variants. Otherwise variants
        is a list of strings and only those variants
        will be tested.

    :param markets:
        By default (markets=None) all possible markets
        for each dataset will be tested, as defined in
        simfin.datasets.valid_markets. Otherwise markets
        is a list of strings and only those markets
        will be tested.

    :param test_func:
        Function to be called on the Pandas DataFrame for
        each dataset. If there are problems with the DataFrame
        then return True, otherwise return False.

        This is generally used for testing problems with the
        entire DataFrame. For example, if the dataset is empty:

        test_func = lambda df: len(df) == 0

        If this returns True then there is a problem with df.
        If the function raises an exception, that is also
        considered to be a problem with df.

    :param test_func_rows:
        Similar to test_func but for testing individual rows
        of a DataFrame. It must return a Pandas Series of
        booleans. For example, test if SHARES_BASIC is
        missing, zero or negative:

        test_func_rows = lambda df: (df[SHARES_BASIC].isnull() |
                                     (df[SHARES_BASIC] <= 0))

    :param test_func_groups:
        Similar to test_func but for testing groups of rows
        in a DataFrame. For example, test on a per-stock basis
        whether SHARES_BASIC is greater than twice its mean:

        test_func_groups = lambda df: (df[SHARES_BASIC] >
                                       df[SHARES_BASIC].mean() * 2).any()

    :param group_index:
        String with the column-name used to create groups when
        using test_func_groups e.g. SIMFIN_ID for grouping by companies.

    :param process_df_none:
        Boolean whether to process (True) or skip (False)
        DataFrames that are None, because they could not be loaded.

    :param raise_exception:
        Boolean. If True then raise an exception if there were
        any problems, but wait until all datasets have been
        tested, so we can print the list of datasets with problems.
        If False then only show a warning if there were problems.

    :raises ValueError:
        If none of test_func, test_func_rows and test_func_groups
        were given.

    :raises Exception:
        If problems were found and raise_exception is True.

    :return:
        None
    """

    # Convert to test_func.
    if test_func_rows is not None:
        # Convert test_func_rows to test_func.
        test_func = lambda df: test_func_rows(df).any()
    elif test_func_groups is not None:
        # Convert test_func_groups to test_func.
        # NOTE: We must use .any(axis=None) because if the DataFrame
        # is empty then the groupby returns an empty DataFrame, and
        # .any() then returns an empty Series, but we need a boolean.
        # By using .any(axis=None) it is reduced to a boolean value.
        test_func = lambda df: df.groupby(group_index, group_keys=False).apply(test_func_groups).any(axis=None)

    # Without a test-function every dataset would be reported as
    # having a problem, which would hide the real mistake.
    if test_func is None:
        msg = 'One of test_func, test_func_rows or ' \
              'test_func_groups must be given.'
        raise ValueError(msg)

    # Number of problems found.
    num_problems = 0

    # For all datasets, variants and markets.
    for dataset, variant, market, df in data.iter(datasets=datasets,
                                                  variants=variants,
                                                  markets=markets):
        # Skip DataFrames that are None, because they could
        # not be loaded, unless we were asked to process them.
        if df is None and not process_df_none:
            continue

        # Exception raised by the test, if any.
        error = None

        try:
            # Perform the user-supplied test.
            problem_found = test_func(df)
        except Exception as e:
            # An exception occurred so we consider that to be a
            # problem. Only catch Exception so that e.g. a
            # KeyboardInterrupt can still stop the Notebook.
            problem_found = True
            error = e

        if problem_found:
            # Increase the number of problems found.
            num_problems += 1

            # Print the test's name. Only done once.
            if num_problems == 1:
                print(test_name, file=sys.stderr)

            # Print the dataset details.
            msg = "dataset='{}', variant='{}', market='{}'"
            msg = msg.format(dataset, variant, market)

            # Also show the exception so it is possible to tell
            # a failed test apart from a test that crashed.
            if error is not None:
                msg += " ({}: {})".format(type(error).__name__, error)

            print(msg, file=sys.stderr)

    # Raise exception or generate warning?
    if num_problems > 0:
        if raise_exception:
            raise Exception(test_name)
        else:
            warnings.warn(test_name)

## Function for Getting Rows with Problems

When a test has found problems in a dataset, it does not show which specific rows have the problem. You can get all the problematic rows using this function:

In [None]:
def get_problem_rows(df, test_func_rows):
    """
    Apply the given row-test to the DataFrame and return
    only the rows for which the test reports a problem.

    :param df:
        Pandas DataFrame.

    :param test_func_rows:
        Function that takes a Pandas DataFrame and returns
        a Pandas Series of booleans, which is True for each
        row of the DataFrame that has the problem.

        For example:

        test_func_rows = lambda df: (df[SHARES_BASIC].isnull() |
                                     (df[SHARES_BASIC] <= 0))

    :return:
        Pandas DataFrame with only the problematic rows.
    """

    # Boolean mask that is True for the rows with problems.
    problem_mask = test_func_rows(df)

    # Select only the rows with problems.
    return df[problem_mask]

## Function for Getting Rows with Missing Data

In [None]:
def get_missing_data_rows(df, column):
    """
    Select the rows of `df` where the given column has
    missing data i.e. where the value is NaN or None.

    :param df:
        Pandas DataFrame.

    :param column:
        Name of the column to check for missing data.

    :return:
        Pandas DataFrame with only the rows of `df`
        where the column-data is missing.
    """

    # Boolean mask that is True where the column-data is missing.
    missing_mask = df[column].isna()

    # Select only the rows with missing column-data.
    return df[missing_mask]

## Function for Getting Problematic Groups

In [None]:
def get_problem_groups(df, test_func_groups, group_index):
    """
    Group the given DataFrame by the given column, apply the
    test to each group, and return a DataFrame with all the
    rows of the groups for which the test reports a problem.

    This is used for testing a DataFrame on a per-group basis,
    e.g. per-stock or per-company, and getting back only the
    rows for the stocks or companies that had problems.

    :param df:
        Pandas DataFrame.

    :param test_func_groups:
        Function that takes a Pandas DataFrame with the rows
        for a single group, and returns a single boolean which
        is True if the group has a problem. For example, test on
        a per-stock basis whether SHARES_BASIC is ever greater
        than twice its mean:

        test_func_groups = lambda df: (df[SHARES_BASIC] >
                                       df[SHARES_BASIC].mean() * 2).any()

    :param group_index:
        String with the column-name used to create the groups
        e.g. SIMFIN_ID for grouping by companies.

    :return:
        Pandas DataFrame with only the problematic groups.
    """

    # Split the DataFrame into groups.
    grouped = df.groupby(group_index)

    # Keep only the groups where the test returns True.
    return grouped.filter(test_func_groups)

## Function for Testing Equality with Tolerance

This function is useful when comparing floating point numbers, or when comparing accounting numbers that are supposed to have a strict relationship (e.g. Assets = Liabilities + Equity) but we might tolerate a small degree of error in the data e.g. 1%.

In [None]:
def isclose(x, y, tolerance=0.01):
    """
    Test whether x and y are approximately equal, meaning that
    the absolute difference between x and y is at most the given
    tolerance times the absolute value of y. So tolerance=0.01
    means that x may differ at most 1% from y.

    This is similar to numpy.isclose() but is a more efficient
    implementation for Pandas which apparently does not have
    this built-in already (v. 0.25.1)

    :param x:
        Pandas DataFrame or Series.

    :param y:
        Pandas DataFrame or Series. The tolerance is
        measured relative to these values.

    :param tolerance:
        Max allowed difference as a ratio e.g. 0.01 = 1%.

    :return:
        Pandas DataFrame or Series with booleans whether
        x and y are approx. equal.
    """

    # Absolute difference between x and y.
    abs_diff = (x - y).abs()

    # Max absolute difference that we tolerate.
    max_diff = tolerance * y.abs()

    return abs_diff <= max_diff

# Tests

## Dataset could not be loaded

In [None]:
test_name = "Dataset could not be loaded"
# A DataFrame that is None is the problem we are looking for.
test_func = lambda df: df is None
# Use process_df_none=True because test_datasets() otherwise skips
# the DataFrames that are None, so test_func would never see them.
test_datasets(datasets=datasets_all(),
              test_name=test_name, test_func=test_func,
              process_df_none=True)

## Dataset is empty

In [None]:
test_name = "Dataset is empty"
test_func = lambda df: len(df) == 0

# Test for all markets. This only raises a warning,
# because some markets do have some of their datasets empty.
test_datasets(datasets=datasets_all(),
              test_name=test_name, test_func=test_func,
              raise_exception=False)

In [None]:
# Test only for the 'us' market. This raises an exception.
# It has happened once that all the datasets were empty,
# apparently because of a bug on the server, so it is
# important to raise an exception in case this happens again.
# NOTE: This re-uses test_name and test_func from the cell above.
test_datasets(datasets=datasets_all(), markets=['us'],
              test_name=test_name, test_func=test_func,
              raise_exception=True)

In [None]:
data.get(dataset='income-insurance', variant='quarterly', market='de')

## Shares Basic is None or <= 0

In [None]:
test_name = "SHARES_BASIC is None or <= 0"
# NOTE: We must use .isnull() to find the missing values, because
# `df[SHARES_BASIC] is None` compares the Pandas Series object itself
# to None, which is always False, so missing values were never found.
test_func_rows = lambda df: (df[SHARES_BASIC].isnull() |
                             (df[SHARES_BASIC] <= 0))
test_datasets(datasets=datasets_fundamental(),
              test_name=test_name, test_func_rows=test_func_rows)

In [None]:
# Show the problematic rows for a dataset.
df = data.get(dataset='income', variant='annual', market='us')
get_problem_rows(df=df, test_func_rows=test_func_rows)

## Shares Diluted is None or <= 0

In [None]:
test_name = "SHARES_DILUTED is None or <= 0"
# NOTE: We must use .isnull() to find the missing values, because
# `df[SHARES_DILUTED] is None` compares the Pandas Series object itself
# to None, which is always False, so missing values were never found.
test_func_rows = lambda df: (df[SHARES_DILUTED].isnull() |
                             (df[SHARES_DILUTED] <= 0))
test_datasets(datasets=datasets_fundamental(),
              test_name=test_name, test_func_rows=test_func_rows)

In [None]:
# Show the problematic rows for a dataset.
df = data.get(dataset='income', variant='annual', market='us')
get_problem_rows(df=df, test_func_rows=test_func_rows)

## Shares Basic or Diluted looks strange

In [None]:
# SimFin-Id's that are excluded from this test.
# Add a company to this list when its share-counts look
# strange, but manual inspection of the financial reports
# has shown that the share-counts are actually correct.
ignore_simfin_ids = [
    520475, 652016, 951586,
    698616, 543421, 82753,
]

In [None]:
def test_func_groups(df_grp):
    # Test whether the share-counts of a single company look strange.
    # `df_grp` is assumed to only contain data for a single company,
    # because this function should be called using:
    # df.groupby(SIMFIN_ID).apply(test_func_groups)
    # Returns True if the share-counts look strange, otherwise False.

    # Companies in the ignore-list are never reported.
    company_id = df_grp[SIMFIN_ID].iloc[0]
    if company_id in ignore_simfin_ids:
        return False

    # Max allowed absolute deviation from the mean, as a ratio.
    max_abs_ratio = 2

    def deviates(shares):
        # Absolute ratio between each value and the mean, minus one.
        abs_ratio = (shares / shares.mean() - 1).abs()

        # Is any of the values too far from the mean?
        return (abs_ratio > max_abs_ratio).any()

    # Is Shares Basic much different from its mean?
    basic_strange = deviates(df_grp[SHARES_BASIC])

    # Is Shares Diluted much different from its mean?
    diluted_strange = deviates(df_grp[SHARES_DILUTED])

    return (basic_strange | diluted_strange)

In [None]:
%%time
test_name = "Shares Basic or Shares Diluted looks strange"
test_datasets(datasets=datasets_fundamental(),
              test_name=test_name,
              test_func_groups=test_func_groups,
              group_index=SIMFIN_ID)

In [None]:
# Show the problematic groups for a dataset.
if not running_pytest:
    # Get the dataset.
    df = data.get(dataset='income', variant='annual', market='us')

    # Get the problematic groups.
    df_problems = get_problem_groups(df=df,
                                     test_func_groups=test_func_groups,
                                     group_index=SIMFIN_ID)

    # Print the problematic groups.
    for _, df2 in df_problems.groupby(SIMFIN_ID):
        display(df2[[SIMFIN_ID, REPORT_DATE, SHARES_BASIC, SHARES_DILUTED]])

## Revenue is negative

In [None]:
test_name = "REVENUE < 0"
test_func_rows = lambda df: (df[REVENUE] < 0)

# It is possible that Revenue is negative for banks and
# insurance companies, so we only test it for "normal" companies
# in the 'income' dataset.
test_datasets(datasets=['income'],
              test_name=test_name, test_func_rows=test_func_rows)

In [None]:
# Show the problematic rows for a dataset.
# NOTE(review): This inspects the 'income-insurance' dataset although
# the test above is only run for the 'income' dataset, because Revenue
# may be negative for insurance companies -- confirm this is intended.
df = data.get(dataset='income-insurance', variant='quarterly', market='us')
get_problem_rows(df=df, test_func_rows=test_func_rows)

## Assets != Liabilities + Equity (Exact Comparison)

This only generates a warning, because sometimes there are tiny rounding errors.

In [None]:
test_name = "Assets != Liabilities + Equity (Exact Comparison)"
# This is an exact comparison so even tiny rounding errors are found.
# NOTE: NaN is never equal to anything, so rows with missing data
# in any of these three columns are also considered to be problems.
test_func_rows = lambda df: (df[TOTAL_ASSETS] != df[TOTAL_LIABILITIES] + df[TOTAL_EQUITY])
# Use raise_exception=False so problems only generate a warning.
test_datasets(datasets=datasets_balance(),
              test_name=test_name, test_func_rows=test_func_rows,
              raise_exception=False)

In [None]:
# Get the problematic rows for a dataset.
df = data.get(dataset='balance', variant='quarterly', market='us')
df2 = get_problem_rows(df=df, test_func_rows=test_func_rows)

# Only show the relevant columns.
df2[[TICKER, SIMFIN_ID, REPORT_DATE, TOTAL_ASSETS, TOTAL_LIABILITIES, TOTAL_EQUITY]]

## Assets != Liabilities + Equity (1% Tolerance)

The above test used exact comparison. We now allow for 1% error. This raises an exception.

In [None]:
def test_func_rows(df):
    # Return a boolean Series which is True for the rows where
    # Total Assets differs more than 1% from the sum of
    # Total Liabilities and Total Equity.
    assets = df[TOTAL_ASSETS]
    liabilities_and_equity = df[TOTAL_LIABILITIES] + df[TOTAL_EQUITY]

    # Which rows have the two sides equal within 1% tolerance?
    within_tolerance = isclose(x=assets, y=liabilities_and_equity,
                               tolerance=0.01)

    # The problematic rows are the ones that are NOT
    # within the tolerance, hence the negation.
    return ~within_tolerance

In [None]:
test_name = "Assets != Liabilities + Equity (1% Tolerance)"
test_datasets(datasets=datasets_balance(),
              test_name=test_name, test_func_rows=test_func_rows)

In [None]:
# Get the problematic rows for a dataset.
df = data.get(dataset='balance', variant='annual', market='us')
df2 = get_problem_rows(df=df, test_func_rows=test_func_rows)

# Only show the relevant columns.
df2[[TICKER, SIMFIN_ID, REPORT_DATE, TOTAL_ASSETS, TOTAL_LIABILITIES, TOTAL_EQUITY]]

## Dates are invalid (Fundamentals)

In [None]:
# Function for converting strings to dates. Format: YYYY-MM-DD
# This will raise an exception if invalid dates are encountered.
def date_parser(column):
    return pd.to_datetime(column, yearfirst=True, dayfirst=False)

In [None]:
# Test function for the entire DataFrame.
# This cannot show which individual rows have problems.
def test_func(df):
    # Try and parse both of the date-columns. The parsed
    # dates are not used, we only want to know whether
    # date_parser() raises an exception for invalid dates.
    for column in (REPORT_DATE, PUBLISH_DATE):
        date_parser(df[column])

    # No exception was raised if we get to this point,
    # so we assume the dates did not have any problems.
    return False

In [None]:
test_name = "REPORT_DATE or PUBLISH_DATE is invalid"
test_datasets(datasets=datasets_fundamental(),
              test_name=test_name, test_func=test_func)

## Dates are invalid (Share-Prices)

In [None]:
# Test function for the entire DataFrame.
# This cannot show which individual rows have problems.
def test_func(df):
    # Try and parse the date-column. The parsed dates are
    # not used, we only want to know whether date_parser()
    # raises an exception for invalid dates.
    date_parser(df[DATE])

    # No exception was raised if we get to this point,
    # so we assume the dates did not have any problems.
    return False

In [None]:
test_name = "DATE is invalid"
test_datasets(datasets=datasets_shareprices(),
              test_name=test_name, test_func=test_func)

## Duplicate Tickers

In [None]:
def get_duplicate_tickers(df):
    """
    Find the rows of `df` where a TICKER is used by
    more than one SIMFIN_ID.

    Note that the first row found for each TICKER is not
    considered a duplicate, so only the second and following
    SIMFIN_ID for each duplicate TICKER are returned.

    :param df: Pandas DataFrame with TICKER and SIMFIN_ID columns.
    :return: Pandas DataFrame.
    """

    # Only keep the first row for each [TICKER, SIMFIN_ID] pair.
    # The 'companies' dataset only has one row for each pair,
    # but e.g. the 'income' dataset has a row for each financial
    # report, so the same pair is repeated in many rows.
    pair_repeated = df[[TICKER, SIMFIN_ID]].duplicated()
    unique_pairs = df[~pair_repeated]

    # Each [TICKER, SIMFIN_ID] pair is now unique, so if a TICKER
    # occurs more than once it belongs to several SIMFIN_ID.
    tickers = unique_pairs[TICKER]

    # Rows where TICKER is a duplicate. Missing tickers (NaN)
    # are excluded as they would otherwise show up as duplicates.
    dup_mask = tickers.notna() & tickers.duplicated()

    return unique_pairs[dup_mask]

In [None]:
# Test-function whether a DataFrame has duplicate tickers.
test_func = lambda df: (len(get_duplicate_tickers(df=df)) > 0)

In [None]:
test_name = "Duplicate Tickers"
test_datasets(datasets=datasets_tickers,
              test_name=test_name, test_func=test_func)

In [None]:
# Show duplicate tickers in the 'companies' dataset.
df = data.get(dataset='companies', market='us')
get_duplicate_tickers(df=df)

In [None]:
# Show duplicate tickers in the 'income-annual' dataset.
df = data.get(dataset='income', variant='annual', market='us')
get_duplicate_tickers(df=df)

## Missing Tickers

In [None]:
# Test-function whether a DataFrame has missing tickers.
test_func = lambda df: (len(get_missing_data_rows(df=df, column=TICKER)) > 0)

In [None]:
test_name = "Missing Tickers"
test_datasets(datasets=datasets_tickers,
              test_name=test_name, test_func=test_func)

In [None]:
# Show missing tickers in the 'companies' dataset.
df = data.get(dataset='companies', market='us')
get_missing_data_rows(df=df, column=TICKER)

In [None]:
# Show missing tickers in the 'income-annual' dataset.
df = data.get(dataset='income', variant='annual', market='us')
get_missing_data_rows(df=df, column=TICKER)

In [None]:
# Show missing tickers in the 'shareprices-daily' dataset.
df = data.get(dataset='shareprices', variant='daily', market='us')
get_missing_data_rows(df=df, column=TICKER)

## Missing Company Names

In [None]:
# Test-function whether a DataFrame has missing company names.
test_func = lambda df: (len(get_missing_data_rows(df=df, column=COMPANY_NAME)) > 0)

In [None]:
test_name = "Missing Company Name"
test_datasets(datasets=['companies'],
              test_name=test_name, test_func=test_func)

In [None]:
# Show missing company names in the 'companies' dataset.
df = data.get(dataset='companies', market='us')
get_missing_data_rows(df=df, column=COMPANY_NAME)

## Missing Annual Reports

In [None]:
def missing_annual_reports(df):
    """
    Find the SIMFIN_ID's in the given DataFrame that
    have missing annual reports.

    For each SIMFIN_ID the number of rows is compared to the
    number of years from the first to the last Fiscal Year.
    If these are different then the SIMFIN_ID is reported.

    :param df:
        Pandas DataFrame with a dataset e.g. 'income-annual'.
        It must have columns SIMFIN_ID and FISCAL_YEAR.

    :return:
        List of integers with SIMFIN_ID's that have missing reports.
    """

    # Helper-function for testing the rows of a single SIMFIN_ID.
    def _has_gaps(df_company):
        years = df_company[FISCAL_YEAR]

        # Number of reports we expect if there is one report
        # for each year between the first and last Fiscal Year.
        expected_count = years.max() - years.min() + 1

        # If the actual number of reports is different, then some
        # reports must be missing between the first and last years.
        return expected_count != len(years)

    # Boolean for each SIMFIN_ID whether it has missing reports.
    flags = df.groupby(SIMFIN_ID).apply(_has_gaps)

    # Only keep the SIMFIN_ID's where the boolean is True.
    return list(flags[flags].index.values)

In [None]:
test_name = "Missing annual reports"
test_func = lambda df: len(missing_annual_reports(df=df)) > 0
test_datasets(datasets=datasets_fundamental(),
              variants=['annual'],
              test_name=test_name, test_func=test_func)

In [None]:
# Get list of SIMFIN_ID's that have missing reports for a dataset.
if not running_pytest:
    df = data.get(dataset='income', variant='annual', market='de')
    display(missing_annual_reports(df=df))

In [None]:
def sort_annual_reports(df, simfin_id):
    """
    Return the rows of `df` for the given SIMFIN_ID, indexed
    and sorted by Fiscal Year, which makes it easier to see
    which annual reports are missing.
    """
    # Index by company and Fiscal Year, and sort by that index.
    indexed = df.set_index([SIMFIN_ID, FISCAL_YEAR])
    ordered = indexed.sort_index()

    # Only the rows for the given company.
    return ordered.loc[simfin_id]

In [None]:
# Show all the reports for a given SIMFIN_ID sorted by
# Fiscal Year so it is easier to see which are missing.
if not running_pytest:
    display(sort_annual_reports(df=df, simfin_id=936426))

## Missing Quarterly Reports

In [None]:
def missing_quarterly_reports(df):
    """
    Find the SIMFIN_ID's in the given DataFrame that
    have missing quarterly or ttm reports.

    For each SIMFIN_ID the number of rows is compared to the
    number of quarters from the first to the last Fiscal Year
    and Period. If these are different then the SIMFIN_ID is
    reported.

    :param df:
        Pandas DataFrame with a dataset e.g. 'income-quarterly'.
        It must have columns SIMFIN_ID, FISCAL_YEAR, FISCAL_PERIOD.

    :return:
        List of integers with SIMFIN_ID's that have missing reports.
    """

    # Map from Fiscal Period strings to ints. An explicit map is
    # safer and easier to understand than parsing the strings.
    quarter_number = {'Q1': 1, 'Q2': 2, 'Q3': 3, 'Q4': 4}

    # Helper-function for testing the rows of a single SIMFIN_ID.
    def _has_gaps(df_company):
        years = df_company[FISCAL_YEAR]
        periods = df_company[FISCAL_PERIOD]

        # First and last Fiscal Years.
        first_year = years.min()
        last_year = years.max()

        # First Fiscal Period of the first year,
        # and last Fiscal Period of the last year.
        first_period = periods[years == first_year].min()
        last_period = periods[years == last_year].max()

        # Number of Fiscal Periods if all 4 quarters were present
        # in all the years, minus the quarters before the first
        # period of the first year, and minus the quarters after
        # the last period of the last year.
        expected_periods = (last_year - first_year + 1) * 4 \
                           - (quarter_number[first_period] - 1) \
                           - (4 - quarter_number[last_period])

        # If the actual number of reports is different, then
        # some reports must be missing between first and last.
        return expected_periods != len(df_company)

    # Boolean for each SIMFIN_ID whether it has missing reports.
    flags = df.groupby(SIMFIN_ID).apply(_has_gaps)

    # Only keep the SIMFIN_ID's where the boolean is True.
    return list(flags[flags].index.values)

In [None]:
%%time
test_name = "Missing quarterly reports"
test_func = lambda df: len(missing_quarterly_reports(df=df)) > 0
test_datasets(datasets=datasets_fundamental(),
              variants=['quarterly'],
              test_name=test_name, test_func=test_func)

In [None]:
# Get list of SIMFIN_ID's that have missing reports for a dataset.
if not running_pytest:
    df = data.get(dataset='income', variant='quarterly', market='us')
    display(missing_quarterly_reports(df=df))

In [None]:
def sort_quarterly_reports(df, simfin_id):
    """
    Return the rows of `df` for the given SIMFIN_ID, indexed
    and sorted by Fiscal Year and Period, which makes it easier
    to see which quarterly reports are missing.
    """
    # Index by company, Fiscal Year and Period, and sort by that index.
    indexed = df.set_index([SIMFIN_ID, FISCAL_YEAR, FISCAL_PERIOD])
    ordered = indexed.sort_index()

    # Only the rows for the given company.
    return ordered.loc[simfin_id]

In [None]:
# Show all the reports for a given SIMFIN_ID sorted by
# Fiscal Year and Period so it is easier to see which are missing.
if not running_pytest:
    display(sort_quarterly_reports(df=df, simfin_id=139560))

## Missing TTM Reports

Trailing-Twelve-Months (TTM) data is also quarterly so we can use the same helper-functions from above.

In [None]:
test_name = "Missing ttm reports"
test_func = lambda df: len(missing_quarterly_reports(df=df)) > 0
test_datasets(datasets=datasets_fundamental(),
              variants=['ttm'],
              test_name=test_name, test_func=test_func)

In [None]:
# Get list of SIMFIN_ID's that have missing reports for a dataset.
if not running_pytest:
    df = data.get(dataset='income', variant='ttm', market='us')
    display(missing_quarterly_reports(df=df))

In [None]:
# Show all the reports for a given SIMFIN_ID sorted by
# Fiscal Year and Period so it is easier to see which are missing.
if not running_pytest:
    display(sort_quarterly_reports(df=df, simfin_id=89750))