### Imports

In [0]:
import requests
import pyspark.sql.functions as F
from pyspark.sql.types import *
import json
from dateutil.relativedelta import relativedelta
import time

In [0]:
# HTTP headers passed as `headers=` on every request this notebook sends to sec.gov / data.sec.gov.
# NOTE(review): SEC presumably expects a contact address in the User-Agent — confirm against their access policy.
header = {'User-Agent' : "rkumar11@mscience.com"}

### Input tickers to be checked for their quarterly/yearly total revenues

In [0]:
# One entry per line, written as <TICKER>-<REGION>. get_input_tickers only looks up entries ending in "-US"
# and collapses repeated entries (PETS-US is listed twice below).
tickers_str_with_region = """
FL-US
LULU-US
LE-US
KIRK-US
CTRN-US
GES-US
GCO-US
PETS-US
DKS-US
RH-US
CBRL-US
CAL-US
VSCO-US
AEO-US
ZUMZ-US
BKE-US
DBI-US
SFIX-US
GME-US
PETS-US
PLAY-US
UNFI-US
ASO-US
"""

### Output Variables

In [0]:
# Tickers that could not be processed: no revenue facts available, or an exception occurred (delisted etc)
rev_bif_not_found = []

# Tickers the main loop finished with
processed_tickers = []

# Empty accumulator in the base-sheet layout; every processed ticker is unioned into it by the main loop
final_result_df = spark.createDataFrame(
    [],
    schema = StructType([
        StructField('Ticker', StringType(), False),
        StructField('Period', StringType(), False),
        StructField('Period_Startdate', StringType(), True),
        StructField('Period_Enddate', StringType(), True),
        StructField('Period_Reportdate', StringType(), True),
        StructField('_', StringType(), False),
        StructField('Segment_Identifier', StringType(), False),
        StructField('KPI', StringType(), False),
        StructField('Value', StringType(), True),
        StructField('Revison_Date', StringType(), False),
        StructField('Original_Value', StringType(), False),
        StructField('Source', StringType(), False),
    ]),
)

### Functions

In [0]:
def get_input_tickers(tickers_str_with_region):
  """Parse the newline-separated ticker list and return the US tickers to look up.

  Args:
    tickers_str_with_region: text with one "<TICKER>-<REGION>" entry per line. Blank lines
      and surrounding whitespace are ignored.

  Returns:
    List of unique ticker symbols (region suffix removed) for entries ending in "-US",
    in order of first appearance.

  Side effects:
    Entries that do not end in "-US" are added to the global `rev_bif_not_found` list,
    once each, no matter how many times this function is called.
  """
  # strip every line first so padded / indented entries and whitespace-only lines behave like clean input
  entries = [line.strip() for line in tickers_str_with_region.split("\n")]
  entries = [entry for entry in entries if entry]

  #segregate tickers by region
  us_entries = [entry for entry in entries if entry.endswith("-US")]
  international_entries = [entry for entry in entries if not entry.endswith("-US")]

  #drop only the trailing region suffix so hyphenated symbols keep their class part (BRK-B-US -> BRK-B);
  #dict.fromkeys removes duplicates while keeping the input order
  tickers_to_check = list(dict.fromkeys(entry.rsplit("-", 1)[0] for entry in us_entries))

  #add international tickers to flag; skip ones flagged by an earlier call so repeated calls do not pile up duplicates
  if international_entries:
    not_yet_flagged = [entry for entry in dict.fromkeys(international_entries) if entry not in rev_bif_not_found]
    rev_bif_not_found.extend(not_yet_flagged)
  return tickers_to_check

def get_cik_lookup_df():
  """Download SEC's ticker -> CIK mapping and return it as a Spark DataFrame.

  Returns:
    DataFrame with string columns `cik_str` (left-padded with zeros to 10 characters),
    `ticker` and `title`.

  Raises:
    requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
    requests.Timeout: if no response arrives within the timeout.
  """
  #Get CIK lookup table through company tickers endpoint
  cik_map = requests.get("https://www.sec.gov/files/company_tickers.json", headers = header, timeout = 30)
  #fail with the real HTTP status instead of a JSON decode error on an error page
  cik_map.raise_for_status()

  lookup_schema = StructType([StructField('cik_str', StringType(), True),
                              StructField('ticker', StringType(), True),
                              StructField('title', StringType(), True)])
  #the payload is a dict of records; only its values are needed
  cik_map_df = spark.createDataFrame(list(cik_map.json().values()), schema = lookup_schema)

  #padding to get 10 digit cik
  cik_map_df = cik_map_df.withColumn('cik_str', F.lpad(F.col('cik_str'), 10, '0'))
  return cik_map_df

def get_company_facts(cik):
    """Fetch the XBRL company-facts document for one company.

    Args:
        cik: CIK in the form the endpoint expects in its URL (the lookup table pads it to 10 characters).

    Returns:
        The parsed JSON document as a dict. An empty dict is returned when the endpoint
        answers 404, so that downstream key lookups fail with KeyError and the ticker is
        flagged as "not found" instead of aborting the whole run.

    Raises:
        requests.HTTPError: for any other 4xx/5xx answer.
        requests.Timeout: if no response arrives within the timeout.
    """
    #company facts endpoint
    URL = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    #pause before every call to keep the request rate low
    time.sleep(1)
    comp_facts = requests.get(URL, headers = header, timeout = 30)

    #NOTE(review): 404 is presumably what the API returns for a CIK without published facts — confirm
    if comp_facts.status_code == 404:
        print(f"No company facts published for cik {cik}")
        return {}
    #surface throttling / server errors with their real status instead of a JSON decode error
    comp_facts.raise_for_status()

    comp_facts_dict = json.loads(comp_facts.text)
    return comp_facts_dict

def choose_best_fin_statement(comp_facts_dict, Ticker):
    """Pick the us-gaap revenue tag to use for a company.

    Only the three tags in `rev_reporting_statements` are considered. When more than one
    is present, the tag whose USD facts reach the latest period end date wins; ties on the
    date are broken by the largest value reported for that date.

    Args:
        comp_facts_dict: parsed company-facts document (expects ['facts']['us-gaap']).
        Ticker: ticker symbol, used for messages and for flagging.

    Returns:
        The chosen tag name, or None when no candidate tag exists or an expected key is
        missing. In both None cases `Ticker + "-US"` is appended to the global
        `rev_bif_not_found`.
    """
    rev_reporting_statements = ["RevenueFromContractWithCustomerExcludingAssessedTax", "RevenueFromContractWithCustomerIncludingAssessedTax", "Revenues"]
    try:
      us_gaap_facts = comp_facts_dict['facts']['us-gaap']
      potential_keys = [each for each in us_gaap_facts.keys() if each in rev_reporting_statements]

      #case when the correct statements are not there
      if(len(potential_keys) == 0):
        print(f"No matching revenues statements found for {Ticker}")
        rev_bif_not_found.append(Ticker + "-US")
        return None
      if(len(potential_keys) == 1):
        return potential_keys[0]

      #choosing the best financials statement for tracking revenue out of the available
      df_key_max_date_and_rev = {}
      for k in potential_keys:
        usd_facts = us_gaap_facts[k]['units']['USD']
        #'end' is compared as text; for YYYY-MM-DD strings that ordering equals the date ordering
        max_date = max(fact['end'] for fact in usd_facts)
        #several facts can share the latest end date (e.g. quarter and full year); take the largest so the result is deterministic
        max_val = max(int(fact['val']) for fact in usd_facts if fact['end'] == max_date)
        df_key_max_date_and_rev[k] = (max_date, max_val)

      # Rank by max_date first, then by revenue(we know Total revenue will always be >= net sales/any other revenue KPI)
      return max(potential_keys, key=lambda k: df_key_max_date_and_rev[k])
    #a missing 'facts' / 'us-gaap' / 'units' / 'USD' key all end up here
    except KeyError:
      print(f"us-gaap financials not available for {Ticker}")
      rev_bif_not_found.append(Ticker + "-US")
      return None

def create_df_and_run_transformations(comp_facts_dict, key_to_use, history_days=732):
    """Turn the USD facts of one revenue tag into a typed, filtered Spark DataFrame.

    Args:
        comp_facts_dict: parsed company-facts document.
        key_to_use: us-gaap tag whose ['units']['USD'] facts are loaded.
        history_days: only periods whose start date lies within this many days before
            today are kept. Defaults to 732 (about two years); raise it for more history.

    Returns:
        DataFrame of the facts with `start`, `end`, `filed` cast to dates plus the derived
        columns `val_int`, `time_period_months` and `start_year`. Only periods lasting
        2.7-4 months or 11-13 months are kept, de-duplicated on (start, end, val_int).
    """
    #create df for the company facts
    flat_facts_df = spark.createDataFrame(comp_facts_dict['facts']['us-gaap'][key_to_use]['units']['USD'])

    # drop columns that are not required (names that are absent are ignored by drop)
    flat_facts_df = flat_facts_df.drop('description', 'label', 'accn', 'frame', 'fy')

    #transformations
    flat_facts_df = (flat_facts_df
                      .withColumn('start', F.col('start').cast(DateType()))
                      .withColumn('filed', F.col('filed').cast(DateType()))
                      .withColumn('end', F.col('end').cast(DateType()))
                      .withColumn('val_int', F.col('val').cast(LongType()))
                      .withColumn("time_period_months", F.round(F.months_between(F.col('end'), F.col('start')), 2))
                      .withColumn("start_year", F.year(F.col('start')).cast(StringType()))
    )

    flat_facts_df = (flat_facts_df
                      #keep only records that start within the requested history window
                      .filter(F.col('start') > F.date_sub(F.current_date(), history_days))
                      #filter out rows having running total revenue instead of quarter/yearly values
                      .filter("  (time_period_months between 2.7 and 4) or (time_period_months between 11 and 13) ")
    )

    # drop duplicate rows
    flat_facts_df = flat_facts_df.dropDuplicates(['start', 'end', 'val_int'])
    return flat_facts_df

def get_fiscal_year_quarters(flat_facts_df, Ticker, assign_leniency_to_quarter_end_in_days= 23, gen_prev_num_quarters= 16, gen_next_num_quarters= 9):
    """Label quarterly periods as 1Q / 2Q / 3Q relative to the latest fiscal-year end.

    Expected quarter end dates are generated by stepping in 3-month increments backwards
    and forwards from the latest full-year period end. A non-full-year period gets a label
    when its end date lies within the leniency window around one of those dates. Periods
    ending around a fiscal-year end (4Q) and full-year rows are not labelled here.

    Args:
        flat_facts_df: DataFrame with at least `start`, `end`, `time_period_months`.
        Ticker: ticker symbol, used for the message and bookkeeping when no FY row exists.
        assign_leniency_to_quarter_end_in_days: half-width of the matching window in days.
        gen_prev_num_quarters: steps 1 .. gen_prev_num_quarters-1 are generated before the FY end.
        gen_next_num_quarters: steps 1 .. gen_next_num_quarters-1 are generated after the FY end.

    Returns:
        DataFrame with columns start, end, fp_alloted, time_period_months (distinct rows),
        or None when the data has no 11-13 month period. In that case `Ticker + "-US"` is
        appended to the global `processed_tickers`.
    """
    #select relevant columns
    all_data_df = flat_facts_df.select('start', 'end', 'time_period_months')

    #get latest fy enddate
    fiscal_year_end = all_data_df.filter(""" time_period_months between 11 and 13 """).select(F.max(F.col('end'))).collect()[0][0]
    if(fiscal_year_end is None):
        print(f"No FY rows found for {Ticker} and hence quarter assignment not possible for this ticker")
        processed_tickers.append(Ticker + "-US")
        return None

    #step i before the FY end: 1 -> 3Q, 2 -> 2Q, 3 -> 1Q, 4 -> previous FY end (no label), then the cycle repeats
    labels_going_back = {1: '3Q', 2: '2Q', 3: '1Q'}
    #step i after the FY end: 1 -> 1Q, 2 -> 2Q, 3 -> 3Q, 4 -> next FY end (no label)
    labels_going_forward = {1: '1Q', 2: '2Q', 3: '3Q'}
    leniency = relativedelta(days=assign_leniency_to_quarter_end_in_days)

    #one (label, floor, ceil) window per generated quarter end
    quarter_windows = []
    for i in range(1, gen_prev_num_quarters, 1):
        label = labels_going_back.get(i % 4)
        if label is not None:
            #offset from the FY end directly; chaining 3-month steps lets month-end clamping accumulate (31 -> 30 -> 28 ...)
            quarter_end = fiscal_year_end - relativedelta(months=3 * i)
            quarter_windows.append((label, quarter_end - leniency, quarter_end + leniency))
    for i in range(1, gen_next_num_quarters, 1):
        label = labels_going_forward.get(i % 4)
        if label is not None:
            quarter_end = fiscal_year_end + relativedelta(months=3 * i)
            quarter_windows.append((label, quarter_end - leniency, quarter_end + leniency))

    #explicit schema so an empty window list still produces a valid (empty) DataFrame
    quarter_windows_df = spark.createDataFrame(quarter_windows, schema = 'fp_alloted string, q_floor date, q_ceil date')

    #consider only periods with 3 months for assigning fp i.e. q1, q2, q3(FY and q4 will be handled later)
    all_data_df_not_FY = all_data_df.filter("not(time_period_months between 11 and 13)")

    #assigning quarters(i.e. fiscal period (fp)) based on floor and ceil range;
    #a single range join against the small window table replaces three chained cross joins
    end_inside_window = (F.col('end') >= F.col('q_floor')) & (F.col('end') <= F.col('q_ceil'))
    quarters_q1_to_q3_alloted = (all_data_df_not_FY
                                  .join(F.broadcast(quarter_windows_df), end_inside_window, 'inner')
                                  .select('start', 'end', 'fp_alloted', 'time_period_months')
                                  .distinct()
    )

    #return the df with alloted quarters
    return quarters_q1_to_q3_alloted

def assign_quarters_to_fiscal_year(all_periods_alloted):
    """Attach to every period the fiscal year it falls into.

    A period belongs to a fiscal year when its start/end lie inside that year's start/end.
    The relation is stored in `fy_fp_relation` as "<Fy_start> <Fy_end>". 1Q-3Q rows that
    end after the latest fiscal-year end (their FY row is not available yet) are kept with
    an empty `fy_fp_relation`. Used later for the 4Q calculation.

    Args:
        all_periods_alloted: periods with `start`, `end`, `val_int`, `time_period_months`, `fp_alloted`.

    Returns:
        The input columns plus `fy_fp_relation`.
    """
    #every 11-13 month period in the data is treated as a fiscal year
    fiscal_years = (all_periods_alloted
                    .filter(F.col('time_period_months').between(11, 13))
                    .selectExpr('start as Fy_start', 'end as Fy_end'))

    #end date of the most recent fiscal year
    newest_fy_end = fiscal_years.select(F.max(F.col('Fy_end'))).collect()[0][0]

    #pair every period with every fiscal year and keep the pairs where the period lies inside the year
    lies_inside_fy = (F.col('start') >= F.col('Fy_start')) & (F.col('end') <= F.col('Fy_end'))
    relation_label = F.when(lies_inside_fy, F.concat_ws(' ', F.col('Fy_start'), F.col('Fy_end'))).otherwise('')
    matched_periods = all_periods_alloted.crossJoin(fiscal_years).withColumn('fy_fp_relation', relation_label)
    matched_periods = matched_periods.filter(F.col('fy_fp_relation') != '').dropDuplicates(['start', 'end' , 'val_int'])

    #periods without any enclosing fiscal year
    unmatched_periods = all_periods_alloted.join(matched_periods, all_periods_alloted.columns, 'left_anti')

    #of those keep only the recent 1Q-3Q rows, padded with empty fiscal-year columns so they can be unioned
    is_recent_quarter = F.col('fp_alloted').isin(['1Q', '2Q', '3Q']) & (F.col('end') > newest_fy_end)
    recent_quarters = (unmatched_periods
                       .filter(is_recent_quarter)
                       .withColumn('Fy_start', F.lit(''))
                       .withColumn('Fy_end', F.lit(''))
                       .withColumn('fy_fp_relation', F.lit('')))

    #union is positional, so line the columns up with the matched rows first
    combined = matched_periods.union(recent_quarters.select(*matched_periods.columns))

    return combined.drop('Fy_start', 'Fy_end')

def get_q4_rows(all_periods_alloted_with_fy_fp_relation):
    """Derive missing 4th-quarter rows as FY value minus the sum of 1Q-3Q.

    A 4Q row is produced for a fiscal year only when its group holds exactly four rows:
    one FY row and one row each for 1Q, 2Q and 3Q. Rows are recognised by their
    `fp_alloted` label, not by their value.

    Args:
        all_periods_alloted_with_fy_fp_relation: periods with `fy_fp_relation`,
            `fp_alloted`, `val_int`, `end`, `filed`.

    Returns:
        DataFrame with the same columns as the input, one row per derived 4Q:
        start = day after the 3Q end, end = FY end, filed = latest filing date in the group.
    """
    is_fy_row = F.col('fp_alloted') == 'FY'
    is_q1_q3_row = F.col('fp_alloted').isin(['1Q', '2Q', '3Q'])

    #one pass per fiscal year; F.when without otherwise yields null, which the aggregates ignore
    per_fiscal_year = (all_periods_alloted_with_fy_fp_relation
                        .groupBy('fy_fp_relation')
                        .agg(F.max(F.when(is_fy_row, F.col('val_int'))).alias('max_val_per_grp'),
                             F.max('end').alias('fy_end'),
                             F.max('filed').alias('fy_filed_date'),
                             F.count('*').alias('fy_period_cnt'),
                             F.countDistinct(F.when(is_q1_q3_row, F.col('fp_alloted'))).alias('quarter_count'),
                             F.sum(F.when(is_q1_q3_row, F.col('val_int'))).alias('sum_q1_q3_val'),
                             F.max(F.when(is_q1_q3_row, F.col('end'))).alias('q3_end_date'))
                        #4 rows + 3 distinct quarter labels + an FY value  =>  exactly FY, 1Q, 2Q, 3Q and no reported 4Q
                        .filter((F.col('fy_period_cnt') == 4) & (F.col('quarter_count') == 3) & F.col('max_val_per_grp').isNotNull())
    )

    # calc q4 values
    q4_df = (per_fiscal_year
              .withColumn('q4_val', F.col('max_val_per_grp') - F.col('sum_q1_q3_val'))
              .withColumn('q4_start_date', F.date_add(F.col('q3_end_date'), 1))
              #period label uses the 2-digit year taken from the start of fy_fp_relation
              .withColumn('Period', F.concat_ws('', F.lit('4Q'), F.regexp_extract(F.col('fy_fp_relation'), '^\\d{2}(\\d{2})[-]', 1)))
    )

    #match the schema with the original df for union after return
    q4_df = (q4_df
              .withColumn('start', F.col('q4_start_date'))
              .withColumn('end', F.col('fy_end'))
              .withColumn('time_period_months', F.lit(3))
              .withColumn('val', F.col('q4_val').cast(StringType()))
              .withColumn('form', F.lit(' '))
              .withColumn('filed', F.col('fy_filed_date'))
              .withColumn('val_int', F.col('q4_val').cast(LongType()))
              #only picked up by the final select when the input carries an end_year column
              .withColumn('end_year', F.lit(' '))
              .withColumn('fp_alloted', F.lit('4Q'))
              .withColumn('start_year', F.year(F.date_trunc('year', F.col('start'))).cast(StringType()))
    )

    return q4_df.select(*all_periods_alloted_with_fy_fp_relation.columns)

def get_base_sheet_format(output_df, Ticker):
    """Convert the per-ticker periods into the base-sheet column layout.

    Args:
        output_df: periods with `start`, `end`, `filed`, `val_int`, `form`, `Period`.
        Ticker: ticker symbol; written to the sheet as "<Ticker> US Equity".

    Returns:
        DataFrame with the 12 base-sheet columns. Only rows whose Period contains 'Q' or
        'F' and whose value is positive are kept. Dates are formatted as M/d/y and Value is
        the integer value with thousands separators.
    """
    base_sheet_columns = ['Ticker', 'Period', 'Period_Startdate', 'Period_Enddate', 'Period_Reportdate', '_', 'Segment_Identifier',  'KPI', 'Value' , 'Revison_Date', 'Original_Value', 'Source']

    output_df = (output_df
                  #filter while val_int is still part of the frame instead of relying on Spark resolving it after the select
                  .filter((F.col('Period').rlike('Q|F')) & (F.col('val_int') > 0 ))
                  .orderBy(F.col('end').desc(), F.col('val_int').desc())
                  .withColumn('Ticker', F.lit(f'{Ticker}' + " " + "US Equity"))
                  .withColumn('Period_Startdate', F.date_format(F.col('start'), 'M/d/y'))
                  .withColumn('Period_Enddate', F.date_format(F.col('end'), 'M/d/y'))
                  .withColumn('Period_Reportdate', F.date_format(F.col('filed'), 'M/d/y'))
                  #0 decimal places gives the same grouped output (1,234,567) as the "###,###" pattern
                  #while using the integer argument that the PySpark API documents
                  .withColumn('Value', F.format_number(F.col('val_int').cast(LongType()), 0))
                  .withColumn('KPI', F.lit('rev_Topline'))
                  .withColumn('_', F.lit(''))
                  .withColumn('Segment_Identifier', F.lit(''))
                  .withColumn('Revison_Date', F.lit(''))
                  .withColumn('Original_Value', F.lit(''))
                  .withColumn('Source', F.concat_ws(' ', 'form', 'Period'))
                  .select(base_sheet_columns)
    )
    return output_df
            

### Check the input tickers for their cik and title


In [0]:
# Download the full SEC lookup, then narrow it to the requested US tickers and show it for a visual check
cik_map_df = get_cik_lookup_df()
cik_map_df = cik_map_df.filter(F.col('ticker').isin(get_input_tickers(tickers_str_with_region)))
display(cik_map_df)

### Main loop

In [0]:
for Ticker in get_input_tickers(tickers_str_with_region):
    # 1) resolve the ticker to its CIK; tickers missing from the lookup are flagged and skipped
    try:
        cik_map_df_filt = cik_map_df.filter(F.col('ticker') == Ticker)
        cik = cik_map_df_filt.select('cik_str').collect()[0][0]
    except IndexError:
        print(f"{Ticker} not present in cik map. Check cmd 10 for more info")
        rev_bif_not_found.append(Ticker + "-US")
        continue
    print(f"Processing ticker {Ticker}, cik {cik}")

    # 2) download the facts and decide which revenue tag to use (None means the ticker was already flagged)
    comp_facts_dict = get_company_facts(cik)
    key_to_use = choose_best_fin_statement(comp_facts_dict, Ticker)
    if key_to_use is None:
        continue

    # 3) typed / filtered facts for the chosen tag
    flat_facts_df = create_df_and_run_transformations(comp_facts_dict, key_to_use)

    # 4) 1Q-3Q labels (None means no fiscal-year row exists for this ticker)
    q1_q3_fn_return = get_fiscal_year_quarters(flat_facts_df, Ticker)
    if q1_q3_fn_return is None:
        continue

    quarters_q1_to_q3_alloted = flat_facts_df.join(q1_q3_fn_return, ['start', 'end', 'time_period_months'], 'left')

    # 5) whatever is still unlabelled becomes 4Q (quarter-length periods) or FY (year-length periods)
    label_missing = F.col('fp_alloted').isNull()
    all_periods_alloted = (
        quarters_q1_to_q3_alloted
        .drop('fp')
        .withColumn('fp_alloted', F.when(label_missing & (F.col('time_period_months').between(2.7, 4)), F.lit('4Q')).otherwise(F.col('fp_alloted')))
        .withColumn('fp_alloted', F.when(label_missing & (F.col('time_period_months').between(11, 13)), F.lit('FY')).otherwise(F.col('fp_alloted')))
        .filter(F.col('fp_alloted').isNotNull())
    )

    # 6) link every period to its fiscal year
    all_periods_alloted_with_fy_fp_relation = assign_quarters_to_fiscal_year(all_periods_alloted)

    # 7) period names such as 1Q22 / FY21: year comes from the fiscal year when known, else from the period's start year
    period_from_start_year = F.concat_ws('', F.col('fp_alloted'), F.regexp_extract(F.col('start_year'), '^\\d{2}(\\d{2})$', 1))
    period_from_fiscal_year = F.concat_ws('', F.col('fp_alloted'), F.regexp_extract(F.col('fy_fp_relation'), '^\\d{2}(\\d{2})[-]', 1))
    all_periods_alloted_with_fy_fp_relation = all_periods_alloted_with_fy_fp_relation.withColumn(
        'Period',
        F.when(F.col('fy_fp_relation') == '', period_from_start_year).otherwise(period_from_fiscal_year),
    )

    # 8) amended values: for periods reported more than once keep only the latest filing --try to optimize
    amend_rev_rows = (
        all_periods_alloted_with_fy_fp_relation
        .groupBy(['start', 'end', 'time_period_months'])
        .agg(F.max('filed').alias('latest_rep_date'), F.count('*').alias('cnt_rows_with_amend'))
        .filter(F.col('cnt_rows_with_amend') > 1)
        .select(['*'])
        .collect()
    )
    for row in amend_rev_rows:
        Start, End, latest_rep_date = row['start'], row['end'], row['latest_rep_date']
        # rows for the same period that were filed before the latest filing
        superseded = (F.col('start') == Start) & (F.col('end') == End) & (F.col('filed') < latest_rep_date)
        all_periods_alloted_with_fy_fp_relation = all_periods_alloted_with_fy_fp_relation.filter(~superseded)

    # 9) add derived 4Q rows, convert to the base sheet layout and collect the result
    output_df = all_periods_alloted_with_fy_fp_relation.union(get_q4_rows(all_periods_alloted_with_fy_fp_relation))
    output_df = get_base_sheet_format(output_df, Ticker)
    final_result_df = final_result_df.unionByName(output_df)

    print(f"Finished processing {Ticker}")
    processed_tickers.append(Ticker + "-US")

### Check results

In [0]:
# Rows accumulated by the main loop for all processed tickers, in base-sheet layout
display(final_result_df)

### Check rows that need to be added to the base sheet

In [0]:
# Freshly computed rows, tagged with the first two characters of the period (1Q, 2Q, ..., FY)
_computed_rows = F.broadcast(final_result_df.withColumn('period_sub_str', F.col('period').substr(0, 2)).alias('a'))

# Topline-revenue rows already present in the base sheet, tagged the same way
_base_sheet_rows = (
    table('bronze.internal_products.financials_cians_raw')
    .filter(F.col('kpi') == "rev_Topline")
    .select('ticker', 'period', 'period_startdate', 'period_enddate', 'period_reportdate', 'kpi', 'value')
    .withColumn('period_sub_str', F.col('period').substr(0, 2))
    .alias('b')
)

# A computed row counts as present when ticker, period start, period end and period prefix all match
_already_present = (
    (F.col("a.Ticker") == F.col("b.ticker"))
    & (F.col("a.Period_Startdate") == F.col("b.period_startdate"))
    & (F.col("a.Period_Enddate") == F.col("b.period_enddate"))
    & (F.col("a.period_sub_str") == F.col("b.period_sub_str"))
)

# left_anti keeps only the computed rows without a match
not_in_base_sheet = _computed_rows.join(_base_sheet_rows, on= _already_present, how= 'left_anti').drop('period_sub_str')
display(not_in_base_sheet)

In [0]:
# Tickers with at least one row missing from the base sheet, converted back to the "<TICKER>-US" form
_tickers_with_new_rows = not_in_base_sheet.select('Ticker').distinct().collect()
add_to_base_sheet = [ticker_row['Ticker'].replace(" US Equity", "-US") for ticker_row in _tickers_with_new_rows]

In [0]:
# Sanity check: every requested US ticker must be either processed or flagged as not found
new_processed_tickers = [entry.replace("-US", "") for entry in processed_tickers]
new_rev_bif_not_found = [entry.replace("-US", "") for entry in rev_bif_not_found]

new_processed_tickers += new_rev_bif_not_found

set(new_processed_tickers) == set(get_input_tickers(tickers_str_with_region))

In [0]:
# Processed tickers for which the base sheet already holds every computed row
_tickers_without_new_rows = set(processed_tickers) - set(add_to_base_sheet)
print(('\n').join(_tickers_without_new_rows))

### Exit notebook

In [0]:
# Summary handed back to the caller of this notebook; cells below this point are not reached in a full run
_exit_summary = f"\nFinancials for the following tickers were not found: {rev_bif_not_found}\nFollowing tickers were processed: {processed_tickers}\nNew rows for these tickers can be added: {add_to_base_sheet}"
dbutils.notebook.exit(_exit_summary)

### **END**

In [0]:
# final_result_df.write.mode('overwrite').saveAsTable('silver_dev.dse_investigations.sec_gov_api_rev_bench_daily_workflow_tickers')

In [0]:
# display(table('silver_dev.dse_investigations.sec_gov_api_rev_bench_daily_workflow_tickers'))

In [0]:
# Debug: schema of the base-sheet table that the computed rows are compared against
print(table('bronze.internal_products.financials_cians_raw').schema)

In [0]:
# Debug: CIK left over from the last main-loop iteration that resolved one
cik

In [0]:
from pyspark.sql.window import Window

# Debug view for the last processed ticker: on each FY row show the sum of the other rows of the same
# fiscal year (q1_q3_rev) and what remains of the FY value after subtracting it (q4_rev)
_same_fiscal_year = Window.partitionBy(F.col('fy_fp_relation'))
_on_fy_row = F.col('fp_alloted') == "FY"

display(
    all_periods_alloted_with_fy_fp_relation
    .withColumn('fy_plus_q1_q3_rev', F.sum(F.col('val_int')).over(_same_fiscal_year))
    .withColumn('q1_q3_rev', F.when(_on_fy_row, F.col('fy_plus_q1_q3_rev') - F.col('val_int')).otherwise(None))
    .withColumn('q4_rev', F.when(_on_fy_row, F.col('val_int') - F.col('q1_q3_rev')).otherwise(None))
)

In [0]:
# NOTE(review): at notebook scope `potential_keys` is only assigned in a later cell, so on a
# top-to-bottom run this presumably raises NameError — confirm, then move or delete this cell.
print(potential_keys)

In [0]:
# NOTE(review): `df_key_max_date_and_cnt` is not assigned anywhere in the visible notebook, so this
# presumably raises NameError — confirm, then delete or rename.
print(df_key_max_date_and_cnt)

In [0]:
# Debug: revenue tag chosen for the last ticker handled by the main loop
print(key_to_use)

In [0]:
# Debug: raw units of one revenue tag for the last downloaded company; the follow-up transformations are commented out.
# NOTE(review): `json_normalize` is not imported anywhere in the visible notebook — presumably
# pandas.json_normalize was intended; confirm and import it, or delete this cell.
test_df = (spark.createDataFrame(json_normalize(comp_facts_dict['facts']['us-gaap']['RevenueFromContractWithCustomerExcludingAssessedTax']['units']))
        # .withColumn('USD', F.explode(F.col('USD')))
        # .select("USD.*")
        # .withColumn('start', F.col('start').cast(DateType()))
        # .withColumn('end', F.col('end').cast(DateType()))
        # .withColumn("time_period_months", F.round(F.months_between(F.col('end'), F.col('start')), 2))
        # .filter("start > '2018-01-01' ")
        # .filter(""" time_period_months between 2.7 and 4 or time_period_months between 11 and 13  """)
        # # .filter(F.col('time_period_months').between(11.50, 13))
        # .dropDuplicates(['start', 'end', 'val'])
        # .drop(*['description', 'label', 'accn', 'frame', 'fy'])
        # .orderBy(F.col('end').desc()))
)

display(test_df)

In [0]:
# Debug: quarter labels joined onto test_df. get_fiscal_year_quarters requires the ticker as its
# second argument, so the ticker of the last main-loop iteration is passed.
# NOTE(review): test_df presumably still needs the start / end / time_period_months columns from the
# commented-out transformations in the cell that builds it — confirm before relying on this cell.
display(test_df.join(get_fiscal_year_quarters(test_df, Ticker), ['start', 'end', 'time_period_months'], 'left'))

In [0]:
# Debug: USD facts of the `Revenues` tag for the last downloaded company, newest period end first.
# NOTE(review): `json_normalize` is not imported anywhere in the visible notebook — confirm and import it, or delete this cell.
display(spark.createDataFrame(json_normalize(comp_facts_dict['facts']['us-gaap']['Revenues']['units'])).withColumn('USD', F.explode(F.col('USD'))).select("USD.*").withColumn('end', F.col('end').cast(DateType())).orderBy(F.col('end').desc()))

In [0]:
# Debug: filtered facts of the last ticker that reached create_df_and_run_transformations
display(flat_facts_df)

In [0]:
# Debug: facts of the last processed ticker with their 1Q-3Q labels joined on
display(quarters_q1_to_q3_alloted)

In [0]:
# Debug: re-run the quarter labelling for the last ticker; the function requires the ticker as its second argument
get_fiscal_year_quarters(flat_facts_df, Ticker)

In [0]:
# NOTE(review): `q1_df` is not assigned at notebook scope in any visible cell, so this presumably
# raises NameError — confirm, then delete this cell.
display(q1_df)

In [0]:
# Debug: periods of the last processed ticker after the 4Q / FY labels were filled in
display(all_periods_alloted)

In [0]:
# Debug: every us-gaap tag of the last downloaded company whose name contains "revenue" (case-insensitive).
# A plain substring test matches exactly what re.search did for this literal pattern, without needing `re`.
print([each for each in comp_facts_dict['facts']['us-gaap'].keys() if "revenue" in each.lower()])

In [0]:
# Debug: raw USD facts of the chosen revenue tag for the last downloaded company
display(comp_facts_dict['facts']['us-gaap'][key_to_use]['units']['USD'])

In [0]:
# Debug alternative to choose_best_fin_statement: pick the revenue tag with the most 10-K / 10-Q facts.
# Overwrites the notebook-level `potential_keys` and `key_to_use`.
potential_keys = [
    each for each in comp_facts_dict['facts']['us-gaap'].keys()
    if each == "RevenueFromContractWithCustomerExcludingAssessedTax" or each == "Revenues"
]

print(potential_keys)

# Count the facts directly on the parsed JSON; this needs neither pandas nor a Spark job.
# The counts are built for any number of candidates so the selection below always has data to work on.
df_key_count = {
    k: sum(1 for fact in comp_facts_dict['facts']['us-gaap'][k]['units']['USD'] if fact.get('form') in ('10-K', '10-Q'))
    for k in potential_keys
}

# None when the company reports neither of the two tags
key_to_use = max(df_key_count, key=df_key_count.get) if df_key_count else None
print(key_to_use)
        

In [0]:
# Debug: revenue tag currently held in `key_to_use`
key_to_use

In [0]:
from pprint import pprint

# Debug: pretty-print the raw USD facts of the revenue tag currently held in `key_to_use`
_usd_facts = comp_facts_dict['facts']['us-gaap'][key_to_use]['units']['USD']
pprint(_usd_facts)