### Imports

In [0]:
import requests
import pyspark.sql.functions as F
from pyspark.sql.types import *
import json
import time
import re
from pyspark.sql.window import Window

In [0]:
%run "/Workspace/Users/rishak1997@gmail.com/Sec_gov_revenue_KPI/helper_functions"

In [0]:
# Run the companion notebook and parse the JSON string it returns into the
# collection of custom tickers that the lookup step iterates over.
custom_tickers_json = dbutils.notebook.run(
    "/Workspace/Users/rishak1997@gmail.com/Sec_gov_revenue_KPI/custom_input_tickers",
    timeout_seconds=120,
)
custom_tickers = json.loads(custom_tickers_json)
custom_tickers

In [0]:
# HTTP request headers carrying the User-Agent contact string.
# NOTE(review): `header` is not referenced again in the visible cells; presumably the
# helpers loaded through %run read it when they make requests — confirm.
header = {"User-Agent": "rishak1997@gmail.com"}

### Output Variables

In [0]:
# Tickers for which no result could be produced: either revenue was not
# available, or an exception occurred while fetching the data (delisted etc).
rev_bif_not_found = []

# Every ticker the main loop has attempted.
processed_tickers = []

# Column names of the final result, in output order. All columns are strings;
# "Value" is the only one allowed to be null.
_final_result_columns = [
    "Ticker",
    "Ticker_Title",
    "Ticker_CIK",
    "Period",
    "Period_Startdate",
    "Period_Enddate",
    "Period_Reportdate",
    "KPI",
    "Value",
    "Source",
]
final_result_schema = StructType(
    [
        StructField(column_name, StringType(), column_name == "Value")
        for column_name in _final_result_columns
    ]
)

# Empty accumulator that the main loop extends with per-ticker rows.
final_result_df = spark.createDataFrame([], schema=final_result_schema)

### Look up the CIK and title for the sample and custom input tickers


In [0]:
# Ticker -> CIK/title lookup table supplied by the helper notebook.
cik_lookup_df = get_cik_lookup_df()

# Both windows are ordered by ticker so that the sampled rows are the same on
# every run. Ordering a window by its own partition column (as the earlier
# version did) leaves the row that gets row_number 1 arbitrary.
one_ticker_per_company = Window.partitionBy("cik_str", "title").orderBy(F.col("ticker").asc())
one_ticker_per_prefix = Window.partitionBy("ticker_substr").orderBy(F.col("ticker").asc())

# Sample set: drop tickers containing "-", keep one ticker per (cik, title),
# then keep one ticker per first letter of the ticker symbol.
sample_tickers_df = (
    cik_lookup_df
    .filter(~F.col("ticker").rlike("-"))
    .withColumn("rn_tickers_plus_title", F.row_number().over(one_ticker_per_company))
    .filter(F.col("rn_tickers_plus_title") == 1)
    .withColumn("ticker_substr", F.substring("ticker", 1, 1))
    .withColumn("rn_tickers_prefix", F.row_number().over(one_ticker_per_prefix))
    .filter(F.col("rn_tickers_prefix") <= 1)
    .selectExpr("cik_str", "ticker", "upper(title) as title")
)

# Resolve all custom tickers with a single filter instead of one union per ticker.
custom_tickers_df = (
    cik_lookup_df
    .filter(F.col("ticker").isin(list(custom_tickers)))
    .selectExpr("cik_str", "ticker", "upper(title) as title")
)

# A Spark filter is lazy and never raises for a ticker that is absent from the
# lookup table, so absence has to be detected explicitly by comparing against
# the tickers that were actually matched.
matched_custom_tickers = {
    row.ticker for row in custom_tickers_df.select("ticker").distinct().collect()
}
for ticker in custom_tickers:
    if ticker not in matched_custom_tickers:
        print(f"{ticker} not present in cik map")
        rev_bif_not_found.append(ticker + "-US")

# Final input list: sample tickers plus resolved custom tickers, de-duplicated.
all_tickers_df = sample_tickers_df.unionByName(custom_tickers_df).dropDuplicates()

In [0]:
# Preview the combined sample + custom ticker list before it is collected.
display(all_tickers_df)

In [0]:
# Bring the ticker rows to the driver so the main loop can iterate over them.
all_tickers_rows = all_tickers_df.collect()

In [0]:
%sql
-- Create the raw_data_lake volume if it does not exist yet; the write step saves
-- under /Volumes/revenue_benchmarking/sec_gov_api_data/raw_data_lake.
create volume if not exists revenue_benchmarking.sec_gov_api_data.raw_data_lake

### Main loop

In [0]:
for ticker_row in all_tickers_rows:
    # Unpack the lookup row for this company.
    cik_str, title, Ticker = ticker_row.cik_str, ticker_row.title, ticker_row.ticker
    print(f"Processing ticker {Ticker}, cik {cik_str}")
    processed_tickers.append(Ticker + "-US")

    # Fetch the company facts; on any failure record the ticker and move on.
    try:
        comp_facts_dict = get_company_facts(cik_str)
    except Exception as fetch_error:
        print(fetch_error)
        rev_bif_not_found.append(Ticker + "-US")
        continue

    # A falsy result means no statements could be built for this ticker.
    stmnts_combined_df = create_stmnts_combined_df(comp_facts_dict, history_years=3)
    if not stmnts_combined_df:
        print(f"us-gaap financials not found for {Ticker}")
        rev_bif_not_found.append(Ticker + "-US")
        continue

    # Transformation pipeline; every step is a helper from the %run notebook.
    stmnts_combined_df_cleansed = remove_inconsistencies(stmnts_combined_df)
    latest_q1_q3 = save_latest_q1_q3(stmnts_combined_df_cleansed)
    stmnts_combined_df_with_rel = assign_relation_between_q_and_fy(
        stmnts_combined_df_cleansed
    )
    q4_rows_df = get_q4_rows(stmnts_combined_df_with_rel)
    fy_all_q_union_df = get_fy_all_q_union_df(
        stmnts_combined_df_with_rel, latest_q1_q3, q4_rows_df
    )
    fy_all_q_union_pc_df = period_corrected(fy_all_q_union_df)

    # Shape the rows like the raw table and attach the company title and CIK.
    fy_all_q_union_raw_table_df = (
        get_raw_table_format(fy_all_q_union_pc_df, Ticker)
        .withColumn("Ticker_Title", F.lit(title))
        .withColumn("Ticker_CIK", F.lit(cik_str))
    )

    # Accumulate only what get_not_in_raw_data_lake returns (presumably the rows
    # not yet stored in the raw data lake — confirm against the helper).
    not_in_raw_table_df = get_not_in_raw_data_lake(fy_all_q_union_raw_table_df, Ticker)
    final_result_df = final_result_df.unionByName(not_in_raw_table_df)

### Check rows that need to be added to the raw table

In [0]:
# Review order: by ticker, then latest period end first, then largest value first
# ("Value" is a string with thousands separators, so commas are stripped before the cast).
result_sort_order = [
    F.col("Ticker"),
    F.to_date(F.col("Period_Enddate"), "M/d/y").desc(),
    F.regexp_replace(F.col("Value"), ",", "").cast(LongType()).desc(),
]
display(final_result_df.orderBy(*result_sort_order))

# Distinct tickers that have rows in the result, with " US Equity" rewritten as "-US".
distinct_ticker_rows = final_result_df.select("Ticker").distinct().collect()
add_to_raw = [
    ticker_row["Ticker"].replace(" US Equity", "-US")
    for ticker_row in distinct_ticker_rows
]


### Write to data lake

In [0]:
# Append the collected rows to the Delta data stored in the volume, partitioned by ticker.
# NOTE(review): the previous version of this cell carried the note
# "revenue_benchmarking.sec_gov_api_data.raw" — presumably a related table name; confirm.
raw_data_lake_path = "/Volumes/revenue_benchmarking/sec_gov_api_data/raw_data_lake"
raw_data_lake_writer = (
    final_result_df.write.format("delta").mode("append").partitionBy("Ticker")
)
raw_data_lake_writer.save(raw_data_lake_path)

In [0]:
# Show the Delta history of the raw data lake path.
spark.sql(" describe history delta.`/Volumes/revenue_benchmarking/sec_gov_api_data/raw_data_lake` ").display()

In [0]:
%sql
-- Rows returned by version 0 of the Delta data that version 1 does not return.
-- The version numbers are hard-coded; adjust them to compare other versions.

select * from  delta.`/Volumes/revenue_benchmarking/sec_gov_api_data/raw_data_lake` version as of 0
except  
select * from  delta.`/Volumes/revenue_benchmarking/sec_gov_api_data/raw_data_lake` version as of 1


### QA check

In [0]:
def _strip_us_suffix(ticker_with_suffix):
    """Return the ticker without a trailing "-US".

    Only the suffix appended by this notebook is removed; a "-US" sequence
    elsewhere in the symbol is left intact (str.replace would remove it too).
    """
    suffix = "-US"
    if ticker_with_suffix.endswith(suffix):
        return ticker_with_suffix[: -len(suffix)]
    return ticker_with_suffix


new_processed_tickers = [_strip_us_suffix(t) for t in set(processed_tickers)]
new_rev_bif_not_found = [_strip_us_suffix(t) for t in set(rev_bif_not_found)]
all_tickers_names = [t.ticker for t in all_tickers_rows]

expected_ticker_set = set(all_tickers_names)
processed_ticker_set = set(new_processed_tickers)

if expected_ticker_set == processed_ticker_set:
    print("No anomalies")
else:
    # Input tickers that the main loop never reached.
    print(
        f"Anomalies found for {expected_ticker_set - processed_ticker_set}"
    )
    # The sets can also differ the other way round; report that case as well
    # instead of printing only an empty set.
    unexpected_tickers = processed_ticker_set - expected_ticker_set
    if unexpected_tickers:
        print(f"Processed but not in the input ticker list: {unexpected_tickers}")

### Generate ticker string for next run

In [0]:
# Processed tickers that produced no new rows, one per line. Sorted so the
# printed list is identical between runs (set iteration order is not stable).
print("\n".join(sorted(set(processed_tickers) - set(add_to_raw))))

### Exit notebook

In [0]:
# Summary string handed back to whoever ran this notebook: tickers without
# financials, every ticker attempted, and tickers that have new rows.
exit_message = (
    f"\nFinancials for the following tickers were not found: {list(set(rev_bif_not_found))}"
    f"\nFollowing tickers were processed: {processed_tickers}"
    f"\nNew rows for these tickers can be added: {add_to_raw}"
)
dbutils.notebook.exit(exit_message)

In [0]:
# Scratch cell (sits after the notebook exit call): distinct tickers in the reference raw table.
display(table('bronze.internal_products.financials_cians_raw').select('ticker').distinct())

In [0]:
# Scratch cell: column names of the reference raw table.
table('bronze.internal_products.financials_cians_raw').columns

In [0]:
# Scratch cell: try months_between on a fixed date pair. Here the earlier date is the
# first argument, whereas the exploration cell further down passes (end, start).
spark.sql(" select round(months_between('2024-10-01', '2024-12-31'),2) ").display()

In [0]:
# final_result_df.write.mode('overwrite').saveAsTable('silver_dev.dse_investigations.sec_gov_api_rev_bench_daily_workflow_tickers')

In [0]:
# display(table('silver_dev.dse_investigations.sec_gov_api_rev_bench_daily_workflow_tickers'))

In [0]:
# Scratch cell: print the reference raw table's column names as a plain list.
print(table('bronze.internal_products.financials_cians_raw').columns)

In [0]:
from pandas import json_normalize

# Scratch exploration of one revenue concept. `comp_facts_dict` is whatever the
# main loop last assigned, i.e. the facts of the last ticker fetched successfully.
#
# NOTE(review): the lower bound on `start` used to read
# `date_sub(current_date(), )`, which has no day count and cannot be parsed, so
# the cell could not run. A three-year lookback is used here to mirror
# history_years=3 in the main loop — confirm the intended window.
lookback_years = 3

test_df = (
    spark.createDataFrame(
        json_normalize(
            comp_facts_dict['facts']['us-gaap']['RevenueFromContractWithCustomerIncludingAssessedTax']['units']
        )
    )
    # One row per reported fact, with the fact's fields as columns.
    .withColumn('USD', F.explode(F.col('USD')))
    .select("USD.*")
    .withColumn('start', F.col('start').cast(DateType()))
    .withColumn('end', F.col('end').cast(DateType()))
    .withColumn("time_period_months", F.round(F.months_between(F.col('end'), F.col('start')), 2))
    # Keep only periods that started within the lookback window.
    .filter(F.col('start') > F.add_months(F.current_date(), -12 * lookback_years))
    # Keep period lengths of 2.7-4 months or 11-13 months.
    .filter(""" time_period_months between 2.7 and 4 or time_period_months between 11 and 13  """)
)

display(test_df)

In [0]:
# Scratch cell: left-join the output of the get_fiscal_year_quarters helper back onto
# test_df, matching on start, end and period length.
display(test_df.join(get_fiscal_year_quarters(test_df), ['start', 'end', 'time_period_months'], 'left'))

In [0]:
# Scratch view of the "Revenues" facts held in comp_facts_dict, latest period end first.
revenues_units_pdf = json_normalize(comp_facts_dict['facts']['us-gaap']['Revenues']['units'])
revenues_usd_df = (
    spark.createDataFrame(revenues_units_pdf)
    .withColumn('USD', F.explode(F.col('USD')))
    .select("USD.*")
    .withColumn('end', F.col('end').cast(DateType()))
    .orderBy(F.col('end').desc())
)
display(revenues_usd_df)

In [0]:
# List every us-gaap key whose name contains "revenue", ignoring case.
revenue_concept_names = [
    concept_name
    for concept_name in comp_facts_dict['facts']['us-gaap'].keys()
    if "revenue" in concept_name.lower()
]
print(revenue_concept_names)

In [0]:
# Scratch cell: show the USD facts stored under `key_to_use`.
# NOTE(review): `key_to_use` is not assigned in any visible cell; it must come from the
# %run helper notebook or an earlier interactive session — confirm before running.
display(comp_facts_dict['facts']['us-gaap'][key_to_use]['units']['USD'])

In [0]:
# Scratch cell: pretty-print the USD facts stored under `key_to_use` as plain text.
# NOTE(review): `key_to_use` is not assigned in any visible cell — confirm its origin.
from pprint import pprint
pprint(comp_facts_dict['facts']['us-gaap'][key_to_use]['units']['USD'])