## AISQL Functions For Equity Research

Snowflake's AI-powered functions enable column-level operations with LLMs, much like traditional database operators. This is handy for processing unstructured data for downstream analytics.


1. Parse documents from PDF into text using [PARSE_DOCUMENT](https://docs.snowflake.com/en/sql-reference/functions/parse_document-snowflake-cortex)
2. Extract entities using Snowflake [structured output](https://docs.snowflake.com/en/user-guide/snowflake-cortex/complete-structured-outputs)
3. Map entities to S&P 500 tickers using a Top-K join and an AI join ([AI_FILTER](https://docs.snowflake.com/LIMITEDACCESS/snowflake-cortex/ai_filter-snowflake-cortex))
4. Summarize research insights across multiple articles for a given ticker (using [AI_AGG](https://docs.snowflake.com/LIMITEDACCESS/snowflake-cortex/ai_agg))


NOTE: Please complete the prerequisites first. If you are using names other than the suggested AI_SQL_TEAM_DB.SE_SAMPLE_DATA.EQUITY_RESEARCH, take note of your DB.Schema.Stage names and substitute them in the cells below.

**Step 1**. Parse research doc into text using [PARSE_DOCUMENT](https://docs.snowflake.com/en/sql-reference/functions/parse_document-snowflake-cortex) function from the stage containing the pdfs.

In [None]:
-- Optional: list the staged files to validate that the research PDFs are present.
-- This points at the same stage the PARSE_DOCUMENT cell reads from, so the check
-- validates the files that will actually be parsed. Adjust if you used other names.
LIST @AI_SQL_TEAM_DB.SE_SAMPLE_DATA.EQUITY_RESEARCH;

In [None]:
-- Expect roughly a minute on an XS warehouse: each file triggers a call to the
-- Cortex AI inference service.
-- Note: validate the prerequisites (such as the stage's client-side encryption setting) before running.
--
-- Produces one row per file in the stage's directory table:
--   relative_path   : path of the file within the stage
--   scoped_file_url : URL returned by GET_PRESIGNED_URL for that file
--   raw_text_dict   : full PARSE_DOCUMENT result (layout mode), stored as VARIANT
--   raw_text        : the "content" field of raw_text_dict
CREATE OR REPLACE TABLE raw_docs_text AS
SELECT
    relative_path,
    GET_PRESIGNED_URL(
        @AI_SQL_TEAM_DB.SE_SAMPLE_DATA.equity_research,
        relative_path
    ) AS scoped_file_url,
    TO_VARIANT(
        SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
            @AI_SQL_TEAM_DB.SE_SAMPLE_DATA.equity_research,
            relative_path,
            {'mode': 'layout'}
        )
    ) AS raw_text_dict,
    raw_text_dict:content AS raw_text
FROM DIRECTORY(@AI_SQL_TEAM_DB.SE_SAMPLE_DATA.equity_research);


In [None]:
-- Preview the parsed text next to the file it came from.
SELECT
    relative_path,
    raw_text
FROM raw_docs_text;

**Step 2**. Extract companies and their sentiment from each document using [STRUCTURED OUTPUT](https://docs.snowflake.com/en/user-guide/snowflake-cortex/complete-structured-outputs)

In [None]:
-- Builds ENTITY_EXTRACTION_EXAMPLE: every column of raw_docs_text plus two LLM outputs.
--   extraction : structured output requested with a JSON schema of the form
--                {"company_sentiment": [{"company": <string>, "sentiment": <string>}, ...]}
--   summary    : free-text summary of the document
-- SELECT * is kept on purpose so that downstream cells continue to see all of the
-- raw_docs_text columns (relative_path, raw_text, ...).
CREATE OR REPLACE TABLE ENTITY_EXTRACTION_EXAMPLE AS
SELECT
    *,
    AI_COMPLETE(
        model => 'claude-3-7-sonnet',
        prompt => 'You are tasked with extracting companies from a research article. Extract "company" for each company that is identified and the "sentiment" which includes a sentiment of how the company was referenced.:\n\n' || raw_text::text,
        response_format => {
            'type': 'json',
            'schema': {
                'type': 'object',
                'properties': {
                    'company_sentiment': {
                        'type': 'array',
                        'items': {
                            'type': 'object',
                            'properties': {
                                'company': {'type': 'string'},
                                'sentiment': {'type': 'string'}
                            }
                        }
                    }
                }
            }
        }
    ) AS extraction,
    -- Prompt fixed from "summarize below test:" to "text:" so the model is asked to
    -- summarize the document text; raw_text is cast explicitly, matching the call above.
    AI_COMPLETE('llama3.1-70b', 'summarize below text: ' || raw_text::text) AS summary
FROM raw_docs_text;

In [None]:
-- Inspect the extraction and summary produced for each document.
SELECT *
FROM ENTITY_EXTRACTION_EXAMPLE;

Note: ensure you have completed the Snowflake Marketplace dataset prerequisite before running the next cell.

In [None]:
-- Lookup table of unique (company_name, ticker) pairs from the Marketplace S&P 500 sample.
-- DISTINCT applies to the whole select list, so it already de-duplicates the pairs;
-- the former positional GROUP BY 1,2 was redundant and has been removed.
CREATE OR REPLACE TABLE AI_SQL_TEAM_DB.SE_SAMPLE_DATA.TICKERS_LIST AS
SELECT DISTINCT
    company_name,
    ticker
FROM S__P_500_BY_DOMAIN_AND_AGGREGATED_BY_TICKERS_SAMPLE.DATAFEEDS.SP_500;

**Step 3**. Map each extracted company entity to its S&P 500 ticker using a Top-K join and [AI_FILTER](https://docs.snowflake.com/LIMITEDACCESS/snowflake-cortex/ai_filter-snowflake-cortex)

In [None]:
-- One row per entry of the extraction's company_sentiment array: the document
-- columns are repeated for each entry, with company and sentiment pulled out
-- as strings.
CREATE OR REPLACE VIEW flattened_extraction AS
SELECT
    relative_path               AS file_name,
    raw_text,
    summary,
    cs.value:company::STRING    AS company,
    cs.value:sentiment::STRING  AS sentiment,
    extraction
FROM
    entity_extraction_example,
    LATERAL FLATTEN(INPUT => extraction:company_sentiment) AS cs;

In [None]:
-- Show company and sentiment up front, followed by every column of the view.
SELECT
    company,
    sentiment,
    *
FROM flattened_extraction;

In [None]:
-- Use a top-k join to narrow down the company mapping first, then compare it to the AI_FILTER results.
-- Note: calculating a similarity score is only one of several possible approaches to this problem.
--
-- Every extracted company is scored against every row of the ticker lookup (CROSS JOIN);
-- only the 2 highest-scoring candidates per (company, file_name) are kept.
CREATE OR REPLACE TABLE top_candidates AS
SELECT
    c.*,
    d.*,
    AI_SIMILARITY(c.company, d.company_name, {'model':'snowflake-arctic-embed-m-v1.5'}) AS sim_score
FROM flattened_extraction AS c
-- Fully qualified because the lookup table is created as
-- AI_SQL_TEAM_DB.SE_SAMPLE_DATA.TICKERS_LIST; this resolves regardless of the session's current schema.
CROSS JOIN AI_SQL_TEAM_DB.SE_SAMPLE_DATA.TICKERS_LIST AS d
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY c.company, c.file_name
    -- d.ticker breaks ties so the kept candidates are deterministic when scores are equal.
    ORDER BY sim_score DESC, d.ticker
) <= 2;

In [None]:
-- Review the top-k matches: this approach yields a lot of false positives.
-- Is there a better way to approach this problem?
SELECT
    company      AS extracted,
    company_name AS mapped_company,
    ticker       AS mapped_ticker
FROM top_candidates;

In [None]:
-- Entity disambiguation: AI_FILTER keeps only the top-k candidates for which the
-- extracted company and the S&P 500 company are judged to be the same company.
CREATE OR REPLACE TABLE matched_candidates AS
SELECT
    file_name,
    raw_text,
    summary,
    company      AS extracted,
    company_name AS mapped_company,
    ticker       AS mapped_ticker
FROM top_candidates
WHERE AI_FILTER('Does this extracted company:' || company || ' refers to the same company as this S&P 500 company: ' || company_name || '?')
ORDER BY file_name;

In [None]:
-- Review the mappings that survived the AI_FILTER check.
SELECT
    extracted,
    mapped_company,
    mapped_ticker
FROM matched_candidates;

**Step 4**. Aggregate insights across multiple documents for a specific ticker using [AI_AGG](https://docs.snowflake.com/LIMITEDACCESS/snowflake-cortex/ai_agg)

In [None]:
-- Summarize all research articles mapped to one ticker.
--
-- matched_candidates holds one row per matched entity mention, so a single article
-- can appear several times for the same ticker. The CTE collapses it to one row per
-- (mapped_ticker, file_name) so that count_research counts articles rather than
-- mentions, and AI_AGG receives each article's text once.
WITH ticker_articles AS (
    SELECT
        mapped_ticker,
        file_name,
        -- raw_text is carried per file in matched_candidates, so any value within the group is representative.
        ANY_VALUE(raw_text) AS raw_text
    FROM matched_candidates
    WHERE mapped_ticker = 'MSFT' -- other tickers you can also check are CRM, NVDA
    GROUP BY
        mapped_ticker,
        file_name
)
SELECT
    mapped_ticker,
    COUNT(*) AS count_research,
    AI_AGG('TICKER: ' || mapped_ticker || '\n' || raw_text, 'You are provided a couple research articles to the company; Please help me summarize in bullet points on discussions relevant to the company') AS aggregated_summary
FROM ticker_articles
GROUP BY mapped_ticker;


In [None]:
# Convert the AGGREGATED_INSIGHTS result to pandas and print the first summary.
df = AGGREGATED_INSIGHTS.to_pandas()
if df.empty:
    # No rows means no article was mapped to the selected ticker; .iloc[0] would raise IndexError.
    print("No aggregated summary was returned for the selected ticker.")
else:
    print(df['AGGREGATED_SUMMARY'].iloc[0])