In [18]:
import dotenv
dotenv.load_dotenv('../.secrets', override=True)
!pip install rc_sqlalchemy_cfg 

Looking in indexes: https://pypi.org/simple, https://****@pkgs.dev.azure.com/ResonanzCapitalWebDev/RCA-AZ-APP-WEB02-Dev/_packaging/rc_models_package/pypi/simple


In [62]:
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from rc_sqlalchemy_cfg.sqlalchemy_cfg import configure_base_model

def get_session():
    engine = create_engine(os.getenv('ENSEMBLE_DB_URI'))
    session_maker = sessionmaker(autocommit=False, autoflush=True, bind=engine)
    session = scoped_session(session_maker)
    configure_base_model(session)
    return session

ens_session = get_session()

In [35]:
import re
from rc_sqlalchemy_cfg.models import InvestmentEntityDocument, InvestmentEntity

def get_doc_to_product():
    doc_product = {int(re.search(r'"ID":\s*(\d+)', d.object_version).group(1)): d.product_id for d in 
            ens_session.query(InvestmentEntity.product_id, InvestmentEntityDocument.object_version).join(InvestmentEntityDocument).all()}
    return doc_product
    
doc_to_product = get_doc_to_product()

In [36]:
doc_to_product[123730]

KeyError: 123730

In [37]:
import numpy as np
import pandas as pd
import pyodbc

conn = pyodbc.connect("DSN=Azure Databricks", autocommit=True)
doc_dates = conn.execute(
    "select document_mf_id, document_type, document_date from fund_document_v3 "
    "where document_type in ('quarterly_report', 'monthly_report')").fetchall()

doc_dates = pd.DataFrame([list(d) for d in doc_dates],
                        columns=['document_mf_id', 'document_type', 'document_date'])
doc_dates['end_date'] = pd.to_datetime(doc_dates['document_date'])

# start_date is 3 months before end_date for quarterly reports and 1 month before end_date for monthly reports
doc_dates['start_date'] = doc_dates['end_date'] - np.where(doc_dates['document_type'] == 'quarterly_report', 3, 1) * np.timedelta64(1, 'M')
doc_dates = doc_dates.drop(columns=['document_date', 'document_type'])
doc_dates['product_id'] = doc_dates['document_mf_id'].map(doc_to_product)
doc_dates = doc_dates.dropna()

In [40]:
doc_dates

Unnamed: 0,document_mf_id,end_date,start_date,product_id
0,123952,2024-03-31 00:00:00+00:00,2024-01-01 00:00:00+00:00,486.0
1,123951,2024-03-31 00:00:00+00:00,2024-01-01 00:00:00+00:00,6771.0
2,123779,2024-03-31 00:00:00+00:00,2024-01-01 00:00:00+00:00,203.0
4,123731,2024-03-31 00:00:00+00:00,2024-01-01 00:00:00+00:00,281.0
5,123678,2024-03-31 00:00:00+00:00,2024-01-01 00:00:00+00:00,268.0
...,...,...,...,...
734,89742,2022-09-30 00:00:00+00:00,2022-08-30 00:00:00+00:00,1029.0
735,101535,2023-03-31 00:00:00+00:00,2023-02-28 00:00:00+00:00,672.0
736,101536,2023-03-31 00:00:00+00:00,2023-02-28 00:00:00+00:00,674.0
737,101539,2023-03-31 00:00:00+00:00,2023-02-28 00:00:00+00:00,995.0


In [78]:
from sqlalchemy import func
from rc_sqlalchemy_cfg.models import ProductReturn


def query_returns(queries):
    """
    queries: A list of tuples, each containing (parent_id, start_date, end_date)
    Returns a list of tuples with the average return for each condition
    """
    results = []
    for i, row in queries.iterrows():
        avg_return = ens_session.query(func.avg(ProductReturn.value)).filter(
            ProductReturn.product_id == row['product_id'],
            ProductReturn.value_date >= row['start_date'],
            ProductReturn.value_date <= row['end_date']
        ).scalar()
        results.append((row['document_mf_id'], avg_return))
    return results

doc_returns = pd.DataFrame(query_returns(doc_dates), columns=['document_mf_id', 'avg_return']).dropna()

In [79]:
# pickle result
import pickle
with open('doc_returns.pkl', 'wb') as f:
    pickle.dump(doc_returns, f)

In [26]:
# USE PROD
