In [21]:
import xarray as xr
import pandas as pd
import numpy as np

In [22]:
# Coordinate values for a small synthetic (year, month, day, permno) panel.
years = np.array([2020, 2021])        # 2 years
months = np.array([1, 2, 3])          # 3 months
days = np.array([1, 2, 3, 4])         # 4 days

# Simulate unique permnos mapped onto non-unique permcos: each permco
# (company) can have multiple permnos (stocks), so permco values repeat.
permnos       = [1, 2, 3, 4]
permco_values = [1, 1, 2, 2]

# Axis lengths, in the same order as the dimensions of `me_values`.
Y = years.shape[0]
M = months.shape[0]
D = days.shape[0]
A = len(permnos)

# Seed the generator so a fresh kernel reproduces exactly the same panel.
SEED = 0
rng = np.random.default_rng(SEED)

# Integer market-equity values drawn from [0, Y*M*D*A), one per (year, month, day, permno).
me_values = rng.integers(low=0, high=(Y * M * D * A), size=(Y, M, D, A))

In [23]:
# Sanity check: the shape should be (Y, M, D, A), i.e. (2, 3, 4, 4) for the sizes defined above.
me_values.shape

(2, 3, 4, 4)

In [24]:
# Wrap the raw array in a labelled Dataset.
# `me` is indexed by (year, month, day, permno); `permco` is attached as an
# extra coordinate along the `permno` dimension rather than as its own dimension.
ds = xr.Dataset(
    data_vars=dict(
        me=(("year", "month", "day", "permno"), me_values),
    ),
    coords=dict(
        year=years,
        month=months,
        day=days,
        permno=permnos,
        permco=("permno", permco_values),
    ),
    attrs=dict(my_custom_dataset_attr="structure is original."),
)

In [28]:
# Aggregate `me` over the permnos that share a permco.
summe = ds.me.groupby(ds.permco).sum(dim="permno")  # Per-permco total; intended dims: (year, month, day, permco)
maxme = ds.me.groupby(ds.permco).max(dim="permno")  # Per-permco maximum; intended dims: (year, month, day, permco)

# Look up each permno's permco in the aggregated results so they line up with `ds.me` along `permno`.
maxme_broadcast = maxme.sel(permco=ds.permco)  # Per-permco maximum, selected once per permno
# Mark the entries whose value equals the maximum of their permco.
# NOTE(review): if several permnos of one permco tie for the maximum, every tied
# entry is marked and will receive the permco total below - confirm this is intended.
is_max_permno = (ds.me == maxme_broadcast) 
summe_broadcast = summe.sel(permco=ds.permco)  # Per-permco total, selected once per permno

# Where an entry is its permco's maximum, use the permco total; elsewhere keep the original value.
new_me = xr.where(is_max_permno, summe_broadcast, ds.me)

# Assign the adjusted values on a copy (`ds_updated`) rather than on `ds`.
ds_updated = ds.copy()
ds_updated["me"] = new_me



In [32]:
# Display `ds`. Note: the adjusted `me` values were assigned to `ds_updated`, not to `ds`.
ds

In [1]:
from src import DataManager
from src.data.core.operations import mean, median, mode, ema
import matplotlib.pyplot as plt
import time

In [2]:
# Create the DataManager that serves dataset requests.
dm = DataManager()

# Post-processing steps requested for the CRSP dataset, in the order listed.
crsp_processors = {
    "replace_values": {
        "source": "delistings",
        "rename": [["dlstdt", "time"]],
        "identifier": "permno",
        "from_var": "dlret",
        "to_var": "ret",
    },
    # One merge entry per `msenames` column, both keyed on permno along the asset axis.
    "merge_table": [
        {"source": "msenames", "identifier": "permno", "column": column, "axis": "asset"}
        for column in ("comnam", "exchcd")
    ],
    "set_permco_coord": True,
    "fix_market_equity": True,
}

# Request parameters: date range, frequency, row filter and the processors above.
# No symbol filter is specified in this request.
crsp_config = {
    "start_date": "2000-01-01",
    "end_date": "2024-01-01",
    "freq": "M",
    "filters": {"date__gte": "2000-01-01"},
    "processors": crsp_processors,
}

# NOTE(review): the output recorded for this cell ends in
# "RuntimeError: ... Post-processing steps failed ... 'str' object has no attribute 'columns'".
# Confirm the expected format of the processor settings against the DataManager implementation.
datasets = dm.get_data([{"data_path": "wrds/equity/crsp", "config": crsp_config}])

wrds/equity/crsp: Level 1 cache not found. Loading data using loader (id: 131820050995712).


An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.


wrds/equity/crsp: Successfully saved cache to /home/suchismit/data/cache/wrds/equity/crsp/aa7b81688d3fe9cbe1f6643285838853_1925-01-01_2023-12-01.nc


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


RuntimeError: wrds/equity/crsp: Post-processing steps failed for dataset. Error: 'str' object has no attribute 'columns'

In [4]:
# Pick the CRSP dataset out of the loaded results by its data path.
# NOTE(review): the recorded output of the loading cell shows a RuntimeError, so on a
# fresh kernel `datasets` may not exist when this cell runs - confirm with Restart & Run All.
crsp = datasets["wrds/equity/crsp"]
# Derive `adj_prc` as `prc` divided by `cfacpr` and store it on `crsp` itself.
# NOTE(review): zero values of `cfacpr` are not handled here - verify whether they can occur.
crsp["adj_prc"] = crsp["prc"] / crsp["cfacpr"]

In [5]:
# Display `crsp`, which now carries the `adj_prc` entry assigned in the previous cell.
crsp