In [1]:
import numpy as np
import pandas as pd
import os

# Work from /content so the relative 'data/...' paths used further down resolve against it.
os.chdir('/content/')
os.getcwd()  # last expression of the cell: shows the resulting working directory

'/content'

In [2]:
## Reading data
# Absolute paths to the single-day buy and sell history CSVs (2025-12-15 files).
buy_data_path = '/content/data/buy_orders/buy_history_2025-12-15.csv'
sell_data_path = '/content/data/sell_orders/sell_history_2025-12-15.csv'

## Buys

In [4]:
# Load the single-day buy history from buy_data_path and preview the first rows.
buys = pd.read_csv(buy_data_path)
buys.head()


FileNotFoundError: [Errno 2] No such file or directory: '/content/data/buy_orders/buy_history_2025-12-15.csv'

In [None]:
# Total quantity bought per item, largest totals first.
(
    buys
    .groupby('item_name')
    .agg(quantity=('quantity', 'sum'))
    .sort_values('quantity', ascending=False)
)

## Sells

In [None]:
# Load the single-day sell history from sell_data_path and preview the first rows.
sells = pd.read_csv(sell_data_path)
sells.head()

In [None]:
# Total quantity sold per item, largest totals first.
(
    sells
    .groupby('item_name')
    .agg(quantity=('quantity', 'sum'))
    .sort_values('quantity', ascending=False)
)

In [None]:
import glob


def load_order_history(pattern):
    """Read every CSV matching ``pattern`` into one de-duplicated DataFrame.

    Parameters
    ----------
    pattern : str
        Glob pattern for the history files, resolved against the current
        working directory when it is relative.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files stacked in sorted-path order, with exact
        duplicate rows removed and a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``. Without this check ``pd.concat``
        fails with an unhelpful "No objects to concatenate".
    """
    # glob returns paths in arbitrary order; sorting keeps row order reproducible.
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError(
            f"No order history files match {pattern!r} (cwd: {os.getcwd()})"
        )
    frames = [pd.read_csv(path) for path in paths]
    # ignore_index avoids repeated per-file index labels in the combined frame.
    combined = pd.concat(frames, ignore_index=True)
    return combined.drop_duplicates().reset_index(drop=True)


# Combines all buy history files
dfBuy = load_order_history('data/buy_orders/buy_history_*.csv')

# Combines all sell history files
dfSell = load_order_history('data/sell_orders/sell_history_*.csv')

In [None]:
# Hand-written demo records, each a string of the form '<quantity> of <item> <day>. <month>.<year>'.
bought_demo = ['100 of sigil of bloodlust 8. 12.2025', '24 of sigil of bloodlust 8. 12.2025']
sold_demo = ['50 of sigil of bloodlust 8. 12.2025', '23 of sigil of bloodlust 9. 12.2025', '11 of sigil of bloodlust 10. 12.2025', '2 of sigil of bloodlust 12. 12.2025', '33 of sigil of bloodlust 16. 12.2025' ]# the last entry is deliberately dated more than 7 days after the buys
# Open question: should the price each order was filled at be included? It would not hurt, but I
# usually sell at market price - 1 copper to squeeze out margin. A model could perhaps predict a
# faster sale at a different price point, but that is a nice-to-have. What I want to know is how
# fast an item sells at market price - 1c, so for this use case the fill price may not matter.

# Survival analysis with Weibull Distribution

$$f(t) = \frac{k}{\lambda} \left( \frac{t}{\lambda} \right)^{k-1} e^{-\left( \frac{t}{\lambda} \right)^k}$$

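This function shows how the failure probability density changes over time. The shape parameter $k$ (called `rho_` in lifelines) determines whether failures are more likely early ($k < 1$), occur at a constant rate over time ($k = 1$), or become more likely with age ($k > 1$); $\lambda$ (called `lambda_` in lifelines) is the scale parameter.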

The cumulative distribution function (CDF) gives the probability that failure occurs by a specific time:
$$F(t) = 1 - e^{-\left( \frac{t}{\lambda} \right)^k}$$

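This function works well for calculating the percentage of items expected to fail within a given timeframe, which is what I need for my use case: I want to see whether, say, 90% get sold ('fail') within a week. Is this the same thing as the probability to fill? By definition $F(t) = P(T \le t)$, so for a single listing it is the probability that the listing fills within time $t$, provided the listing resembles the orders the model was fitted on. I feel an itch to use Bayes' theorem, but I am not completely sure it is needed.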

The reliability function $S(t) = 1 - F(t)$ provides the complementary perspective, showing the probability of survival beyond time $t$.

The hazard function reveals the instantaneous failure rate at any given time. This is useful to model undercut probability after a certain time period should I ever need this.

$$h(t) = \frac{k}{\lambda} \left( \frac{t}{\lambda} \right)^{k-1}$$

In [None]:
from lifelines.fitters.weibull_fitter import WeibullFitter
#!pip install lifelines
from lifelines import WeibullFitter as wf
import json
from datetime import datetime

def fit_item_models(df, min_observations=3):
    """Fit a Weibull distribution to the fill times of each item.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``item_name`` (grouping key) and
        ``time_to_fill_hours`` (durations to fit).
    min_observations : int, default 3
        Minimum number of non-missing fill times an item needs in order to
        be fitted; items with fewer usable rows are skipped.

    Returns
    -------
    dict
        One entry per fitted item. The key is the group key converted to
        ``int`` when that is possible and to ``str`` otherwise, so it is
        always JSON-serialisable. Each value is a dict with ``item_name``,
        the fitted ``lambda_`` (scale) and ``rho_`` (shape), plus
        ``n_observations`` and the median/mean/std of the fill hours.
    """
    item_distributions = {}

    # Groupby item
    for item_key, group in df.groupby('item_name'):
        durations = group['time_to_fill_hours'].dropna()
        # Count usable rows after dropping NaN: a group can be large enough
        # on paper and still have too few (or zero) durations to fit.
        if durations.empty or len(durations) < min_observations:
            continue
        item_name = group['item_name'].iloc[0]

        # NOTE(review): lifelines is expected to reject non-positive durations;
        # confirm, and decide how instant (0 hour) fills should be handled.
        # The local name deliberately avoids the `wf` import alias.
        fitter = WeibullFitter()
        fitter.fit(durations)

        # The group key is an item name here, so int() fails for ordinary
        # text names; fall back to the string form instead of crashing.
        try:
            key = int(item_key)
        except (TypeError, ValueError):
            key = str(item_key)

        item_distributions[key] = {
            'item_name': item_name,
            'lambda_': float(fitter.lambda_),  # scale parameter
            'rho_': float(fitter.rho_),  # shape parameter
            'n_observations': len(durations),
            'median_fill_hours': float(durations.median()),
            'mean_fill_hours': float(durations.mean()),
            'std_fill_hours': float(durations.std()),
        }
    return item_distributions


In [None]:
# Fit one Weibull model per item on the combined sell history, then persist the result.
print("Fitting models  per item...")
models = fit_item_models(dfSell,min_observations=3)
# f-string so the number of fitted items is interpolated rather than printed literally.
print(f"Done for {len(models)} items")
# Save to JSON (relative path, resolved against the current working directory)
with open('data/item_distributions.json', 'w') as f:
    json.dump(models, f, indent=2)
print("Models saved")

In [None]:
def calculate_fill_probability(lambda_, rho_, hours):
    """Probability that an item fills within ``hours`` under a Weibull model.

    Evaluates the Weibull CDF ``F(t) = 1 - exp(-(t / lambda) ** rho)``.

    Parameters
    ----------
    lambda_ : float or array-like
        Scale parameter; must be positive.
    rho_ : float or array-like
        Shape parameter; must be positive.
    hours : float or array-like
        Time horizon in hours. Negative values are treated as 0, giving
        probability 0.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Fill probability in ``[0, 1]``, broadcast over array inputs.

    Raises
    ------
    ValueError
        If ``lambda_`` or ``rho_`` is not strictly positive.
    """
    if np.any(np.asarray(lambda_) <= 0) or np.any(np.asarray(rho_) <= 0):
        raise ValueError("Weibull lambda_ and rho_ must be positive")
    # No probability mass lies before t = 0; clamping also avoids NaN or
    # complex results from raising a negative base to a fractional power.
    elapsed = np.maximum(hours, 0)
    # -expm1(-x) equals 1 - exp(-x) but keeps precision when x is tiny.
    return -np.expm1(-((elapsed / lambda_) ** rho_))

# Example: probability that each item fills within a 24-hour horizon.
time_horizon_hours = 24

print(f"\nFill probabilities within {time_horizon_hours} hours:")
preview = list(models.items())[:10]  # Show first 10
for item_id, model in preview:
    prob = calculate_fill_probability(
        model['lambda_'],
        model['rho_'],
        time_horizon_hours,
    )
    print(f"{model['item_name']}: {prob:.1%} (shape={model['rho_']:.2f})")

# Optimizer
This should probably move to a separate file later, especially if I end up saving the probabilities somewhere to save time and only recalculate them occasionally.

In [None]:
#For optimizer
# Tunable inputs for the planned optimizer; nothing in this cell runs an optimization yet.
budget = 1000 #gold
time_horizon = 1 #days
min_margin=.05 # presumably the minimum acceptable margin as a fraction (5%) — TODO confirm
min_fill_prob = .9 # presumably the minimum fill probability an item must reach (90%) — TODO confirm
max_transactions = 50 #batch of up to 250 items. This is to prevent optimizer solutions that would have me click through 1000 buy orders.

z = ...# placeholder (Ellipsis) for the objective: sum of sell price of items meeting the margin threshold and min_fill_prob
full_item_list = pd.read_csv('data/my_trading_items.csv')[["item_name"]] # item names only; purpose still undecided (author's note)
print(full_item_list)

