In [54]:
import os
import requests
import pprint
import pandas as pd
import shutil
import zipfile

from bs4 import BeautifulSoup
from io import BytesIO

In [55]:
JPX_SHORT_SELLING_URL = "https://www.jpx.co.jp/english/markets/public/short-selling/index.html"

# Spoof web crawler User Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

"""
1. Crawl JPX urls
2. Find the short interest spread sheet
3. Read xls file + Load raw data into DataFrame
"""
daily_df = None
latest_file_name = None
response = requests.get(JPX_SHORT_SELLING_URL, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    xls_refs = soup.find_all('a', href=lambda href: href and 'Short_Positions.xls' in href)
    xls_urls = [ref['href'] for ref in xls_refs]
    latest_xls_url = xls_urls[0]
    latest_file_name = latest_xls_url.split('/')[-1]

    excel_url = f"https://www.jpx.co.jp{latest_xls_url}"
    excel_response = requests.get(excel_url, headers=headers)
    if excel_response.status_code == 200:
        excel_data = BytesIO(excel_response.content)
        daily_df = pd.read_excel(excel_data, engine='xlrd')

print(f'Finished processing {url}')

# Snake case mapping between JPX spreadsheet schema for easier queries
schema = [
    "date_of_calculation",
    "code_of_stock",
    "name_of_stock",
    "name_of_short_seller",
    "address_of_short_seller",
    "name_of_discretionary_investment_contractor",
    "address_of_discretionary_investment_contractor",
    "name_of_investment_fund",
    "ratio_of_short_positions_to_shares_outstanding",
    "number_of_short_positions_in_shares",
    "number_of_short_positions_in_trading_units",
    "date_of_calculation_in_previous_reporting",
    "ratio_of_short_positions_in_previous_reporting",
    "notes"
]

cleaned_rows = []
data_rows = daily_df.values.tolist()

for row in data_rows:
    cleaned_row = row[1:]
    # Filter only for metaplanet stock
    if cleaned_row[1] == 3350:
        # Filter out Name of Stock (Japanese) name
        cleaned_rows.append(cleaned_row[:2] + cleaned_row[3:])
### Debugging purposes
# print(schema)
# print(cleaned_data)

date_column = 'date_of_calculation'
prev_date_column = 'date_of_calculation_in_previous_reporting'
df = pd.DataFrame(cleaned_rows, columns=schema)
df[date_column] = pd.to_datetime(df[date_column], errors='coerce').dt.strftime('%Y-%m-%d')
df[prev_date_column] = pd.to_datetime(df[prev_date_column], errors='coerce').dt.strftime('%Y-%m-%d')
print(df)

Finished processing https://www.jpx.co.jp/english/markets/public/short-selling/index.html
  date_of_calculation  code_of_stock    name_of_stock  \
0          2025-07-04           3350  Metaplanet Inc.   
1          2025-07-07           3350  Metaplanet Inc.   

              name_of_short_seller  \
0  Barclays Capital Securities Ltd   
1             モルガン・スタンレーMUFG証券株式会社   

                             address_of_short_seller  \
0  1 Churchill Place, London, E14 5HP, United Kin...   
1               東京都千代田区大手町一丁目９番７号大手町フィナンシャルシティサウスタワー   

   name_of_discretionary_investment_contractor  \
0                                          NaN   
1                                          NaN   

   address_of_discretionary_investment_contractor  name_of_investment_fund  \
0                                             NaN                      NaN   
1                                             NaN                      NaN   

   ratio_of_short_positions_to_shares_outstanding  \
0              

In [56]:
# Download the dataset as a zip file
os.makedirs(DIRECTORY, exist_ok=True)
os.system(f'kaggle datasets download -d {KAGGLE_USERNAME}/{DATASET_NAME} -p {DIRECTORY} --unzip')

# Find the CSV file
csv_files = [f for f in os.listdir(DIRECTORY) if f.endswith('.csv')]
csv_path = os.path.join(DIRECTORY, csv_files[0])
print(f"Found CSV: {csv_path}")

snapshot_df = pd.read_csv(csv_path)
snapshot_df[date_column] = pd.to_datetime(snapshot_df[date_column], errors='coerce').dt.strftime('%Y-%m-%d')
snapshot_df[prev_date_column] = pd.to_datetime(snapshot_df[prev_date_column], errors='coerce').dt.strftime('%Y-%m-%d')

Dataset URL: https://www.kaggle.com/datasets/lilfatdog/metaplanet-daily-short-positions
License(s): CC0-1.0
Downloading metaplanet-daily-short-positions.zip to /Users/eddyk/eddyk/metaplanet/shorts/dataset

Found CSV: /Users/eddyk/eddyk/metaplanet/shorts/dataset/metaplanet_daily_short_positions.csv


100%|██████████| 12.5k/12.5k [00:00<00:00, 16.5MB/s]


In [57]:
merged_df = pd.concat([df, snapshot_df], ignore_index=True)
merged_df = merged_df.sort_values(by=date_column, ascending=False)
merged_df = merged_df.drop_duplicates()
# print(merged_df)
merged_df.to_csv(os.path.join(DIRECTORY, 'metaplanet_daily_short_positions.csv'), index=False)

# Move the updated CSV for upload
try:
    shutil.copy(csv_path, DIRECTORY)
except Exception as e:
    print(f"Exception: {e}")

# Use the Kaggle API to create a new version
message = f"Added new data for {latest_file_name}"
os.system(f'kaggle datasets version -p {DIRECTORY} -m "{message}"')

Exception: '/Users/eddyk/eddyk/metaplanet/shorts/dataset/metaplanet_daily_short_positions.csv' and '/Users/eddyk/eddyk/metaplanet/shorts/dataset/metaplanet_daily_short_positions.csv' are the same file
Starting upload for file metaplanet_daily_short_positions.csv


100%|██████████| 112k/112k [00:00<00:00, 158kB/s]


Upload successful: metaplanet_daily_short_positions.csv (112KB)
Skipping folder: .ipynb_checkpoints; use '--dir-mode' to upload folders
Dataset version is being created. Please check progress at https://www.kaggle.com/lilfatdog/metaplanet-daily-short-positions


0