## Setting up paths

In [1]:
from pathlib import Path

# dans un notebook : utilise cwd
BASE_DIR = Path.cwd().parents[2]
DATASET_DIR = BASE_DIR / "src" / "text_processing" / "dataset"
DATASET_OUTPUT_FILE = DATASET_DIR / "dataset.parquet"

print("Base dir:", BASE_DIR)
print("Dataset dir:", DATASET_DIR)
print("Dataset output file:", DATASET_OUTPUT_FILE)

Base dir: /Users/arthur/Desktop/Hackathon-FIAM-2025
Dataset dir: /Users/arthur/Desktop/Hackathon-FIAM-2025/src/text_processing/dataset
Dataset output file: /Users/arthur/Desktop/Hackathon-FIAM-2025/src/text_processing/dataset/dataset.parquet


## Parameters

In [2]:
# Behaviour switches read by the cells below.
ignore_warnings = True      # when True, DeprecationWarning messages are filtered out
export_to_parquet = True    # when True, the loaded frame is written to DATASET_OUTPUT_FILE
delete_after = True         # when True, `df` is deleted in the last cell

# First and last year of the dataset range
# (presumably meant to be passed to load_datasets_polars).
start_date = 2005
end_date = 2025

## Importing libraries

In [3]:
import os
import pickle as pkl
import polars as pl
import gc
import warnings

## Configuring warnings

In [4]:
# Silence every DeprecationWarning when requested.
# Note: the filter is installed globally for the kernel session; it is not
# limited to warnings raised inside load_datasets_polars.
if ignore_warnings:
    warnings.filterwarnings("ignore", category=DeprecationWarning)

## Load dataset converter function

In [5]:
import os
import pickle as pkl
import polars as pl
import gc

def load_datasets_polars(
    path: str = "./dataset",
    start_date: int = 2005,
    end_date: int = 2025,
    out_file: str = "./dataset/text_us.parquet"
) -> pl.DataFrame:
    """Load the yearly ``text_us_<year>.pkl`` files and stack them into one Polars frame.

    For every year in ``[start_date, end_date]`` (both inclusive) the file
    ``<path>/text_us_<year>.pkl`` is unpickled, a ``year`` column holding that
    year is added, and the result is converted to Polars. All yearly frames
    are then concatenated vertically in year order.

    Args:
        path: Directory containing the pickle files. Any value accepted by
            ``os.path.isdir`` / ``os.path.join`` works.
        start_date: First year to load (inclusive).
        end_date: Last year to load (inclusive).
        out_file: Currently unused; kept so existing calls keep working.
            This function does not write any file.

    Returns:
        The concatenation of all yearly frames, with the extra ``year`` column.

    Raises:
        FileNotFoundError: If ``path`` is not a directory or a yearly file is missing.
        ValueError: If ``start_date`` is greater than ``end_date`` (nothing to load).
        RuntimeError: If a file cannot be unpickled/converted, or if the yearly
            frames cannot be concatenated.
    """
    if not os.path.isdir(path):
        raise FileNotFoundError(f"Dataset folder not found: {path}")

    # An empty range used to fall through to `df_all.height` with df_all = None
    # and crash with an AttributeError; fail early with a clear message instead.
    if start_date > end_date:
        raise ValueError(
            f"Empty year range: start_date={start_date} is after end_date={end_date}"
        )

    frames = []     # one Polars frame per year, in chronological order
    total_rows = 0  # running row count, used for the sanity check below

    for year in range(start_date, end_date + 1):
        file_path = os.path.join(path, f"text_us_{year}.pkl")
        try:
            # NOTE(security): pickle.load can execute arbitrary code; only use
            # this on dataset files that come from a trusted source.
            with open(file_path, "rb") as f:
                df = pkl.load(f)  # expected to be a pandas DataFrame
            df["year"] = year
            df_polars = pl.from_pandas(df)
            # Release the pandas copy before loading the next year.
            del df
            gc.collect()
        except FileNotFoundError as e:
            raise FileNotFoundError(f"Missing dataset for year {year}: {file_path}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading {file_path}: {e}") from e

        total_rows += df_polars.height
        print(f"[OK] Loaded text_us_{year}.pkl ({df_polars.height} lignes, cumul={total_rows})")
        frames.append(df_polars)

    # Concatenate once at the end instead of rebuilding the frame every iteration.
    try:
        df_all = pl.concat(frames, how="vertical")
    except Exception as e:
        raise RuntimeError(f"Error concatenating yearly datasets: {e}") from e

    # Sanity check: the combined frame must hold exactly the rows that were loaded.
    print(f"\n[ℹ️] Compteur de lignes cumulées : {total_rows}")
    print(f"[ℹ️] Nombre de lignes dans df_all : {df_all.height}")

    if total_rows == df_all.height:
        print("[✅] Les deux comptes correspondent, concat ok.")
    else:
        print("[⚠️] Attention : mismatch détecté !")

    return df_all

In [6]:
# Load every yearly file for the range configured in the "Parameters" cell
# (previously the years were hard-coded here and the parameters were ignored).
df = load_datasets_polars(
    path=DATASET_DIR,
    start_date=start_date,
    end_date=end_date,
    out_file=DATASET_OUTPUT_FILE,
)
print("Loaded datasets:")

[OK] Loaded text_us_2005.pkl (16857 lignes, cumul=16857)
[OK] Loaded text_us_2006.pkl (16553 lignes, cumul=33410)
[OK] Loaded text_us_2007.pkl (16875 lignes, cumul=50285)
[OK] Loaded text_us_2008.pkl (18391 lignes, cumul=68676)
[OK] Loaded text_us_2009.pkl (18133 lignes, cumul=86809)
[OK] Loaded text_us_2010.pkl (17537 lignes, cumul=104346)
[OK] Loaded text_us_2011.pkl (17398 lignes, cumul=121744)
[OK] Loaded text_us_2012.pkl (16968 lignes, cumul=138712)
[OK] Loaded text_us_2013.pkl (17401 lignes, cumul=156113)
[OK] Loaded text_us_2014.pkl (17814 lignes, cumul=173927)
[OK] Loaded text_us_2015.pkl (17514 lignes, cumul=191441)
[OK] Loaded text_us_2016.pkl (16840 lignes, cumul=208281)
[OK] Loaded text_us_2017.pkl (16424 lignes, cumul=224705)
[OK] Loaded text_us_2018.pkl (16326 lignes, cumul=241031)
[OK] Loaded text_us_2019.pkl (16222 lignes, cumul=257253)
[OK] Loaded text_us_2020.pkl (16335 lignes, cumul=273588)
[OK] Loaded text_us_2021.pkl (17318 lignes, cumul=290906)
[OK] Loaded text_us

## Export dataset to parquet file

In [None]:
# Write the combined frame to a single Parquet file when requested.
if export_to_parquet:
    print("Starting to export dataset to parquet")  # this is fairly slow, roughly 5 minutes
    df.write_parquet(DATASET_OUTPUT_FILE)
    print(f"[✅] Exported dataset to {DATASET_OUTPUT_FILE}")

Starting to export dataset to parquet
[✅] Exported dataset to /Users/arthur/Desktop/Hackathon-FIAM-2025/src/text_processing/dataset/dataset.parquet


## Dataset verification

In [8]:
print(df.head(20))    # preview the first 20 rows of the combined frame

shape: (20, 8)
┌──────────┬─────────┬───────────┬─────┬─────────────────────────────┬──────────┬───────────┬──────┐
│ date     ┆ cik     ┆ file_type ┆ rf  ┆ mgmt                        ┆ gvkey    ┆ cusip     ┆ year │
│ ---      ┆ ---     ┆ ---       ┆ --- ┆ ---                         ┆ ---      ┆ ---       ┆ ---  │
│ str      ┆ i64     ┆ str       ┆ str ┆ str                         ┆ f64      ┆ str       ┆ i64  │
╞══════════╪═════════╪═══════════╪═════╪═════════════════════════════╪══════════╪═══════════╪══════╡
│ 20050103 ┆ 16099   ┆ 10Q       ┆     ┆ Item 2 Management s         ┆ 6831.0   ┆ 549282101 ┆ 2005 │
│          ┆         ┆           ┆     ┆ Discussion…                 ┆          ┆           ┆      │
│ 20050103 ┆ 779544  ┆ 10K       ┆     ┆ Item 7. Management's        ┆ 11872.0  ┆ 040712101 ┆ 2005 │
│          ┆         ┆           ┆     ┆ Discussio…                  ┆          ┆           ┆      │
│ 20050103 ┆ 831641  ┆ 10K       ┆     ┆ Item 7                      ┆ 24783

## Free memory

In [9]:
# Drop the notebook's reference to the frame so its memory can be reclaimed.
if delete_after:
    del df
    print("Dataset successfully released from memory.")

Dataset successfully released from memory.
