In [1]:
import multiprocessing
import os

# multiprocessing.cpu_count() reports every logical CPU on the host; it does not
# account for CPU affinity, so it is labelled as a host figure, not "available".
print(f"Logical CPUs on host (via multiprocessing): {multiprocessing.cpu_count()}")

# The affinity mask is the set of CPUs this process may be scheduled on
# (the API is not present on every platform, hence the guard).
if hasattr(os, "sched_getaffinity"):
    print(f"CPUs usable by this process (via affinity): {len(os.sched_getaffinity(0))}")

Available CPU cores (via multiprocessing): 96


In [2]:
# Report the CPU and GPU resources visible to this kernel.
#
# os.cpu_count(), /proc/cpuinfo and lscpu all describe the *host* machine. Inside
# a container that is restricted by a cgroup CPU quota they can report far more
# CPUs than the process is allowed to use, so the usable count is derived
# separately from the affinity mask and the cgroup quota.
import math
import os
import subprocess
import platform


def _cgroup_cpu_limit():
    """Return the cgroup CPU quota rounded up to whole CPUs, or None if unlimited.

    Reads cgroup v2 (``cpu.max``) first and falls back to cgroup v1
    (``cpu.cfs_quota_us`` / ``cpu.cfs_period_us``). Returns None when no quota
    is configured or the files cannot be read (e.g. not on Linux).
    """
    try:
        with open("/sys/fs/cgroup/cpu.max") as fh:
            quota, period = fh.read().split()
        if quota == "max":
            return None  # v2 hierarchy present, no quota set
        quota, period = int(quota), int(period)
    except (OSError, ValueError):
        # No usable cgroup v2 file; try the v1 layout.
        try:
            with open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") as fh:
                quota = int(fh.read())
            with open("/sys/fs/cgroup/cpu/cpu.cfs_period_us") as fh:
                period = int(fh.read())
        except (OSError, ValueError):
            return None
    if quota <= 0 or period <= 0:  # v1 uses -1 for "no limit"
        return None
    return max(1, math.ceil(quota / period))


def _usable_cpu_count():
    """Return the smallest of host CPU count, affinity mask size and cgroup quota."""
    candidates = [os.cpu_count() or 1]
    if hasattr(os, "sched_getaffinity"):
        candidates.append(len(os.sched_getaffinity(0)))
    quota_cpus = _cgroup_cpu_limit()
    if quota_cpus is not None:
        candidates.append(quota_cpus)
    return min(candidates)


print("=== CPU INFORMATION ===")
cpu_count = os.cpu_count()
print(f"CPU cores reported by Python: {cpu_count}")
usable_cpus = _usable_cpu_count()
print(f"CPUs usable by this process (affinity / cgroup quota): {usable_cpus}")

# Try to get more detailed CPU info
if platform.system() == "Linux":
    # Count `processor` entries. These are logical CPUs (hyperthreads included),
    # not physical cores; the variable name is kept for any cell that reads it.
    try:
        physical_cores = subprocess.check_output(
            ["grep", "-c", "^processor", "/proc/cpuinfo"]
        ).decode().strip()
        print(f"CPU cores from /proc/cpuinfo: {physical_cores}")
    except (OSError, subprocess.CalledProcessError):
        print("Couldn't read /proc/cpuinfo")

    # Get a more comprehensive view with lscpu
    try:
        lscpu_output = subprocess.check_output(["lscpu"]).decode()
        print("\nlscpu output:")
        print(lscpu_output)
    except (OSError, subprocess.CalledProcessError):
        print("lscpu command not available")

# Check GPU information if CUDA is available
print("\n=== GPU INFORMATION ===")
try:
    import torch
except ImportError:
    # Only a missing PyTorch triggers the fallback; errors raised while querying
    # CUDA below are allowed to surface instead of being mistaken for this case.
    print("PyTorch not available, trying nvidia-smi instead")
    try:
        nvidia_smi = subprocess.check_output(["nvidia-smi"]).decode()
        print(nvidia_smi)
    except (OSError, subprocess.CalledProcessError):
        print("nvidia-smi command not available")
else:
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"CUDA version: {torch.version.cuda}")
        print(f"Number of CUDA devices: {torch.cuda.device_count()}")

        # Get details about each visible GPU
        for i in range(torch.cuda.device_count()):
            print(f"\nGPU {i}: {torch.cuda.get_device_name(i)}")
            # Get device properties
            props = torch.cuda.get_device_properties(i)
            print(f"  Compute capability: {props.major}.{props.minor}")
            print(f"  Total memory: {props.total_memory / 1024**3:.2f} GB")
            print(f"  Multi-processor count: {props.multi_processor_count}")
            # This is NOT the same as CUDA cores, but related
            print("  CUDA cores per MP: Varies by architecture")

# For NVIDIA A40 specifically.
# These are static reference figures, not values queried from the device above;
# they are printed regardless of which GPU (if any) was detected.
print("\n=== A40 GPU SPECIFICATIONS ===")
print("NVIDIA A40:")
print("- CUDA Cores: 10,752")
print("- Tensor Cores: 336")
print("- RT Cores: 84")
print("- Memory: 48 GB GDDR6")
print("- Memory Bandwidth: 696 GB/s")
print("\nNote: When calculating cores for processing workloads,")
# Use the detected figure rather than a hard-coded core count.
print(f"use the {usable_cpus} CPU cores for CPU-based tasks and")
print("PyTorch/TensorFlow will automatically use the")
print("GPU CUDA cores for compatible operations.")

=== CPU INFORMATION ===
CPU cores reported by Python: 96
CPU cores from /proc/cpuinfo: 96

lscpu output:
Architecture:                         x86_64
CPU op-mode(s):                       32-bit, 64-bit
Address sizes:                        52 bits physical, 57 bits virtual
Byte Order:                           Little Endian
CPU(s):                               96
On-line CPU(s) list:                  0-95
Vendor ID:                            GenuineIntel
Model name:                           Intel(R) Xeon(R) Gold 6342 CPU @ 2.80GHz
CPU family:                           6
Model:                                106
Thread(s) per core:                   2
Core(s) per socket:                   24
Socket(s):                            2
Stepping:                             6
CPU max MHz:                          3500.0000
CPU min MHz:                          800.0000
BogoMIPS:                             5600.00
Flags:                                fpu vme de pse tsc msr pae mce cx8 ap

In [1]:
!pip install -r requirements.txt

Collecting numpy==2.2.3 (from -r requirements.txt (line 1))
  Downloading numpy-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tabulate==0.9.0 (from -r requirements.txt (line 2))
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting torch==2.1.1 (from -r requirements.txt (line 3))
  Downloading torch-2.1.1-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting tqdm==4.66.1 (from -r requirements.txt (line 4))
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.35.2 (from -r requirements.txt (line 5))
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123

Setting up data processing with 90 parallel processes
Targeting shard size of 3GB



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 1053, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.p

In [None]:

# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell executes, so edits to the local `setup` module are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# `setup` comes from a module that is not part of this notebook.
# NOTE(review): the recorded output of this cell ("Setting up data processing
# with ... parallel processes", per-shard "Saved shard ..." messages) suggests
# setup() runs the full dataset sharding job - confirm against setup.py.
from setup import setup
setup()


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 1053, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.p

Setting up data processing with 24 parallel processes
Using 250000 examples per shard
Loading dataset...


Resolving data files:   0%|          | 0/2110 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

Processing training split...
Processing 9,672,101 examples with optimized parallel approach
Will create approximately 39 shards

Processing shard 1/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 0 with 250,000 examples in 56.29 seconds

Processing shard 2/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 1 with 250,000 examples in 85.65 seconds

Processing shard 3/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 2 with 250,000 examples in 84.09 seconds

Processing shard 4/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 3 with 250,000 examples in 76.46 seconds

Processing shard 5/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 4 with 250,000 examples in 83.78 seconds

Processing shard 6/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 5 with 250,000 examples in 79.64 seconds

Processing shard 7/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 6 with 250,000 examples in 81.47 seconds

Processing shard 8/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 7 with 250,000 examples in 85.41 seconds

Processing shard 9/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 8 with 250,000 examples in 81.83 seconds

Processing shard 10/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 9 with 250,000 examples in 83.43 seconds

Processing shard 11/39 (250,000 examples)


Processing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Saved shard 10 with 250,000 examples in 80.68 seconds

Processing shard 12/39 (250,000 examples)


In [None]:
# Kernel sanity check: a bare expression whose value is echoed as the cell output.
2 + 2