In [2]:
#!/usr/bin/env python3
"""
Multilingual Documented Code Extractor
--------------------------------------
Fetches code samples with comments/docstrings from BigCode's The Stack dataset.

✅ Supports multiple languages (Python, JavaScript, Java, C, Go, Rust, PHP, Ruby, TypeScript)
✅ Streams safely (no full dataset download)
✅ Saves JSONL output for easy downstream analysis

Requirements:
    pip install datasets huggingface_hub
"""

from datasets import load_dataset
import json
from itertools import islice

# ----------- CONFIG -------------
# Target languages; each entry is used as the `data/<language>` directory of the dataset.
LANGUAGES = [
    'python',
    'javascript',
    'java',
    'c',
    'go',
    'rust',
    'php',
    'ruby',
    'typescript',
]
# Number of documented files to keep for every language.
SAMPLES_PER_LANG = 3
# Destination of the JSON Lines output.
OUTPUT_FILE = "./data/documented_code_samples.jsonl"
# --------------------------------

def has_docs(example):
    """Heuristically decide whether a sample contains comments or docstrings.

    Returns True as soon as the text stored under ``example["content"]``
    contains any known comment/docstring delimiter, otherwise False.
    """
    source_text = example["content"]
    # Triple quotes cover Python docstrings; the rest are line/block comment markers.
    comment_markers = ('"""', "'''", "#", "//", "/*", "*/")
    for token in comment_markers:
        if token in source_text:
            return True
    return False

def sample_stream(stream, n):
    """Return the leading ``n`` items of ``stream`` as a list (fewer if it runs out)."""
    head = islice(stream, n)
    return [item for item in head]

def collect_samples(language, limit):
    """Stream one language of The Stack and keep the first documented files.

    Args:
        language: Language name; selects the ``data/<language>`` directory of
            the ``bigcode/the-stack`` dataset.
        limit: Maximum number of documented samples to return. Zero or a
            negative value yields an empty list.

    Returns:
        A list of at most ``limit`` dicts with the keys ``language``, ``repo``,
        ``path`` and ``code``. ``language`` falls back to the ``language``
        argument, and ``repo``/``path`` fall back to ``""``, when the example
        lacks the corresponding field.
    """
    print(f"\nüîπ Loading language: {language}")
    ds = load_dataset("bigcode/the-stack", data_dir=f"data/{language}", split="train", streaming=True)

    print(f"  Filtering documented samples for {language} ...")
    documented = (ex for ex in ds if has_docs(ex))

    # islice bounds the stream up front, so a limit of 0 really returns nothing
    # (checking the size only after appending always kept at least one sample),
    # and no example beyond the last kept one is pulled from the stream.
    collected = [
        {
            "language": ex.get("lang", language),
            "repo": ex.get("max_stars_repo_name", ""),
            "path": ex.get("max_stars_repo_path", ""),
            "code": ex["content"],
        }
        for ex in islice(documented, max(limit, 0))
    ]

    print(f"  ‚úÖ Collected {len(collected)} documented {language} files.")
    return collected

def main():
    """Collect documented samples for every configured language and save them as JSONL.

    Iterates over ``LANGUAGES`` and gathers up to ``SAMPLES_PER_LANG`` samples
    per language via ``collect_samples``. A language that raises is reported
    and skipped so the remaining languages are still processed. All gathered
    samples are then written to ``OUTPUT_FILE``, one JSON object per line,
    encoded as UTF-8.
    """
    import os  # local import: only needed here to prepare the output directory

    all_samples = []
    for lang in LANGUAGES:
        try:
            all_samples.extend(collect_samples(lang, SAMPLES_PER_LANG))
        except Exception as e:
            # Deliberate best-effort: one failing language must not abort the run.
            print(f"‚ö†Ô∏è Skipping {lang} due to error: {e}")

    print(f"\nüíæ Writing {len(all_samples)} total samples to {OUTPUT_FILE} ...")
    # open(..., "w") does not create missing parent directories, so make sure
    # the destination exists instead of failing after all the streaming work.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for item in all_samples:
            # ensure_ascii=False keeps non-ASCII source text readable in the file.
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print("\n‚úÖ Done! Your dataset is ready.")
    print(f"üëâ Output: {OUTPUT_FILE}")

# Run the extraction when this code is executed directly (as a script or in a
# notebook cell, where __name__ is "__main__"); importing it as a module does nothing.
if __name__ == "__main__":
    main()



üîπ Loading language: python
  Filtering documented samples for python ...
  ‚úÖ Collected 3 documented python files.

üîπ Loading language: javascript
  Filtering documented samples for javascript ...
  ‚úÖ Collected 3 documented javascript files.

üîπ Loading language: java
  Filtering documented samples for java ...
  ‚úÖ Collected 3 documented java files.

üîπ Loading language: c
  Filtering documented samples for c ...
  ‚úÖ Collected 3 documented c files.

üîπ Loading language: go
  Filtering documented samples for go ...
  ‚úÖ Collected 3 documented go files.

üîπ Loading language: rust
  Filtering documented samples for rust ...
  ‚úÖ Collected 3 documented rust files.

üîπ Loading language: php
  Filtering documented samples for php ...
  ‚úÖ Collected 3 documented php files.

üîπ Loading language: ruby
  Filtering documented samples for ruby ...
  ‚úÖ Collected 3 documented ruby files.

üîπ Loading language: typescript
  Filtering documented samples for typescript .

In [3]:
# Load the JSON Lines file back into memory: one dict per non-empty line.
# The file is written as UTF-8 with ensure_ascii=False, so decode it as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open('./data/documented_code_samples.jsonl', 'r', encoding='utf-8') as file:
    # Skip blank lines (e.g. a trailing newline) that json.loads would reject.
    data = [json.loads(line) for line in file if line.strip()]
data  # Now this is a list of dicts, one per line in your .jsonl file

[{'language': 'Python',
  'repo': 'CNDB/CNDB',
  'path': 'spider/openwrt.py',
  'code': '#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# #*** <License> ************************************************************#\n# This module is part of the repository CNDB.\n#\n# This module is licensed under the terms of the BSD 3-Clause License\n# <http://www.c-tanzer.at/license/bsd_3c.html>.\n# #*** </License> ***********************************************************#\n\nfrom   _TFL.pyk           import pyk\n\nfrom   rsclib.HTML_Parse  import tag, Page_Tree\nfrom   rsclib.autosuper   import autosuper\nfrom   spider.common      import Interface, Inet4, Inet6, unroutable\nfrom   spider.common      import WLAN_Config\nfrom   spider.luci        import Version_Mixin\n\nclass Status (Page_Tree, Version_Mixin) :\n    url          = \'cgi-bin/luci/freifunk/status/status\'\n    retries      = 2\n    timeout      = 10\n    html_charset = \'utf-8\' # force utf-8 encoding\n\n    wl_names = dict \\\n        (

In [4]:
import pandas as pd

# Build a table with one row per loaded sample; columns come from the dict keys.
df = pd.DataFrame(data=data)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,language,repo,path,code
0,Python,CNDB/CNDB,spider/openwrt.py,#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# ...
1,Python,adcrn/knest,utils/compare.py,# UCF Senior Design 2017-18\n# Group 38\n\nfro...
2,Python,ConverJens/pipelines,sdk/python/kfp/__main__.py,# Copyright 2018 Google LLC\n#\n# Licensed und...
3,JavaScript,pvlugter/cloudstate,node-support/test/crdts/pncounter-test.js,/*\n * Copyright 2019 Lightbend Inc.\n *\n * L...
4,JavaScript,Vladi-57/project-xr,server/routes/patient-router.js,import express from 'express'\n\n// const Pat...


In [5]:
import os

# Write each sample's source code to its own text file under ./data/txt_files.
txt_dir = './data/txt_files'
os.makedirs(txt_dir, exist_ok=True)
# Series.items() yields (index label, value) pairs, so file names keep using the
# DataFrame index just as iterrows() did, without building a Series per row.
for i, code in df['code'].items():
    # Source files may contain non-ASCII characters; write UTF-8 explicitly
    # rather than depending on the platform's default encoding.
    with open(f'{txt_dir}/row_{i}.txt', 'w', encoding='utf-8') as file:
        file.write(f"Code:\n{code}\n")