In [168]:
import ast
import re

import pandas as pd

from pathlib import Path

from datasets import load_dataset, Dataset, DatasetDict
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

In [2]:
root_dir = Path.cwd()

In [3]:
dataset = load_dataset("semeru/code-text-python", cache_dir=root_dir / "data" / "cache")

Found cached dataset json (/home/paul/projects/edu/master/mdl-ii/src/data/cache/semeru___json/semeru--code-text-python-047c4cae5928dad3/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
dataset.set_format(type="pandas")
train_df = dataset["train"][:]
test_df = dataset["test"][:]
validation_df = dataset["validation"][:]

# Data Preperation
## Remove Python 2

In [158]:
def remove_python2(df: pd.DataFrame) -> pd.DataFrame:
    """Remove python2 code from a DataFrame"""
    def _filter(code: str) -> bool:
        """Filter the given code ."""
        try:
            ast.parse(code)
        except SyntaxError:
            return False
        return True
    print(type(df))
    return df[df["code"].map(_filter)]

## Remove Doc-Strings

In [157]:
def remove_docstring(df: pd.DataFrame) -> pd.DataFrame:
    """Remove the docstring from a DataFrame ."""
    re_docstring = re.compile(r'^(\b(def.+)\s+("{3}[\w\W]*?"{3}))')
    def _transform(code: str) -> str:
        """Transform code to a Python docstring ."""
        return re_docstring.sub(r"\2", code)
    return df.applymap(_transform)

In [164]:
pipeline = Pipeline([
    ("remove python2", FunctionTransformer(remove_python2)),
    ("transform code", ColumnTransformer(
        transformers=[
            ("", FunctionTransformer(remove_docstring), ["code"]),
            ("passthrough", 'passthrough', ["code_tokens", "docstring", "docstring_tokens", "original_string"]),
        ],
        remainder="drop",
        verbose_feature_names_out=False
    ))
])
pipeline.set_output(transform="pandas")





In [165]:
train_df = pipeline.fit_transform(train_df)
test_df = pipeline.fit_transform(test_df)
validation_df = pipeline.fit_transform(validation_df)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [171]:
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df),
    "validation": Dataset.from_pandas(validation_df),
})

In [173]:
for split, ds in dataset.items():
    ds.to_json(root_dir / "data" / f"{split}.jsonl")

Creating json from Arrow format:   0%|          | 0/250 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]