In [9]:
import ast
import re
import os
import json

from abc import ABC
from pathlib import Path
from enum import Enum, auto

from transformers import AutoTokenizer, AutoModelWithLMHead, SummarizationPipeline

In [10]:
# Working directory of the kernel when this cell runs. The default model path
# and the directories processed further down are all built relative to it.
root_dir = Path.cwd()

In [20]:
class FileType(Enum):
    """Kinds of source file the docstring utilities distinguish."""

    PY = 1       # plain Python source ("*.py")
    IPYNB = 2    # Jupyter notebook ("*.ipynb")
    UNKNOWN = 3  # no concrete file type selected

class AbstractDocStringUtil(ABC):
    """Base class that inserts model-generated docstrings into source files.

    Subclasses set ``FILETYPE`` and implement ``_process_file``. This class
    provides file discovery, the mapping from source path to output path and
    the docstring insertion itself.
    """

    # File kind handled by the concrete subclass; UNKNOWN makes _get_files raise.
    FILETYPE: FileType = FileType.UNKNOWN
    # Matches a single-line ``def ...(...):`` header. Not used inside this class.
    DEF_REGEX = re.compile(r"(\bdef .*\(.*\).*:)")

    def __init__(
        self,
        model_path: Path = root_dir / "modeling" / "models" / "codet5p_220m",
        device: int = 0,
    ) -> None:
        """Load the model and tokenizer and build the summarization pipeline.

        Args:
            model_path: Directory the model weights are loaded from.
            device: Device index forwarded to ``SummarizationPipeline``. The
                default ``0`` is the value that used to be hard-coded (per the
                transformers pipeline docs, ``-1`` selects the CPU).
        """
        super().__init__()

        # The tokenizer is loaded by hub checkpoint name, not from model_path.
        # NOTE(review): AutoModelWithLMHead is deprecated in recent transformers
        # releases - confirm against the installed version before upgrading.
        self._pipeline = SummarizationPipeline(
            model=AutoModelWithLMHead.from_pretrained(model_path),
            tokenizer=AutoTokenizer.from_pretrained("Salesforce/codet5p-220m"),
            device=device
        )

    def gen_docstring(self, method: str) -> list[str]:
        """Summarize the source of one function with the pipeline.

        Args:
            method: Source code of the function to describe.

        Returns:
            A one-element list holding the pipeline's ``summary_text``. The
            list shape is kept because ``_add_docstring`` joins the elements.
        """
        return [self._pipeline([method])[0]["summary_text"]]

    def _get_files(self, path: Path):
        """Return an iterator over all files of ``FILETYPE`` below ``path``.

        Raises:
            NotImplementedError: If ``FILETYPE`` is ``FileType.UNKNOWN``.
        """
        if self.FILETYPE == FileType.UNKNOWN:
            raise NotImplementedError()

        pattern = "*.ipynb" if self.FILETYPE == FileType.IPYNB else "*.py"
        return path.rglob(pattern)

    def process_files(self, path: Path, inplace = False):
        """Process every matching file below ``path``.

        Args:
            path: Directory that is searched recursively.
            inplace: When true, each file is overwritten with its processed
                version. Otherwise the result is written to ``path / "docs"``
                under the same relative path, and files already located below
                ``path / "docs"`` are skipped.
        """
        new_base_dir = path / "docs"

        # Snapshot the listing first: the loop creates files below `path`, and
        # iterating a lazy rglob over a tree that is being modified is fragile.
        for file in list(self._get_files(path)):
            if inplace:
                self._process_file(file, file)
                continue

            # Never re-process output produced by an earlier run.
            if new_base_dir in file.parents:
                continue

            new_path = new_base_dir / file.relative_to(path)
            # parents/exist_ok: the first file seen may live several levels deep.
            new_path.parent.mkdir(parents=True, exist_ok=True)

            self._process_file(file, new_path)

    def _process_file(self, file: Path, new_path: Path):
        """Process ``file`` and write the result to ``new_path``.

        Must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

    @staticmethod
    def _escape_docstring(text: str) -> str:
        """Escape ``text`` so it can sit between triple double quotes."""
        escaped = text.replace("\\", "\\\\")
        # A trailing quote would merge with the closing delimiter.
        if escaped.endswith('"'):
            escaped = escaped[:-1] + '\\"'
        return escaped.replace('"""', '\\"\\"\\"')

    def _add_docstring(self, code: str):
        """Insert a generated docstring into every function found in ``code``.

        Functions that already have a docstring are left alone, as are
        functions whose body starts on the same line as the header (e.g.
        ``def f(): return 1``), because a docstring line cannot be inserted
        there without rewriting the statement.

        Args:
            code: Python source text.

        Returns:
            The source with docstring lines added, or ``code`` unchanged when
            it cannot be parsed as Python.
        """
        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError):
            # Not parseable as plain Python: leave the text untouched.
            return code

        lines = code.split("\n")
        insertions = []  # (1-based line to insert before, docstring line)

        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            if ast.get_docstring(node, clean=False) is not None:
                continue

            first = node.body[0]
            decorators = getattr(first, "decorator_list", [])
            # lineno of a decorated def/class points at the keyword, so the
            # statement really starts at its earliest decorator.
            start = min([first.lineno, *(d.lineno for d in decorators)])
            line = lines[start - 1]

            if decorators:
                starts_line = line.lstrip().startswith("@")
            else:
                starts_line = not line[:first.col_offset].strip()
            if not starts_line:
                continue

            # Reuse the body's own leading whitespace so tabs are preserved.
            indent = line[:len(line) - len(line.lstrip())]
            summary = f"\n{indent}".join(self.gen_docstring(ast.unparse(node)))
            insertions.append(
                (start, f'{indent}"""{self._escape_docstring(summary)}"""')
            )

        # Insert bottom-up so earlier line numbers stay valid.
        for start, docstring_line in sorted(insertions, key=lambda item: item[0], reverse=True):
            lines.insert(start - 1, docstring_line)

        return "\n".join(lines)

In [16]:
class PythonFileDocStringUtil(AbstractDocStringUtil):
    """Docstring utility for plain ``.py`` source files."""

    FILETYPE: FileType = FileType.PY

    def _process_file(self, file: Path, new_path: Path):
        """Add docstrings to one Python file.

        Args:
            file: Source file to read.
            new_path: Destination for the processed source; may equal ``file``.
        """
        # Read-only access is enough for the source. It is read completely
        # before the destination is opened, so file == new_path is safe.
        # UTF-8 is Python's default source encoding; do not rely on the locale.
        code = file.read_text(encoding="utf-8")
        new_path.write_text(self._add_docstring(code), encoding="utf-8")


# Build the .py utility (its constructor loads the model and tokenizer) and
# process every *.py file below <root_dir>/test. With inplace left at False,
# the results are written beneath <root_dir>/test/docs.
py_util = PythonFileDocStringUtil()
py_util.process_files(root_dir / "test")

Your max_length is set to 20, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 20, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


In [23]:
class IPYNBFileDocStringUtil(AbstractDocStringUtil):
    """Docstring utility for Jupyter notebook (``.ipynb``) files."""

    FILETYPE: FileType = FileType.IPYNB

    def _process_file(self, file: Path, new_path: Path):
        """Add docstrings to the code cells of one notebook.

        Args:
            file: Notebook to read.
            new_path: Destination for the processed notebook; may equal
                ``file``.
        """
        # Open read-only ("r+" needlessly required write permission) and with
        # an explicit encoding instead of the locale default.
        with file.open("r", encoding="utf-8") as f:
            print(file)
            notebook = json.load(f)

        for cell in notebook.get("cells", []):
            # Cells without a "cell_type" key are treated as code cells.
            if cell.get("cell_type", "code") != "code":
                continue

            # "source" may be a list of lines or one string; join handles both.
            # The processed source is stored back as a single string.
            cell["source"] = self._add_docstring("".join(cell.get("source", "")))

        with new_path.open("w", encoding="utf-8") as f:
            json.dump(notebook, f)
    
# Build the notebook utility and process every *.ipynb file below root_dir
# (searched recursively). With inplace left at False, the processed copies are
# written beneath <root_dir>/docs.
ipynb_util = IPYNBFileDocStringUtil()
ipynb_util.process_files(root_dir)

Your max_length is set to 20, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


/home/paul/projects/edu/master/mdl-ii/src/deployment.ipynb
/home/paul/projects/edu/master/mdl-ii/src/data_understanding.ipynb
/home/paul/projects/edu/master/mdl-ii/src/data_preperation.ipynb
/home/paul/projects/edu/master/mdl-ii/src/evaluation/metrics.ipynb


Your max_length is set to 20, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 20, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


/home/paul/projects/edu/master/mdl-ii/src/evaluation/runtime.ipynb


Your max_length is set to 20, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 20, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 20, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 20, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


/home/paul/projects/edu/master/mdl-ii/src/modeling/codetf_fine_tuning.ipynb
/home/paul/projects/edu/master/mdl-ii/src/modeling/codet5p_770m_fine_tuning.ipynb
/home/paul/projects/edu/master/mdl-ii/src/modeling/codet5p_220m_fine_tuning.ipynb
/home/paul/projects/edu/master/mdl-ii/src/test/test.ipynb


Your max_length is set to 20, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
