In [1]:
%load_ext autoreload
%autoreload 2

import json
import logging
import os
import shutil
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path

import libcst as cst
from tqdm import tqdm

from typet5.data import GitRepo
from typet5.type_env import collect_annots_info, mypy_checker
from typet5.utils import proj_root, read_file, write_file

os.chdir(proj_root())

In [3]:
# This is our own implementation to split the data depending on the split pickle file

from typet5.utils import pickle_load
from typet5.data import get_dataset_dir, get_tk_dataset_name, PreprocessArgs
import shutil

sdata = pickle_load(proj_root() / "data/repos_split.pkl")

repos_dir = get_dataset_dir("ManyTypes4Py") / "repos"


for n in ["train", "test", "valid"]:
    for i in range(0, len(sdata[n])):
        original = repos_dir / "downloaded" / sdata[n][i].authorname()
        if not original.exists():
            print(f"{original} not found. Skip.")
            continue
        shutil.move(original ,repos_dir / n / sdata[n][i].authorname() )


# sdata["valid"][0].download(repos_dir)

/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/datasets/ManyTypes4Py/repos/downloaded/tiangolo__uvicorn-gunicorn-docker not found. Skip.
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/datasets/ManyTypes4Py/repos/downloaded/Shoobx__mypy-zope not found. Skip.
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/datasets/ManyTypes4Py/repos/downloaded/vinissimus__async-asgi-testclient not found. Skip.
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/datasets/ManyTypes4Py/repos/downloaded/uwbmrb__BMRBDep not found. Skip.
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/datasets/ManyTypes4Py/repos/downloaded/noyainrain__micro not found. Skip.
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/datasets/ManyTypes4Py/repos/downloaded/scop__hashpipe not found. Skip.
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/datasets/ManyTypes4Py/repos/downloaded/nikitanovosibirsk__jj not found. Skip.
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/datasets/ManyTypes4Py/repos/downloaded/zabuldon__teslajsonpy not found. Ski

In [22]:
# Run form here: Analyzing TokenizedSrcSet.

%load_ext autoreload
%autoreload 2

import pickle

from typet5 import proj_root
from typet5.data import get_dataset_dir, get_tk_dataset_name, PreprocessArgs
import typet5.function_dataset as fd
from typet5.utils import Path, run_long_task, DefaultTokenizer, not_none
import subprocess
from typet5.utils import proj_root, get_dataroot

dataset_name = "ManyTypes4Py"
# repos_split_path = proj_root() /  "data/repos_split.pkl"
repos_dir = get_dataset_dir("ManyTypes4Py") / "repos"

recreate = False
func_only = True # whether to create functional data (for TypeT5) or chunk data (for CodeT5)
pre_args = PreprocessArgs(
    # This was for an ablation study. 
    drop_env_types=True,
    stub_in_preamble=True,
)
data_reduction = 1

tk_src_name = get_tk_dataset_name(
    dataset_name, pre_args, func_only, data_reduction=data_reduction,
)
datasets_path = get_dataroot() / "SPOT-data" / tk_src_name

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from typet5.data import create_tokenized_srcsets, load_tokenized_srcsets
if recreate or not datasets_path.exists():
    create_tokenized_srcsets(
        # proj_root() / "data/repos_split.pkl",
        proj_root() / "datasets/ManyTypes4Py/",
        datasets_path,
        func_only=func_only,
        pre_args=pre_args,
        data_reduction=data_reduction,
    )

print(tk_src_name, " token source name")    
tk_dataset = load_tokenized_srcsets(
    # proj_root() / "data/repos_split.pkl",
    proj_root() / "SPOT-data/func-ManyTypes4Py-v7-PreprocessArgs(drop_env_types=True)",
    tk_src_name,
)


func-ManyTypes4Py-v7-PreprocessArgs(drop_env_types=True)  token source name
Loading TokenizedSrcSets:  /Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/SPOT-data/func-ManyTypes4Py-v7-PreprocessArgs(drop_env_types=True)
805M	/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/SPOT-data/func-ManyTypes4Py-v7-PreprocessArgs(drop_env_types=True)


In [11]:
import plotly.express as px
from pandas import DataFrame

from typet5.utils import cumulative_counts

len_counts = [len(src.tokenized_code) for src in tk_dataset["train"].all_srcs]
xs, ys = cumulative_counts(len_counts)
px.line(
    DataFrame({"tokens_per_file": xs, "n_files": ys}), x="tokens_per_file", y="n_files"
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [12]:
print("dataset:", datasets_path)
tk_dataset["train"].print_stats()

dataset: /Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/SPOT-data/func-ManyTypes4Py-v7-PreprocessArgs(drop_env_types=True)
num_repos: 11
num_files: 11
num_lines: 474
num_labels: 31
main_tokens_per_file:
   mean: 634.64
   median: 323
   min: 57
   max: 2578
preamble_tokens_per_file:
   mean: 164.82
   median: 84
   min: 36
   max: 849
target_tks_per_file:
   mean: 9.7273
   median: 6
   min: 2
   max: 48
subset_ids: range(0, 132259, 13225)


In [13]:
print("dataset:", datasets_path)
tk_dataset["train"].print_stats()

dataset: /Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/SPOT-data/func-ManyTypes4Py-v7-PreprocessArgs(drop_env_types=True)
num_repos: 11
num_files: 11
num_lines: 474
num_labels: 31
main_tokens_per_file:
   mean: 634.64
   median: 323
   min: 57
   max: 2578
preamble_tokens_per_file:
   mean: 164.82
   median: 84
   min: 36
   max: 849
target_tks_per_file:
   mean: 9.7273
   median: 6
   min: 2
   max: 48
subset_ids: range(0, 132259, 13225)


In [14]:
from typet5.data import load_tokenized_srcsets

# tk_dataset = load_tokenized_srcsets(get_data_dir(), get_dataset_name(pre_args, func_only))

long_files=sorted(tk_dataset["train"].all_srcs, key=lambda s: len(s.tokenized_code),reverse=True)

In [15]:
print(long_files[8].preamble_code)

import copy
from deoppet.util import debug
import typing
from pynvim import Nvim
import re

class Mapping():
    ...



In [17]:
print(str(tk_dataset["train"].all_srcs[2345]))

IndexError: list index out of range

In [18]:
max(tk_dataset["train"].all_srcs, key=lambda s: len(s.tokenized_code)).print_code(500)

from typing import List, Dict, Tuple, NewType, Optional
from enum import Enum
class Sign(Enum):
   ...
class Comparison(Enum):
   ...
class Word:
   ...
class Register:
   ...
class IODevice:
   ...
class TapeUnit(IODevice):
   ...
class DiskUnit(IODevice):
   ...
class CardReader(IODevice):
   ...
class CardPunch(IODevice):
   ...
class LinePrinter(IODevice):
   ...
class TypewriterTerminal(IODevice):
   ...
class PaperTape(IODevice):
   ...
class UndefinedRegisterException(Exception):
   ...
class UndefinedCharacterException(Exception):
   ...
class MemoryLimitExceededException(Exception):
   ...
class ByteSizeException(Exception):
   ...
class Simulator:
   ...
# simulator
class Register:
    def __init__(self, num_bytes=5):...
    

# simulator
class Simulator:
    def get_field_val(self, start, end, word):...
    
    def get_field_specification(self, F):...
    
    def _get_bytes(self, value):...
    
    def _bytes_to_val(self, bs):...
    

# simulator
class Sign(Enum):
    PO

In [19]:
tk_dataset["train"].print_stats()


num_repos: 11
num_files: 11
num_lines: 474
num_labels: 31
main_tokens_per_file:
   mean: 634.64
   median: 323
   min: 57
   max: 2578
preamble_tokens_per_file:
   mean: 164.82
   median: 84
   min: 36
   max: 849
target_tks_per_file:
   mean: 9.7273
   median: 6
   min: 2
   max: 48
subset_ids: range(0, 132259, 13225)


In [20]:
def preamble_len(src):
    return len(src.preamble_code.split("\n"))


weird_src = max(tk_dataset["train"].all_srcs, key=preamble_len)


In [21]:
from typet5.data import load_tokenized_srcsets, get_dataroot

sdata_path = get_dataroot() / "TokenizedSrcSets" / "ManyTypes4Py-v5-PreprocessArgs()"

tk_dataset = load_tokenized_srcsets(sdata_path)

Loading TokenizedSrcSets:  /Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/TokenizedSrcSets/ManyTypes4Py-v5-PreprocessArgs()
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/TokenizedSrcSets/ManyTypes4Py-v5-PreprocessArgs()/test.pkl not found. Skip.
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/TokenizedSrcSets/ManyTypes4Py-v5-PreprocessArgs()/train.pkl not found. Skip.
/Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/TokenizedSrcSets/ManyTypes4Py-v5-PreprocessArgs()/valid.pkl not found. Skip.


du: /Users/iamariyap/Desktop/sem4/NLP/Final/TypeT5/TokenizedSrcSets/ManyTypes4Py-v5-PreprocessArgs(): No such file or directory


AssertionError: Empty dataset.

In [6]:
tk_dataset["test"].print_stats()

num_repos: 50
num_files: 949
num_lines: 139121
num_labels: 17740
main_tokens_per_file:
   mean: 1270.5
   median: 632
   min: 23
   max: 57953
preamble_tokens_per_file:
   mean: 103.8
   median: 67
   min: 2
   max: 1517
target_tks_per_file:
   mean: 72.285
   median: 32
   min: 2
   max: 1882
n_files_too_wide: 1
too_wide_ratio: 0.00062735
preprocess: PreprocessArgs(imports_in_preamble=True, stub_in_preamble=True, drop_comments=True, max_callees=80, max_callers=20, drop_env_types=True, add_override_usages=False)
