In [1]:
!pip install datasets transformers huggingface_hub boto3 smart_open torch tqdm




In [2]:
! pip install tree_sitter



In [3]:
!pip install tree-sitter-javascript




In [4]:
!pip install tree-sitter==0.21.3




In [9]:
!git clone https://github.com/tree-sitter/tree-sitter-javascript.git

Cloning into 'tree-sitter-javascript'...
remote: Enumerating objects: 4096, done.[K
remote: Counting objects: 100% (2017/2017), done.[K
remote: Compressing objects: 100% (403/403), done.[K
remote: Total 4096 (delta 1730), reused 1725 (delta 1612), pack-reused 2079 (from 1)[K
Receiving objects: 100% (4096/4096), 42.87 MiB | 27.22 MiB/s, done.
Resolving deltas: 100% (2618/2618), done.


In [13]:
import re
import datasets
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from multiprocessing import Pool
import os
import torch
import boto3
import smart_open
from botocore import UNSIGNED
from botocore.config import Config
from huggingface_hub import login
from tree_sitter import Language, Parser

# Configuration
CHECKPOINT = "/project/phan/codellama/StarCoder"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_WORKERS = os.cpu_count() - 1  # Adjust for your system

# Initialize Tree-Sitter
Language.build_library(
    'build/my-languages.so',
    ['tree-sitter-javascript']
)
JAVASCRIPT_LANGUAGE = Language('build/my-languages.so', 'javascript')
TREE_SITTER_PARSER = Parser()
TREE_SITTER_PARSER.set_language(JAVASCRIPT_LANGUAGE)

s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

def login_to_huggingface():
    print("Logging into Hugging Face...")
    token = "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"  # Replace with your actual token
    login(token=token)
    print("Login successful!")


def download_contents(blob_id, src_encoding):
    s3_url = f"s3://softwareheritage/content/{blob_id}"
    with smart_open.open(s3_url, "rb", compression=".gz", transport_params={"client": s3}) as fin:
        content = fin.read().decode(src_encoding)
    return content

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT).to(DEVICE)

# Tree-Sitter method extraction
def extract_js_methods(code):
    """Extract top-level JavaScript methods using Tree-Sitter."""
    tree = TREE_SITTER_PARSER.parse(bytes(code, "utf8"))
    methods = []

    # Define Tree-Sitter query for JavaScript methods
    query = JAVASCRIPT_LANGUAGE.query(
        """
        (function_declaration name: (identifier) @function.name)
        """
    )

    for match in query.captures(tree.root_node):
        methods.append(match[0].text.decode("utf8"))

    return methods

# Filtering docstrings in JavaScript
def has_docstring(code):
    """Check if the function has a docstring."""
    tree = TREE_SITTER_PARSER.parse(bytes(code, "utf8"))
    query = JAVASCRIPT_LANGUAGE.query(
        """
        (comment) @docstring
        """
    )
    return any(query.captures(tree.root_node))

# Ensure the code has a return statement
def has_return_statement(code):
    """Check if the function has a return statement."""
    tree = TREE_SITTER_PARSER.parse(bytes(code, "utf8"))
    query = JAVASCRIPT_LANGUAGE.query(
        """
        (return_statement) @return
        """
    )
    return any(query.captures(tree.root_node))

# Pre-filtering logic
def pre_filtering(example):
    code = example["content"]
    # Ensure it has at least one function
    if not extract_js_methods(code):
        return False
    # Ensure it has a docstring
    if not has_docstring(code):
        return False
    # Ensure it has a return statement
    if not has_return_statement(code):
        return False
    return True

# Prompt formatting
def format_prompt(code):
    """Generate a formatted prompt for the model based on the provided JavaScript code."""
    # Extract the first docstring from the code
    docstring = re.search(r"/\*\*(.*?)\*/", code, re.DOTALL)
    if docstring:
        docstring = docstring.group(1).strip()
    else:
        docstring = "No docstring provided."

    # Combine the function and its description into the prompt
    return f"""<issue_start>username_0: I have a JavaScript function and would like feedback on its description.

Function:
```javascript
{code}
```

Description:
{docstring}

My answer is:"""

# Load dataset
def load_dataset():
    dataset = datasets.load_dataset("bigcode/the-stack-v2-dedup", "JavaScript", split="train", streaming=True)
    data = []
    for i, sample in enumerate(dataset):
        # Fetch and decode content if metadata exists
        if "blob_id" in sample and "src_encoding" in sample:
            sample["content"] = download_contents(sample["blob_id"], sample["src_encoding"])
        data.append(sample)
        if i >= 1000:  # Limit to 1000 samples for demonstration
            break
    return datasets.Dataset.from_list(data)

# Generate responses
# Generate responses
def generate_responses(dataset):
    prompts = [format_prompt(example["content"]) for example in dataset]
    responses = []

    for prompt in tqdm(prompts, desc="Generating responses"):
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024, padding=True).to(DEVICE)
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=1050,
            num_beams=5,
            early_stopping=True
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        responses.append(response)

    return responses

# Ensure the tokenizer has a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token if not already set

# Main process
def main():
    # Login to Hugging Face
    login_to_huggingface()

    # Load and pre-filter the dataset
    print("Loading dataset...")
    dataset = load_dataset()
    print("Filtering dataset...")
    filtered_dataset = dataset.filter(pre_filtering, num_proc=NUM_WORKERS)

    print("Generating responses...")
    responses = generate_responses(filtered_dataset)

    # Save filtered dataset
    filtered_dataset = filtered_dataset.add_column("response", responses)
    filtered_dataset.save_to_disk("filtered_js_dataset")
    print("Dataset saved.")

if __name__ == "__main__":
    main()


Logging into Hugging Face...
Login successful!
Loading dataset...


Resolving data files:   0%|          | 0/757 [00:00<?, ?it/s]

Filtering dataset...


Filter (num_proc=11):   0%|          | 0/1001 [00:00<?, ? examples/s]

Generating responses...


Generating responses:   0%|          | 0/128 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:   1%|          | 1/128 [00:04<08:40,  4.10s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:   2%|▏         | 2/128 [00:06<06:47,  3.24s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:   2%|▏         | 3/128 [00:24<20:32,  9.86s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:   3%|▎         | 4/128 [00:44<28:25, 13.76s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:   4%|▍         | 5/128 [00:59<29:37, 14.45s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:   5%|▍         | 6/128 [01:02<21:12, 10.43s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:   5%|▌

Flattening the indices:   0%|          | 0/128 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/128 [00:00<?, ? examples/s]

Dataset saved.


In [16]:
from datasets import load_from_disk

# Load the saved dataset
dataset_path = "filtered_js_dataset"  # Replace with the correct dataset path
filtered_dataset = load_from_disk(dataset_path)

# Print dataset length to confirm loading
print(f"Loaded dataset with {len(filtered_dataset)} entries.")

# Define a function to find specific content
def find_function_in_dataset(dataset, keyword):
    for i, entry in enumerate(dataset):
        if keyword.lower() in entry["content"].lower():  # Case-insensitive search
            print(f"Match found at index {i}:")
            print(entry["content"])
            return entry
    print("No match found.")
    return None

# Search for a specific keyword (adjust as needed)
keyword = "public static void main"  # Replace with a part of your desired function
matched_entry = find_function_in_dataset(filtered_dataset, keyword)

if matched_entry:
    # Print the associated model response if present
    print("\nAssociated model response:")
    print(matched_entry.get("response", "No response found."))


Loaded dataset with 128 entries.
No match found.


In [17]:
# Inspect the first few entries of the dataset
for i in range(5):  # Adjust the range as needed
    print(f"Entry {i}:")
    print(filtered_dataset[i]["content"])
    print("-" * 80)


Entry 0:
$(document).ready(function() {
	loadRackDetails();
	loadStore();
});

function openModalBox() {
	console.log('openModalBox  javascript function is executed');
	$('#rackMaster').modal('show');
	$("#save_disable").attr("disabled", false);
	$("#reset_disable").attr("disabled", false);
	$("#update_disable").attr("disabled", true);

}

function loadStore() {
	var strUrl = MASTER_END_POINT.loadStores;
	
	$('#load_store_id').empty();
	console.log("loadForm Url is:" + strUrl);
	$.ajax({
		type : 'GET',
		url : strUrl,
		dataType : 'json',
		async : false,
		success : function(data) {
			var responsecode = data.responseCode;
			if (200 !== responsecode) {

			} else {
				var jsonArray = data.objControllerDto;
				var selectfirst = "<option value='0'>Select One </option>";
				$('#load_store_id').append(selectfirst);
				$.each(jsonArray, function(i, resData) {
					var store = "<option value=" + resData.countryId + ">"
							+ resData.countryName + "</option>";
					$(store).appendTo

In [19]:
from datasets import load_from_disk

# Load the saved dataset
filtered_dataset = load_from_disk("filtered_js_dataset")

# Iterate through examples to view instruction-response pairs
for i, example in enumerate(filtered_dataset):
    print(f"Entry {i}:")
    print("Instruction (Code):")
    print(example["content"])  # JavaScript code (instruction)
    print("\nResponse:")
    print(example["response"])  # Model's response (answer)
    print("-" * 80)

    # Limit to a few examples for display
    if i >= 5:
        break


Entry 0:
Instruction (Code):
$(document).ready(function() {
	loadRackDetails();
	loadStore();
});

function openModalBox() {
	console.log('openModalBox  javascript function is executed');
	$('#rackMaster').modal('show');
	$("#save_disable").attr("disabled", false);
	$("#reset_disable").attr("disabled", false);
	$("#update_disable").attr("disabled", true);

}

function loadStore() {
	var strUrl = MASTER_END_POINT.loadStores;
	
	$('#load_store_id').empty();
	console.log("loadForm Url is:" + strUrl);
	$.ajax({
		type : 'GET',
		url : strUrl,
		dataType : 'json',
		async : false,
		success : function(data) {
			var responsecode = data.responseCode;
			if (200 !== responsecode) {

			} else {
				var jsonArray = data.objControllerDto;
				var selectfirst = "<option value='0'>Select One </option>";
				$('#load_store_id').append(selectfirst);
				$.each(jsonArray, function(i, resData) {
					var store = "<option value=" + resData.countryId + ">"
							+ resData.countryName + "</option>";
		

In [20]:
print(filtered_dataset.column_names)


['blob_id', 'directory_id', 'path', 'content_id', 'detected_licenses', 'license_type', 'repo_name', 'snapshot_id', 'revision_id', 'branch_name', 'visit_date', 'revision_date', 'committer_date', 'github_id', 'star_events_count', 'fork_events_count', 'gha_license_id', 'gha_event_created_at', 'gha_created_at', 'gha_language', 'src_encoding', 'language', 'is_vendor', 'is_generated', 'length_bytes', 'extension', 'filename', 'content', 'response']


In [21]:
from datasets import load_from_disk

# Load the filtered dataset
dataset = load_from_disk("filtered_js_dataset")

# Check the number of rows
print(f"Number of entries: {len(dataset)}")

# Inspect the first few rows
for i, example in enumerate(dataset):
    print(f"Entry {i}:")
    print("Instruction (JavaScript Code):")
    print(example["content"])
    print("\nResponse (Generated Description):")
    print(example["response"])
    print("-" * 80)

    # Stop after a few examples
    if i >= 5:
        break


Number of entries: 128
Entry 0:
Instruction (JavaScript Code):
$(document).ready(function() {
	loadRackDetails();
	loadStore();
});

function openModalBox() {
	console.log('openModalBox  javascript function is executed');
	$('#rackMaster').modal('show');
	$("#save_disable").attr("disabled", false);
	$("#reset_disable").attr("disabled", false);
	$("#update_disable").attr("disabled", true);

}

function loadStore() {
	var strUrl = MASTER_END_POINT.loadStores;
	
	$('#load_store_id').empty();
	console.log("loadForm Url is:" + strUrl);
	$.ajax({
		type : 'GET',
		url : strUrl,
		dataType : 'json',
		async : false,
		success : function(data) {
			var responsecode = data.responseCode;
			if (200 !== responsecode) {

			} else {
				var jsonArray = data.objControllerDto;
				var selectfirst = "<option value='0'>Select One </option>";
				$('#load_store_id').append(selectfirst);
				$.each(jsonArray, function(i, resData) {
					var store = "<option value=" + resData.countryId + ">"
							+ res

In [22]:
# Extract relevant columns (content and response)
instruction_response_pairs = dataset.map(lambda x: {"content": x["content"], "response": x["response"]})

# Print a few pairs
for i in range(5):
    print(f"Instruction:\n{instruction_response_pairs[i]['content']}")
    print(f"Response:\n{instruction_response_pairs[i]['response']}")
    print("-" * 80)


Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Instruction:
$(document).ready(function() {
	loadRackDetails();
	loadStore();
});

function openModalBox() {
	console.log('openModalBox  javascript function is executed');
	$('#rackMaster').modal('show');
	$("#save_disable").attr("disabled", false);
	$("#reset_disable").attr("disabled", false);
	$("#update_disable").attr("disabled", true);

}

function loadStore() {
	var strUrl = MASTER_END_POINT.loadStores;
	
	$('#load_store_id').empty();
	console.log("loadForm Url is:" + strUrl);
	$.ajax({
		type : 'GET',
		url : strUrl,
		dataType : 'json',
		async : false,
		success : function(data) {
			var responsecode = data.responseCode;
			if (200 !== responsecode) {

			} else {
				var jsonArray = data.objControllerDto;
				var selectfirst = "<option value='0'>Select One </option>";
				$('#load_store_id').append(selectfirst);
				$.each(jsonArray, function(i, resData) {
					var store = "<option value=" + resData.countryId + ">"
							+ resData.countryName + "</option>";
					$(store).appe