# Step 0 - Clone Tree-sitter Language Grammars

In [1]:
!git clone https://github.com/tree-sitter/tree-sitter-java.git
!git clone https://github.com/tree-sitter/tree-sitter-python.git

fatal: destination path 'tree-sitter-java' already exists and is not an empty directory.
fatal: destination path 'tree-sitter-python' already exists and is not an empty directory.


# Step 1 - Import Modules

In [2]:
import torch
from datasets import load_dataset    
import pandas as pd

# Step 2 - Import CodeSyntaxConcept Library Modules

In [3]:
from CodeSyntaxConcept.core.parsers.tree_sitter_parser import TreeSitterParser
from CodeSyntaxConcept.core.data.code_search_net import CodeSearchNet

# Step 3 - Split CodeSearchNet test sets by size

In [4]:
# Load the CodeSearchNet test split. The "all" configuration is named
# explicitly; it is the configuration the loader reports defaulting to when
# none is given, so the selected data is unchanged.
codesearchnet_test = load_dataset("code_search_net", "all", split='test')

# Split the test split into the three size buckets provided by the project helper.
testset_small, testset_medium, testset_large = CodeSearchNet.get_test_sets(codesearchnet_test)


def filter_by_language(testset, language):
    """Return the samples of `testset` whose 'language' field equals `language`.

    Args:
        testset: a dataset object exposing `.filter(predicate)`.
        language: value to match against each sample's 'language' field.

    Returns:
        The result of `testset.filter(...)` restricted to that language.
    """
    return testset.filter(lambda sample: sample['language'] == language)


# One subset per (size bucket, language) pair; names are kept as-is because
# later cells reference them directly.
testset_small_python = filter_by_language(testset_small, 'python')
testset_medium_python = filter_by_language(testset_medium, 'python')
testset_large_python = filter_by_language(testset_large, 'python')

testset_small_java = filter_by_language(testset_small, 'java')
testset_medium_java = filter_by_language(testset_medium, 'java')
testset_large_java = filter_by_language(testset_large, 'java')

No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/root/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)
Loading cached processed dataset at /root/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27/cache-b24d5e81f353d222.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27/cache-b82b7038276ca520.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27/cache-4085f3a256d6c6ae.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27/cache-f535d56e9ed39a2b.arrow
Loading cached proce

# Step 4 - Context Analysis - Method Level

In [5]:
# Method-level analysis: inspect the first function of the small Python test set.
first_sample = testset_small_python[0]
function_source = first_sample['whole_func_string']

# Count token / node-type / parent-node-type frequencies for this one function.
token_counts, node_type_counts, parent_node_type_counts = CodeSearchNet.count_source_code_ast_type_frequency(
    function_source,
    first_sample['language'],
)

# Convert the three count collections into DataFrames.
(
    df_total_token_counts,
    df_total_node_type_counts,
    df_total_parent_node_type_counts,
) = CodeSearchNet.transform_code_counts_to_dataframe(
    token_counts,
    node_type_counts,
    parent_node_type_counts,
)

# Post-process each frame with the project's averaging helper.
df_total_token_counts = CodeSearchNet.add_count_average(df_total_token_counts)
df_total_node_type_counts = CodeSearchNet.add_count_average(df_total_node_type_counts)
df_total_parent_node_type_counts = CodeSearchNet.add_count_average(df_total_parent_node_type_counts)

# Print the function source followed by each table, every section preceded by a divider line.
divider = "---------------------------------------------------------------------------------"
for section in (
    function_source,
    df_total_token_counts,
    df_total_node_type_counts,
    df_total_parent_node_type_counts,
):
    print(divider)
    print(section)

---------------------------------------------------------------------------------
                                          token  counts        avg
0                                             (       8  14.814815
1                                             )       8  14.814815
2                                             ,       7  12.962963
3                                           url       7  12.962963
4                                            or       5   9.259259
5                                        match1       4   7.407407
6                             parse_query_param       3   5.555556
7                                           'v'       2   3.703704
8                r'youtube\.com/watch/([^/?]+)'       1   1.851852
9                    r'youtube\.com/v/([^/?]+)'       1   1.851852
10               r'youtube\.com/embed/([^/?]+)'       1   1.851852
11                                          def       1   1.851852
12                        r'youtu\.be/([^?/]+)'

# Step 5 - Context Analysis - Dataset Level

In [None]:
# Dataset-level analysis: aggregate frequencies over the whole small Python test set.
(
    df_total_token_counts,
    df_total_node_type_counts,
    df_total_parent_node_type_counts,
) = CodeSearchNet.count_ast_type_frequency(testset_small_python)

# Post-process each frame with the project's averaging helper.
df_total_token_counts = CodeSearchNet.add_count_average(df_total_token_counts)
df_total_node_type_counts = CodeSearchNet.add_count_average(df_total_node_type_counts)
df_total_parent_node_type_counts = CodeSearchNet.add_count_average(df_total_parent_node_type_counts)

# Print each table preceded by a divider line.
divider = "---------------------------------------------------------------------------------"
for frequency_table in (
    df_total_token_counts,
    df_total_node_type_counts,
    df_total_parent_node_type_counts,
):
    print(divider)
    print(frequency_table)