# 使用tree-sitter用于csharp代码分析

In [1]:
import os
import pickle
from typing import *

## 1. 安装csharp语法扩展

```bash
pip install tree-sitter-c-sharp
```

In [3]:
from tree_sitter import Language, Parser

In [4]:
import tree_sitter_c_sharp

In [5]:
import tree_sitter

In [7]:
cslang = tree_sitter.Language(tree_sitter_c_sharp.language())

parser = Parser(cslang)

# 解析 C# 代码
with open("../data/AMC/AlgorithmLib/CScanPicFunction.cs", "rb") as f:
    tree = parser.parse(f.read())

In [8]:
tree

<tree_sitter.Tree at 0x2033144bde0>

In [9]:
def traverse_tree(node, indent=0):
    print(' ' * indent + f'{node.type}')
    for child in node.children:
            traverse_tree(child, indent + 2)

In [10]:
traverse_tree(tree.root_node)

compilation_unit
  using_directive
    using
    identifier
    ;
  using_directive
    using
    identifier
    ;
  using_directive
    using
    identifier
    ;
  using_directive
    using
    identifier
    ;
  using_directive
    using
    qualified_name
      qualified_name
        identifier
        .
        identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      qualified_name
  

## 2. 解析csharp项目结构

1. 引用声明，区分外部库以及本地项目文件引用
2. 注释，包括代码文件注释以及类的docstring
3. 类以及类的成员，构建两个层级的对象`class-->attribute`之后分析以attribute为基本粒度，得到结果之后与类注释一起作为类功能分析的输入
4. 根据引用声明构建项目文件之间的依赖关系，建立图结构

### 2.1 抽取引用信息

In [11]:
# 提取引用库信息
def extract_qualified_name(node):
    parts = []
    if node.type == 'qualified_name':
        for child in node.children:
            if child.type == 'identifier':
                parts.append(child.text.decode('utf-8'))
            elif child.type == 'qualified_name':
                parts.extend(extract_qualified_name(child))
    return parts
    
# 提取引用库信息
def extract_using_directives(node):
    using_directives = []
    if f'{node.type}' == 'using_directive':
        for child in node.children:
            if f'{child.type}' == 'qualified_name':
                parts = extract_qualified_name(child)
                using_directives.append('.'.join(parts))
    for child in node.children:
        using_directives.extend(extract_using_directives(child))
    return using_directives

In [12]:

# 从根节点开始提取
using_directives = extract_using_directives(tree.root_node)

# 输出引用库的完整名称
for library in using_directives:
    print(library)

System.Collections.Generic
System.Diagnostics
System.Drawing
System.IO
System.Linq
System.Text
System.Threading
System.Threading.Tasks
System.Windows.Forms


### 2.2 抽取代码文件注释

In [13]:
def extract_file_level_comments(node, is_file_level=True):
    comments = []
    if node.type == 'comment' and is_file_level:
        comments.append(node.text.decode('utf-8'))
    elif node.type in ['class_declaration', 'method_declaration']:
        is_file_level = False
    for child in node.children:
        comments.extend(extract_file_level_comments(child, is_file_level))
    return comments

# 从根节点开始提取
file_level_comments = extract_file_level_comments(tree.root_node)

In [14]:
file_level_comments

[]

### 2.3 解析类

类，类注释，方法，方法注释

In [26]:
# 提取类、类注释、方法名、方法注释、方法的全部代码块文本以及类的所有代码文本
def extract_method_info(node):
    method_info = []
    if node.type == 'method_declaration':
        method_dict = {
            'method_name': '',
            'method_comment': '',
            'method_code': '',
            'method_prefix': '',
            'method_param': ''
        }
        method_prefix = []
        method_param = []
        method_code = []
        decode_type = 'utf-8'
        for child in node.children:
            if child.type == 'identifier':
                method_dict['method_name'] = child.text.decode(decode_type, errors="ignore")
            elif child.type == 'comment':
                method_dict['method_comment'] = child.text.decode(decode_type, errors="ignore")
            elif child.type == 'modifier':
                method_prefix.append(child.text.decode(decode_type, errors="ignore"))
            elif child.type == 'parameter_list':
                method_param.append(child.text.decode(decode_type, errors="ignore"))
            method_code.append(child.text.decode(decode_type, errors="ignore"))
        method_dict['method_prefix'] = ' '.join(method_prefix)
        method_dict['method_param'] = ' '.join(method_param)
        method_dict['method_code'] = ''.join(method_code)
        method_info.append(method_dict)
    for child in node.children:
        method_info.extend(extract_method_info(child))
    return method_info
    
def extract_class_info(node):
    class_info = []
    if node.type == 'class_declaration':
        class_dict = {
            'class_name': '',
            'class_comment': '',
            'class_code': '',
            'methods': []
        }
        decode_type = 'utf-8'
        for child in node.children:
            if child.type == 'identifier':
                class_dict['class_name'] = child.text.decode(decode_type, errors="ignore")
            elif child.type == 'comment':
                class_dict['class_comment'] = child.text.decode(decode_type, errors="ignore")
            elif child.type == 'declaration_list':
                class_dict['methods'] = extract_method_info(child)
                class_dict['class_code'] = child.text.decode(decode_type, errors="ignore")
        class_info.append(class_dict)
    for child in node.children:
        class_info.extend(extract_class_info(child))
    return class_info

# 从根节点开始提取
class_info = extract_class_info(tree.root_node)

In [27]:
class_info

[{'class_name': 'CScanPicFunction',
  'class_comment': '',
  'methods': [{'method_name': 'FindFirstPostionInFSFGate',
    'method_comment': '',
    'method_code': 'publicstaticintFindFirstPostionInFSFGate(int threshValue, int gateLengthNum, int gateStartNum, bool isPositiveThresh, byte[] ascandata,\r\n                                                      WaveDataStyle wd, int offset, int maxampvalue,double alpha=1.0, int step = 10){\r\n            // 定义循环变量\r\n            int iPos = 0;\r\n            bool bHasFound = false;\r\n\r\n\r\n            if (ascandata == null)\r\n            {\r\n                return 0;\r\n            }\r\n\r\n            int indexofCrossPoint = 0;\r\n            int iposMax = Math.Min(gateStartNum + gateLengthNum, ascandata.Length);\r\n\r\n            //int initIndexofCrossPoint = 0;//粗找的第1个交点\r\n            //bool bInitHasFound = false;\r\n            ////粗找\r\n            //if (isPositiveThresh)//正向阈值\r\n            //{\r\n\r\n            //    for (iPos 

### 2.4 项目分析

1. 从项目根目录遍历文件树，构建项目文件树
2. 结合项目结构以及各个代码片段中的`using`情况，构建项目文件引用树

In [28]:
if not os.path.isdir('../data/processed'):
    os.mkdir('../data/processed')

In [29]:
root_path = '../data/AMC/'

In [30]:
cs_files = []

In [31]:
def folder_walk(root_folder,file_lst):
    for path,folder_lst,fname_lst in os.walk(root_folder):
        for fname in fname_lst:
            if fname.endswith('.cs'):
                file_lst.append(os.path.join(path,fname))
    return file_lst

In [32]:
folder_walk(root_folder=root_path, file_lst=cs_files)

['../data/AMC/AlgorithmLib\\CScanPicFunction.cs',
 '../data/AMC/AlgorithmLib\\CScanPicFunctionGPU - 副本 (2).cs',
 '../data/AMC/AlgorithmLib\\CScanPicFunctionGPU - 副本.cs',
 '../data/AMC/AlgorithmLib\\CScanPicFunctionGPU.cs',
 '../data/AMC/AlgorithmLib\\UltrasonicalPeakDll.cs',
 '../data/AMC/AlgorithmLib\\obj\\Debug\\TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92.cs',
 '../data/AMC/AlgorithmLib\\obj\\Debug\\TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1.cs',
 '../data/AMC/AlgorithmLib\\obj\\Debug\\TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3.cs',
 '../data/AMC/AlgorithmLib\\obj\\Release\\TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92.cs',
 '../data/AMC/AlgorithmLib\\obj\\Release\\TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1.cs',
 '../data/AMC/AlgorithmLib\\obj\\Release\\TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3.cs',
 '../data/AMC/AlgorithmLib\\obj\\x64\\Debug\\TemporaryGeneratedFile_036C0B5B-1481-4323-8

In [33]:
len(cs_files)

570

In [35]:
cs_files = [i.replace('\\','/') for i in cs_files]

#### 项目分析并存储

In [37]:
cslang = tree_sitter.Language(tree_sitter_c_sharp.language())

parser = Parser(cslang)

content = []

for idx,file_path in enumerate(cs_files):
    if idx%1000==0:
        print(f'Hit:{idx}')
    with open(file_path, "rb") as f:
        ftree = parser.parse(f.read())
        
    node_dict = {
            'filename': '',
            'filepath': file_path,
            'source_reference':[],
            'source_comments': [],
            'source_class': []
        }
    node_dict['filename'] = file_path.split('/')[-1]
    node_dict['source_reference'] = extract_using_directives(ftree.root_node)
    node_dict['source_comments'] = extract_file_level_comments(ftree.root_node)
    node_dict['source_class'] = extract_class_info(ftree.root_node)
    content.append(node_dict)

Hit:0


### 保存中间结果

In [38]:
import json

In [39]:
json.dump(content, open('../data/processed/amc-by-tree-sitter.json','w'))

In [147]:
# json.dump(cs_files, open('../data/processed/project-allfile.json','w'))

# 3. 引用关联

根据文件在头部声明的引用，将其与本地文件关联

In [52]:
for node in content:
    ref_key = '.'.join(node['filepath'].split('/')[3:]).removesuffix('.cs')
    namespace_key = '.'.join(node['filepath'].split('/')[3:]).removesuffix(f".{node['filename']}")
    node['reference_key'] = ref_key
    node['namespace_key'] = namespace_key

In [53]:
ref_dict = {}
for node in content:
    ref_dict[node['reference_key']] = {k:node[k] for k in node if k!='reference_key'}

In [54]:
list(ref_dict.values())[0]

{'filename': 'CScanPicFunction.cs',
 'filepath': '../data/AMC/AlgorithmLib/CScanPicFunction.cs',
 'source_reference': ['System.Collections.Generic',
  'System.Diagnostics',
  'System.Drawing',
  'System.IO',
  'System.Linq',
  'System.Text',
  'System.Threading',
  'System.Threading.Tasks',
  'System.Windows.Forms'],
 'source_comments': [],
 'source_class': [{'class_name': 'CScanPicFunction',
   'class_comment': '',
   'methods': [{'method_name': 'FindFirstPostionInFSFGate',
     'method_comment': '',
     'method_code': 'publicstaticintFindFirstPostionInFSFGate(int threshValue, int gateLengthNum, int gateStartNum, bool isPositiveThresh, byte[] ascandata,\r\n                                                      WaveDataStyle wd, int offset, int maxampvalue,double alpha=1.0, int step = 10){\r\n            // 定义循环变量\r\n            int iPos = 0;\r\n            bool bHasFound = false;\r\n\r\n\r\n            if (ascandata == null)\r\n            {\r\n                return 0;\r\n           

In [55]:
namespace_dict = {}
for key in ref_dict:
    node = ref_dict[key]
    if node['namespace_key'] not in namespace_dict:
        namespace_dict[node['namespace_key']] = []
    namespace_dict[node['namespace_key']].append(node['filepath'])

In [56]:
for key in ref_dict:
    node = ref_dict[key]
    relocal_fpath = []
    relocal_reference_map = {}
    for k in node['source_reference']:
        k = k.removeprefix('SixLabors.')
        if k in ref_dict:
            relocal_reference_map[k] = [node['filepath']]
        if k in namespace_dict:
            relocal_reference_map[k] = namespace_dict[k]
    node['relocal_reference'] = relocal_reference_map

In [57]:
list(ref_dict.values())[0]

{'filename': 'CScanPicFunction.cs',
 'filepath': '../data/AMC/AlgorithmLib/CScanPicFunction.cs',
 'source_reference': ['System.Collections.Generic',
  'System.Diagnostics',
  'System.Drawing',
  'System.IO',
  'System.Linq',
  'System.Text',
  'System.Threading',
  'System.Threading.Tasks',
  'System.Windows.Forms'],
 'source_comments': [],
 'source_class': [{'class_name': 'CScanPicFunction',
   'class_comment': '',
   'methods': [{'method_name': 'FindFirstPostionInFSFGate',
     'method_comment': '',
     'method_code': 'publicstaticintFindFirstPostionInFSFGate(int threshValue, int gateLengthNum, int gateStartNum, bool isPositiveThresh, byte[] ascandata,\r\n                                                      WaveDataStyle wd, int offset, int maxampvalue,double alpha=1.0, int step = 10){\r\n            // 定义循环变量\r\n            int iPos = 0;\r\n            bool bHasFound = false;\r\n\r\n\r\n            if (ascandata == null)\r\n            {\r\n                return 0;\r\n           

In [58]:
cs_files

['../data/AMC/AlgorithmLib/CScanPicFunction.cs',
 '../data/AMC/AlgorithmLib/CScanPicFunctionGPU - 副本 (2).cs',
 '../data/AMC/AlgorithmLib/CScanPicFunctionGPU - 副本.cs',
 '../data/AMC/AlgorithmLib/CScanPicFunctionGPU.cs',
 '../data/AMC/AlgorithmLib/UltrasonicalPeakDll.cs',
 '../data/AMC/AlgorithmLib/obj/Debug/TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92.cs',
 '../data/AMC/AlgorithmLib/obj/Debug/TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1.cs',
 '../data/AMC/AlgorithmLib/obj/Debug/TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3.cs',
 '../data/AMC/AlgorithmLib/obj/Release/TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92.cs',
 '../data/AMC/AlgorithmLib/obj/Release/TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1.cs',
 '../data/AMC/AlgorithmLib/obj/Release/TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3.cs',
 '../data/AMC/AlgorithmLib/obj/x64/Debug/TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92.cs',
 '../

In [59]:
ref_dict['../data/AMC/AMC/Form1.cs']

KeyError: '../data/AMC/AMC/Form1.cs'

In [61]:
ref_dict.keys()

dict_keys(['AlgorithmLib.CScanPicFunction', 'AlgorithmLib.CScanPicFunctionGPU - 副本 (2)', 'AlgorithmLib.CScanPicFunctionGPU - 副本', 'AlgorithmLib.CScanPicFunctionGPU', 'AlgorithmLib.UltrasonicalPeakDll', 'AlgorithmLib.obj.Debug.TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92', 'AlgorithmLib.obj.Debug.TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1', 'AlgorithmLib.obj.Debug.TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3', 'AlgorithmLib.obj.Release.TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92', 'AlgorithmLib.obj.Release.TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1', 'AlgorithmLib.obj.Release.TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3', 'AlgorithmLib.obj.x64.Debug.TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92', 'AlgorithmLib.obj.x64.Debug.TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1', 'AlgorithmLib.obj.x64.Debug.TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3', 'Algor

## 4. 保存最终结果

In [62]:
json.dump(ref_dict, open('../data/processed/amc-by-tree-sitter.json','w'))