# 使用tree-sitter用于csharp代码分析

In [1]:
import os
import pickle
from typing import *

## 1. 安装csharp语法扩展

```bash
pip install tree-sitter-c-sharp
```

In [2]:
from tree_sitter import Language, Parser

In [11]:
import tree_sitter_c_sharp

In [None]:
cslang = tree_sitter.Language(tree_sitter_c_sharp.language())

parser = Parser(cslang)

# 解析 C# 代码
with open("../data/ImageSharp-main/src/ImageSharp/Image.cs", "rb") as f:
    tree = parser.parse(f.read())

In [38]:
tree

<tree_sitter.Tree at 0x7f1843b029d0>

In [99]:
def traverse_tree(node, indent=0):
    print(' ' * indent + f'{node.type}')
    for child in node.children:
            traverse_tree(child, indent + 2)

In [100]:
traverse_tree(tree.root_node)

compilation_unit
  comment
  comment
  using_directive
    using
    qualified_name
      qualified_name
        identifier
        .
        identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      qualified_name
        identifier
        .
        identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      qualified_name
        identifier
        .
        identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      qualified_name
        identifier
        .
        identifier
      .
      identifier
    ;
  using_directive
    using
    qualified_name
      qualified_name
        identifier
        .
        identifier
      .
      identifier
    ;
  file_scoped_namespace_declaration
    namespace
    qualified_name
      identifier
      .
      identifier
    ;
  comment
  comment
  comment
  comment
  comment
  class_declaration
    modifier
      public
    modifier
  

## 2. 解析csharp项目结构

1. 引用声明，区分外部库以及本地项目文件引用
2. 注释，包括代码文件注释以及类的docstring
3. 类以及类的成员，构建两个层级的对象`class-->attribute`之后分析以attribute为基本粒度，得到结果之后与类注释一起作为类功能分析的输入
4. 根据引用声明构建项目文件之间的依赖关系，建立图结构

### 2.1 抽取引用信息

In [97]:
# 提取引用库信息
def extract_qualified_name(node):
    parts = []
    if node.type == 'qualified_name':
        for child in node.children:
            if child.type == 'identifier':
                parts.append(child.text.decode('utf-8'))
            elif child.type == 'qualified_name':
                parts.extend(extract_qualified_name(child))
    return parts
    
# 提取引用库信息
def extract_using_directives(node):
    using_directives = []
    if f'{node.type}' == 'using_directive':
        for child in node.children:
            if f'{child.type}' == 'qualified_name':
                parts = extract_qualified_name(child)
                using_directives.append('.'.join(parts))
    for child in node.children:
        using_directives.extend(extract_using_directives(child))
    return using_directives

In [98]:

# 从根节点开始提取
using_directives = extract_using_directives(tree.root_node)

# 输出引用库的完整名称
for library in using_directives:
    print(library)

System.Runtime.CompilerServices
SixLabors.ImageSharp.Advanced
SixLabors.ImageSharp.Formats
SixLabors.ImageSharp.Metadata
SixLabors.ImageSharp.PixelFormats


### 2.2 抽取代码文件注释

In [101]:
def extract_file_level_comments(node, is_file_level=True):
    comments = []
    if node.type == 'comment' and is_file_level:
        comments.append(node.text.decode('utf-8'))
    elif node.type in ['class_declaration', 'method_declaration']:
        is_file_level = False
    for child in node.children:
        comments.extend(extract_file_level_comments(child, is_file_level))
    return comments

# 从根节点开始提取
file_level_comments = extract_file_level_comments(tree.root_node)

In [103]:
file_level_comments

['// Copyright (c) Six Labors.',
 '// Licensed under the Six Labors Split License.',
 '/// <summary>',
 '/// Encapsulates an image, which consists of the pixel data for a graphics image and its attributes.',
 '/// For the non-generic <see cref="Image"/> type, the pixel type is only known at runtime.',
 '/// <see cref="Image"/> is always implemented by a pixel-specific <see cref="Image{TPixel}"/> instance.',
 '/// </summary>']

### 2.3 解析类

类，类注释，方法，方法注释

In [112]:
# 提取类、类注释、方法名、方法注释、方法的全部代码块文本以及类的所有代码文本
def extract_method_info(node):
    method_info = []
    if node.type == 'constructor_declaration' or node.type == 'method_declaration':
        method_dict = {
            'method_name': '',
            'method_comment': '',
            'method_code': '',
            'method_prefix': '',
            'method_param': ''
        }
        method_prefix = []
        method_param = []
        for child in node.children:
            if child.type == 'identifier':
                method_dict['method_name'] = child.text.decode('utf-8')
            elif child.type == 'comment':
                method_dict['method_comment'] = child.text.decode('utf-8')
            elif child.type == 'block':
                method_dict['method_code'] = child.text.decode('utf-8')
            elif child.type == 'modifier':
                method_prefix.append(child.text.decode('utf-8'))
            elif child.type == 'parameter_list':
                method_param.append(child.text.decode('utf-8'))
        method_dict['method_prefix'] = ' '.join(method_prefix)
        method_dict['method_param'] = ' '.join(method_param)
        method_info.append(method_dict)
    for child in node.children:
        method_info.extend(extract_method_info(child))
    return method_info
    
def extract_class_info(node):
    class_info = []
    if node.type == 'class_declaration':
        class_dict = {
            'class_name': '',
            'class_comment': '',
            'class_code': '',
            'methods': []
        }
        for child in node.children:
            if child.type == 'identifier':
                class_dict['class_name'] = child.text.decode('utf-8')
            elif child.type == 'comment':
                class_dict['class_comment'] = child.text.decode('utf-8')
            elif child.type == 'declaration_list':
                class_dict['methods'] = extract_method_info(child)
                class_dict['class_code'] = child.text.decode('utf-8')
        class_info.append(class_dict)
    for child in node.children:
        class_info.extend(extract_class_info(child))
    return class_info

# 从根节点开始提取
class_info = extract_class_info(tree.root_node)

In [113]:
class_info

[{'class_name': 'Image',
  'class_comment': '',
  'class_code': '{\n    private bool isDisposed;\n\n    /// <summary>\n    /// Initializes a new instance of the <see cref="Image"/> class.\n    /// </summary>\n    /// <param name="configuration">The global configuration..</param>\n    /// <param name="pixelType">The pixel type information.</param>\n    /// <param name="metadata">The image metadata.</param>\n    /// <param name="size">The size in px units.</param>\n    protected Image(Configuration configuration, PixelTypeInfo pixelType, ImageMetadata metadata, Size size)\n    {\n        this.Configuration = configuration;\n        this.PixelType = pixelType;\n        this.Size = size;\n        this.Metadata = metadata;\n    }\n\n    /// <summary>\n    /// Initializes a new instance of the <see cref="Image"/> class.\n    /// </summary>\n    /// <param name="configuration">The global configuration.</param>\n    /// <param name="pixelType">The <see cref="PixelTypeInfo"/>.</param>\n    /// 

### 2.4 项目分析

1. 从项目根目录遍历文件树，构建项目文件树
2. 结合项目结构以及各个代码片段中的`using`情况，构建项目文件引用树

In [142]:
if not os.path.isdir('../data/processed'):
    os.mkdir('../data/prcessed')

In [114]:
root_path = '../data/ImageSharp-main/'

In [137]:
cs_files = []

In [138]:
def folder_walk(root_folder,file_lst):
    for path,folder_lst,fname_lst in os.walk(root_folder):
        for fname in fname_lst:
            if fname.endswith('.cs'):
                file_lst.append(os.path.join(path,fname))
        for folder in folder_lst:
            folder_walk(os.path.join(path,folder), file_lst)
    return file_lst

In [139]:
cs_files = folder_walk(root_folder=root_path, file_lst=cs_files)

#### 项目分析并存储

In [147]:
cslang = tree_sitter.Language(tree_sitter_c_sharp.language())

parser = Parser(cslang)

content = []

for idx,file_path in enumerate(cs_files):
    if idx%1000==0:
        print(f'Hit:{idx}')
    with open(file_path, "rb") as f:
        ftree = parser.parse(f.read())
        
    node_dict = {
            'filename': '',
            'filepath': file_path,
            'source_reference':[],
            'source_comments': [],
            'source_class': []
        }
    node_dict['filename'] = file_path.split('/')[-1].split('.')[0]
    node_dict['source_reference'] = extract_using_directives(ftree.root_node)
    node_dict['source_comments'] = extract_file_level_comments(ftree.root_node)
    node_dict['source_class'] = extract_class_info(ftree.root_node)
    content.append(node_dict)

Hit:0
Hit:1000
Hit:2000
Hit:3000
Hit:4000
Hit:5000
Hit:6000
Hit:7000
Hit:8000
Hit:9000
Hit:10000
Hit:11000
Hit:12000
Hit:13000
Hit:14000
Hit:15000
Hit:16000
Hit:17000
Hit:18000
Hit:19000
Hit:20000
Hit:21000
Hit:22000
Hit:23000
Hit:24000
Hit:25000
Hit:26000
Hit:27000
Hit:28000
Hit:29000
Hit:30000
Hit:31000
Hit:32000
Hit:33000
Hit:34000
Hit:35000
Hit:36000
Hit:37000
Hit:38000
Hit:39000
Hit:40000
Hit:41000
Hit:42000
Hit:43000
Hit:44000
Hit:45000
Hit:46000
Hit:47000
Hit:48000
Hit:49000
Hit:50000
Hit:51000
Hit:52000


In [148]:
content[0]

{'filename': 'ImageFrameCollectionExtensions',
 'filepath': '../data/ImageSharp-main/src/ImageSharp/ImageFrameCollectionExtensions.cs',
 'source_reference': ['SixLabors.ImageSharp.PixelFormats'],
 'source_comments': ['// Copyright (c) Six Labors.',
  '// Licensed under the Six Labors Split License.',
  '/// <summary>',
  '/// Extension methods for <see cref="ImageFrameCollection{TPixel}"/>.',
  '/// </summary>'],
 'source_class': [{'class_name': 'ImageFrameCollectionExtensions',
   'class_comment': '',
   'class_code': '{\n    /// <inheritdoc cref="Enumerable.AsEnumerable{TPixel}(IEnumerable{TPixel})"/>\n    public static IEnumerable<ImageFrame<TPixel>> AsEnumerable<TPixel>(this ImageFrameCollection<TPixel> source)\n        where TPixel : unmanaged, IPixel<TPixel>\n        => source;\n\n    /// <inheritdoc cref="Enumerable.Select{TPixel, TResult}(IEnumerable{TPixel}, Func{TPixel, int, TResult})"/>\n    public static IEnumerable<TResult> Select<TPixel, TResult>(this ImageFrameCollection

In [149]:
import json

In [151]:
json.dump(content, open('../data/prcessed/cs-file-tree.json','w'))