# 使用tree-sitter用于csharp代码分析

In [1]:
import os
import pickle
from typing import *

## 1. 安装csharp语法扩展

```bash
pip install tree-sitter-c-sharp
```

In [2]:
from tree_sitter import Language, Parser

In [3]:
import tree_sitter_c_sharp

In [6]:
import tree_sitter

In [45]:
cslang = tree_sitter.Language(tree_sitter_c_sharp.language())

parser = Parser(cslang)

# 解析 C# 代码
with open("../data/ImageSharp-main/src/ImageSharp/ImageFrameCollectionExtensions.cs", "rb") as f:
    tree = parser.parse(f.read())

In [46]:
tree

<tree_sitter.Tree at 0x7f48db25bf90>

In [47]:
def traverse_tree(node, indent=0):
    print(' ' * indent + f'{node.type}')
    for child in node.children:
            traverse_tree(child, indent + 2)

In [48]:
traverse_tree(tree.root_node)

compilation_unit
  comment
  comment
  using_directive
    using
    qualified_name
      qualified_name
        identifier
        .
        identifier
      .
      identifier
    ;
  file_scoped_namespace_declaration
    namespace
    qualified_name
      identifier
      .
      identifier
    ;
  comment
  comment
  comment
  class_declaration
    modifier
      public
    modifier
      static
    class
    identifier
    declaration_list
      {
      comment
      method_declaration
        modifier
          public
        modifier
          static
        generic_name
          identifier
          type_argument_list
            <
            generic_name
              identifier
              type_argument_list
                <
                identifier
                >
            >
        identifier
        type_parameter_list
          <
          type_parameter
            identifier
          >
        parameter_list
          (
          parameter
            modif

## 2. 解析csharp项目结构

1. 引用声明，区分外部库以及本地项目文件引用
2. 注释，包括代码文件注释以及类的docstring
3. 类以及类的成员，构建两个层级的对象`class-->attribute`之后分析以attribute为基本粒度，得到结果之后与类注释一起作为类功能分析的输入
4. 根据引用声明构建项目文件之间的依赖关系，建立图结构

### 2.1 抽取引用信息

In [49]:
# 提取引用库信息
def extract_qualified_name(node):
    parts = []
    if node.type == 'qualified_name':
        for child in node.children:
            if child.type == 'identifier':
                parts.append(child.text.decode('utf-8'))
            elif child.type == 'qualified_name':
                parts.extend(extract_qualified_name(child))
    return parts
    
# 提取引用库信息
def extract_using_directives(node):
    using_directives = []
    if f'{node.type}' == 'using_directive':
        for child in node.children:
            if f'{child.type}' == 'qualified_name':
                parts = extract_qualified_name(child)
                using_directives.append('.'.join(parts))
    for child in node.children:
        using_directives.extend(extract_using_directives(child))
    return using_directives

In [50]:

# 从根节点开始提取
using_directives = extract_using_directives(tree.root_node)

# 输出引用库的完整名称
for library in using_directives:
    print(library)

SixLabors.ImageSharp.PixelFormats


### 2.2 抽取代码文件注释

In [51]:
def extract_file_level_comments(node, is_file_level=True):
    comments = []
    if node.type == 'comment' and is_file_level:
        comments.append(node.text.decode('utf-8'))
    elif node.type in ['class_declaration', 'method_declaration']:
        is_file_level = False
    for child in node.children:
        comments.extend(extract_file_level_comments(child, is_file_level))
    return comments

# 从根节点开始提取
file_level_comments = extract_file_level_comments(tree.root_node)

In [52]:
file_level_comments

['// Copyright (c) Six Labors.',
 '// Licensed under the Six Labors Split License.',
 '/// <summary>',
 '/// Extension methods for <see cref="ImageFrameCollection{TPixel}"/>.',
 '/// </summary>']

### 2.3 解析类

类，类注释，方法，方法注释

In [57]:
# 提取类、类注释、方法名、方法注释、方法的全部代码块文本以及类的所有代码文本
def extract_method_info(node):
    method_info = []
    if node.type == 'method_declaration':
        method_dict = {
            'method_name': '',
            'method_comment': '',
            'method_code': '',
            'method_prefix': '',
            'method_param': ''
        }
        method_prefix = []
        method_param = []
        method_code = []
        for child in node.children:
            if child.type == 'identifier':
                method_dict['method_name'] = child.text.decode('utf-8')
            elif child.type == 'comment':
                method_dict['method_comment'] = child.text.decode('utf-8')
            elif child.type == 'modifier':
                method_prefix.append(child.text.decode('utf-8'))
            elif child.type == 'parameter_list':
                method_param.append(child.text.decode('utf-8'))
            method_code.append(child.text.decode('utf-8'))
        method_dict['method_prefix'] = ' '.join(method_prefix)
        method_dict['method_param'] = ' '.join(method_param)
        method_dict['method_code'] = ''.join(method_code)
        method_info.append(method_dict)
    for child in node.children:
        method_info.extend(extract_method_info(child))
    return method_info
    
def extract_class_info(node):
    class_info = []
    if node.type == 'class_declaration':
        class_dict = {
            'class_name': '',
            'class_comment': '',
            'class_code': '',
            'methods': []
        }
        for child in node.children:
            if child.type == 'identifier':
                class_dict['class_name'] = child.text.decode('utf-8')
            elif child.type == 'comment':
                class_dict['class_comment'] = child.text.decode('utf-8')
            elif child.type == 'declaration_list':
                class_dict['methods'] = extract_method_info(child)
                class_dict['class_code'] = child.text.decode('utf-8')
        class_info.append(class_dict)
    for child in node.children:
        class_info.extend(extract_class_info(child))
    return class_info

# 从根节点开始提取
class_info = extract_class_info(tree.root_node)

In [58]:
class_info

[{'class_name': 'ImageFrameCollectionExtensions',
  'class_comment': '',
  'class_code': '{\n    /// <inheritdoc cref="Enumerable.AsEnumerable{TPixel}(IEnumerable{TPixel})"/>\n    public static IEnumerable<ImageFrame<TPixel>> AsEnumerable<TPixel>(this ImageFrameCollection<TPixel> source)\n        where TPixel : unmanaged, IPixel<TPixel>\n        => source;\n\n    /// <inheritdoc cref="Enumerable.Select{TPixel, TResult}(IEnumerable{TPixel}, Func{TPixel, int, TResult})"/>\n    public static IEnumerable<TResult> Select<TPixel, TResult>(this ImageFrameCollection<TPixel> source, Func<ImageFrame<TPixel>, TResult> selector)\n        where TPixel : unmanaged, IPixel<TPixel> => source.AsEnumerable().Select(selector);\n}',
  'methods': [{'method_name': 'AsEnumerable',
    'method_comment': '',
    'method_code': 'publicstaticIEnumerable<ImageFrame<TPixel>>AsEnumerable<TPixel>(this ImageFrameCollection<TPixel> source)where TPixel : unmanaged, IPixel<TPixel>=> source;',
    'method_prefix': 'publi

### 2.4 项目分析

1. 从项目根目录遍历文件树，构建项目文件树
2. 结合项目结构以及各个代码片段中的`using`情况，构建项目文件引用树

In [59]:
if not os.path.isdir('../data/processed'):
    os.mkdir('../data/processed')

In [60]:
root_path = '../data/ImageSharp-main/'

In [109]:
cs_files = []

In [110]:
def folder_walk(root_folder,file_lst):
    for path,folder_lst,fname_lst in os.walk(root_folder):
        for fname in fname_lst:
            if fname.endswith('.cs'):
                file_lst.append(os.path.join(path,fname))
    return file_lst

In [111]:
folder_walk(root_folder=root_path, file_lst=cs_files)

['../data/ImageSharp-main/src/ImageSharp/ImageFrameCollectionExtensions.cs',
 '../data/ImageSharp-main/src/ImageSharp/Image.FromStream.cs',
 '../data/ImageSharp-main/src/ImageSharp/ImageExtensions.Internal.cs',
 '../data/ImageSharp-main/src/ImageSharp/Image.cs',
 '../data/ImageSharp-main/src/ImageSharp/ReadOrigin.cs',
 '../data/ImageSharp-main/src/ImageSharp/Image.FromBytes.cs',
 '../data/ImageSharp-main/src/ImageSharp/GraphicOptionsDefaultsExtensions.cs',
 '../data/ImageSharp-main/src/ImageSharp/ImageFrameCollection.cs',
 '../data/ImageSharp-main/src/ImageSharp/ImageFrame.cs',
 '../data/ImageSharp-main/src/ImageSharp/GraphicsOptions.cs',
 '../data/ImageSharp-main/src/ImageSharp/IDeepCloneable.cs',
 '../data/ImageSharp-main/src/ImageSharp/Configuration.cs',
 '../data/ImageSharp-main/src/ImageSharp/PixelAccessor{TPixel}.cs',
 '../data/ImageSharp-main/src/ImageSharp/ImageInfo.cs',
 '../data/ImageSharp-main/src/ImageSharp/GeometryUtilities.cs',
 '../data/ImageSharp-main/src/ImageSharp/Ima

In [112]:
len(cs_files)

1926

#### 项目分析并存储

In [144]:
cslang = tree_sitter.Language(tree_sitter_c_sharp.language())

parser = Parser(cslang)

content = []

for idx,file_path in enumerate(cs_files):
    if idx%1000==0:
        print(f'Hit:{idx}')
    with open(file_path, "rb") as f:
        ftree = parser.parse(f.read())
        
    node_dict = {
            'filename': '',
            'filepath': file_path,
            'source_reference':[],
            'source_comments': [],
            'source_class': []
        }
    node_dict['filename'] = file_path.split('/')[-1]
    node_dict['source_reference'] = extract_using_directives(ftree.root_node)
    node_dict['source_comments'] = extract_file_level_comments(ftree.root_node)
    node_dict['source_class'] = extract_class_info(ftree.root_node)
    content.append(node_dict)

Hit:0
Hit:1000


### 保存中间结果

In [145]:
import json

In [146]:
json.dump(content, open('../data/processed/project-by-tree-sitter.json','w'))

In [147]:
json.dump(cs_files, open('../data/processed/project-allfile.json','w'))

# 3. 引用关联

根据文件在头部声明的引用，将其与本地文件关联

In [191]:
for node in content:
    ref_key = '.'.join(node['filepath'].split('/')[4:]).removesuffix('.cs')
    namespace_key = '.'.join(node['filepath'].split('/')[4:]).removesuffix(f".{node['filename']}")
    node['reference_key'] = ref_key
    node['namespace_key'] = namespace_key

In [196]:
ref_dict = {}
for node in content:
    ref_dict[node['reference_key']] = {k:node[k] for k in node if k!='reference_key'}

In [198]:
list(ref_dict.values())[0]

{'filename': 'ImageFrameCollectionExtensions.cs',
 'filepath': '../data/ImageSharp-main/src/ImageSharp/ImageFrameCollectionExtensions.cs',
 'source_reference': ['SixLabors.ImageSharp.PixelFormats'],
 'source_comments': ['// Copyright (c) Six Labors.',
  '// Licensed under the Six Labors Split License.',
  '/// <summary>',
  '/// Extension methods for <see cref="ImageFrameCollection{TPixel}"/>.',
  '/// </summary>'],
 'source_class': [{'class_name': 'ImageFrameCollectionExtensions',
   'class_comment': '',
   'class_code': '{\n    /// <inheritdoc cref="Enumerable.AsEnumerable{TPixel}(IEnumerable{TPixel})"/>\n    public static IEnumerable<ImageFrame<TPixel>> AsEnumerable<TPixel>(this ImageFrameCollection<TPixel> source)\n        where TPixel : unmanaged, IPixel<TPixel>\n        => source;\n\n    /// <inheritdoc cref="Enumerable.Select{TPixel, TResult}(IEnumerable{TPixel}, Func{TPixel, int, TResult})"/>\n    public static IEnumerable<TResult> Select<TPixel, TResult>(this ImageFrameCollect

In [199]:
namespace_dict = {}
for key in ref_dict:
    node = ref_dict[key]
    if node['namespace_key'] not in namespace_dict:
        namespace_dict[node['namespace_key']] = []
    namespace_dict[node['namespace_key']].append(node['filepath'])

In [200]:
for key in ref_dict:
    node = ref_dict[key]
    relocal_fpath = []
    relocal_reference_map = {}
    for k in node['source_reference']:
        k = k.removeprefix('SixLabors.')
        if k in ref_dict:
            relocal_reference_map[k] = [node['filepath']]
        if k in namespace_dict:
            relocal_reference_map[k] = namespace_dict[k]
    node['relocal_reference'] = relocal_reference_map

In [201]:
list(ref_dict.values())[0]

{'filename': 'ImageFrameCollectionExtensions.cs',
 'filepath': '../data/ImageSharp-main/src/ImageSharp/ImageFrameCollectionExtensions.cs',
 'source_reference': ['SixLabors.ImageSharp.PixelFormats'],
 'source_comments': ['// Copyright (c) Six Labors.',
  '// Licensed under the Six Labors Split License.',
  '/// <summary>',
  '/// Extension methods for <see cref="ImageFrameCollection{TPixel}"/>.',
  '/// </summary>'],
 'source_class': [{'class_name': 'ImageFrameCollectionExtensions',
   'class_comment': '',
   'class_code': '{\n    /// <inheritdoc cref="Enumerable.AsEnumerable{TPixel}(IEnumerable{TPixel})"/>\n    public static IEnumerable<ImageFrame<TPixel>> AsEnumerable<TPixel>(this ImageFrameCollection<TPixel> source)\n        where TPixel : unmanaged, IPixel<TPixel>\n        => source;\n\n    /// <inheritdoc cref="Enumerable.Select{TPixel, TResult}(IEnumerable{TPixel}, Func{TPixel, int, TResult})"/>\n    public static IEnumerable<TResult> Select<TPixel, TResult>(this ImageFrameCollect

## 4. 保存最终结果

In [203]:
json.dump(ref_dict, open('../data/processed/project-by-tree-sitter.json','w'))