# C++ Code to AST Dataset Generation for CodeBERT

**Objective**: Convert all C++ files from the Plagiarism Dataset into Abstract Syntax Trees (AST) for CodeBERT fine-tuning.

**Input**: Programming Homework Dataset for Plagiarism Detection
**Output**: Structured AST dataset ready for machine learning applications

## Process Overview:
1. Environment setup and library imports
2. Dataset analysis and C++ file collection
3. AST parser implementation
4. Batch processing system
5. Dataset generation and export

In [10]:
# Environment Setup and Library Imports

import os
import json
import pickle
import time
import re
from pathlib import Path
from collections import defaultdict, Counter
from datetime import datetime
from typing import List, Dict, Optional, Tuple, Any

import pandas as pd
import numpy as np
from tqdm import tqdm

# AST parsing libraries
from pycparser import c_parser, c_ast
from pycparser.plyparser import ParseError

print("Environment setup completed")
print(f"Working directory: {Path.cwd()}")

Environment setup completed
Working directory: /Users/onis2/NLP/TestVersion


## Dataset Configuration and Analysis

Define dataset paths and analyze the structure to identify all C++ files for processing.

In [11]:
# Dataset Configuration
DATASET_ROOT = Path("/Users/onis2/Downloads/Plagiarism Dataset")
SRC_PATH = DATASET_ROOT / "src"
OUTPUT_DIR = Path("/Users/onis2/NLP/TestVersion/cpp_ast_dataset")

# Create output directory
OUTPUT_DIR.mkdir(exist_ok=True)

class DatasetAnalyzer:
    """Analyze dataset structure and collect C++ files."""
    
    def __init__(self, src_path: Path):
        self.src_path = src_path
        self.courses = []
        self.cpp_files = []
        
    def analyze_structure(self) -> Dict[str, Any]:
        """Analyze dataset structure and collect statistics."""
        print("Analyzing dataset structure...")
        
        # Get all courses
        self.courses = sorted([d.name for d in self.src_path.iterdir() if d.is_dir()])
        
        # Collect all C++ files
        cpp_count = 0
        course_stats = {}
        
        for course in self.courses:
            course_path = self.src_path / course
            course_cpp_files = []
            
            for assignment_folder in course_path.iterdir():
                if not assignment_folder.is_dir() or not assignment_folder.name.startswith('Z'):
                    continue
                    
                for sub_assignment in assignment_folder.iterdir():
                    if not sub_assignment.is_dir():
                        continue
                    
                    # Find all .cpp files
                    cpp_files_in_assignment = list(sub_assignment.glob("*.cpp"))
                    course_cpp_files.extend(cpp_files_in_assignment)
                    
                    for cpp_file in cpp_files_in_assignment:
                        file_info = {
                            'path': cpp_file,
                            'course': course,
                            'assignment': f"{assignment_folder.name}/{sub_assignment.name}",
                            'student_id': cpp_file.stem,
                            'relative_path': str(cpp_file.relative_to(self.src_path))
                        }
                        self.cpp_files.append(file_info)
            
            course_stats[course] = len(course_cpp_files)
            cpp_count += len(course_cpp_files)
        
        stats = {
            'total_courses': len(self.courses),
            'courses': self.courses,
            'total_cpp_files': cpp_count,
            'files_per_course': course_stats,
            'cpp_files': self.cpp_files
        }
        
        return stats

# Initialize analyzer and collect C++ files
analyzer = DatasetAnalyzer(SRC_PATH)
dataset_stats = analyzer.analyze_structure()

print("Dataset Analysis Results:")
print(f"Total courses: {dataset_stats['total_courses']}")
print(f"Total C++ files: {dataset_stats['total_cpp_files']}")
print("\nFiles per course:")
for course, count in dataset_stats['files_per_course'].items():
    print(f"  {course}: {count:,} files")

# Save file list for reference
cpp_files_list = [
    {
        'course': f['course'],
        'assignment': f['assignment'], 
        'student_id': f['student_id'],
        'path': str(f['path'])
    } 
    for f in dataset_stats['cpp_files']
]

with open(OUTPUT_DIR / "cpp_files_inventory.json", 'w') as f:
    json.dump(cpp_files_list, f, indent=2)

print(f"\nFile inventory saved to: {OUTPUT_DIR / 'cpp_files_inventory.json'}")

Analyzing dataset structure...
Dataset Analysis Results:
Total courses: 4
Total C++ files: 23586

Files per course:
  A2016: 0 files
  A2017: 0 files
  B2016: 12,196 files
  B2017: 11,390 files

File inventory saved to: /Users/onis2/NLP/TestVersion/cpp_ast_dataset/cpp_files_inventory.json

File inventory saved to: /Users/onis2/NLP/TestVersion/cpp_ast_dataset/cpp_files_inventory.json


## AST Node and Parser Implementation

Core classes for AST representation and C++ code parsing.

In [12]:
class ASTNode:
    """Represents a node in the Abstract Syntax Tree."""
    
    def __init__(self, node_type: str, value: Optional[str] = None, children: Optional[List['ASTNode']] = None):
        self.node_type = node_type
        self.value = value
        self.children = children or []
    
    def to_dict(self) -> Dict[str, Any]:
        """Convert AST node to dictionary representation."""
        return {
            'type': self.node_type,
            'value': self.value,
            'children': [child.to_dict() for child in self.children]
        }
    
    def to_sequence(self) -> List[str]:
        """Convert AST to flat sequence representation for CodeBERT."""
        sequence = [f"<{self.node_type}>"]
        if self.value:
            sequence.append(str(self.value))
        
        for child in self.children:
            sequence.extend(child.to_sequence())
        
        sequence.append(f"</{self.node_type}>")
        return sequence
    
    def extract_features(self) -> Dict[str, Any]:
        """Extract structural features from AST."""
        features = {
            'total_nodes': 0,
            'node_types': defaultdict(int),
            'max_depth': 0,
            'identifiers': set(),
            'literals': set(),
            'operators': set()
        }
        
        def traverse(node: 'ASTNode', depth: int = 0):
            features['total_nodes'] += 1
            features['node_types'][node.node_type] += 1
            features['max_depth'] = max(features['max_depth'], depth)
            
            if node.value:
                if node.node_type in ['ID', 'Identifier']:
                    features['identifiers'].add(node.value)
                elif node.node_type in ['Constant', 'literal']:
                    features['literals'].add(node.value)
                elif node.node_type in ['BinaryOp', 'UnaryOp', 'Assignment']:
                    features['operators'].add(node.value)
            
            for child in node.children:
                traverse(child, depth + 1)
        
        traverse(self)
        
        # Convert sets to lists for JSON serialization
        features['identifiers'] = list(features['identifiers'])
        features['literals'] = list(features['literals'])
        features['operators'] = list(features['operators'])
        features['node_types'] = dict(features['node_types'])
        
        return features


class CppASTParser:
    """Enhanced C++ AST Parser with preprocessing capabilities."""
    
    def __init__(self):
        self.parser = c_parser.CParser()
        self.preprocessing_stats = {'successful': 0, 'failed': 0}
    
    def preprocess_cpp_code(self, code: str) -> str:
        """Preprocess C++ code to handle includes and common constructs."""
        # Remove includes and preprocessor directives
        processed_code = re.sub(r'#include\s*[<"][^>"]*[>"]', '', code)
        processed_code = re.sub(r'#ifndef.*?#endif', '', processed_code, flags=re.DOTALL)
        processed_code = re.sub(r'#define.*?\n', '', processed_code)
        processed_code = re.sub(r'#pragma.*?\n', '', processed_code)
        
        # Add basic type definitions and function declarations
        declarations = '''
typedef long size_t;
typedef struct FILE FILE;
extern FILE *stdin, *stdout, *stderr;
int printf(const char *format, ...);
int scanf(const char *format, ...);
void *malloc(size_t size);
void free(void *ptr);
int strcmp(const char *s1, const char *s2);
size_t strlen(const char *s);
        '''
        
        return declarations + processed_code
    
    def parse_code(self, code: str, filename: str = "<string>") -> Optional[ASTNode]:
        """Parse C++ code and return AST representation."""
        try:
            processed_code = self.preprocess_cpp_code(code)
            ast = self.parser.parse(processed_code, filename=filename)
            return self._convert_pycparser_ast(ast)
        except ParseError as e:
            self.preprocessing_stats['failed'] += 1
            return None
        except Exception as e:
            self.preprocessing_stats['failed'] += 1
            return None
    
    def _convert_pycparser_ast(self, node) -> Optional[ASTNode]:
        """Convert pycparser AST to custom ASTNode format."""
        if node is None:
            return None
        
        node_type = node.__class__.__name__
        
        # Extract node value
        value = None
        if hasattr(node, 'name') and node.name:
            value = node.name
        elif hasattr(node, 'value') and node.value:
            value = node.value
        elif hasattr(node, 'op') and node.op:
            value = node.op
        
        # Convert children
        children = []
        for attr_name, attr_value in node.children():
            if attr_value:
                if isinstance(attr_value, list):
                    for item in attr_value:
                        converted_child = self._convert_pycparser_ast(item)
                        if converted_child:
                            children.append(converted_child)
                else:
                    converted_child = self._convert_pycparser_ast(attr_value)
                    if converted_child:
                        children.append(converted_child)
        
        return ASTNode(node_type, value, children)
    
    def get_stats(self) -> Dict[str, int]:
        """Get preprocessing statistics."""
        return self.preprocessing_stats.copy()


# Initialize parser
cpp_parser = CppASTParser()
print("C++ AST Parser initialized")

C++ AST Parser initialized


## Batch Processing System

High-performance batch processor for converting all C++ files to AST representations.

In [13]:
class CppASTProcessor:
    """Batch processor for converting C++ files to AST representations."""
    
    def __init__(self, output_dir: Path):
        self.output_dir = output_dir
        self.parser = CppASTParser()
        self.processing_stats = {
            'total_files': 0,
            'successful_parses': 0,
            'failed_parses': 0,
            'start_time': None,
            'end_time': None,
            'processing_times': [],
            'file_sizes': [],
            'ast_sizes': []
        }
    
    def process_file(self, file_info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a single C++ file and return AST data."""
        start_time = time.time()
        
        try:
            file_path = Path(file_info['path'])
            
            # Read source code
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                source_code = f.read()
            
            # Parse to AST
            ast_root = self.parser.parse_code(source_code, str(file_path))
            
            if ast_root is None:
                return None
            
            # Extract features and sequence
            ast_features = ast_root.extract_features()
            ast_sequence = ast_root.to_sequence()
            
            processing_time = time.time() - start_time
            
            result = {
                'file_info': {
                    'course': file_info['course'],
                    'assignment': file_info['assignment'],
                    'student_id': file_info['student_id'],
                    'relative_path': file_info['relative_path']
                },
                'source_code': source_code,
                'ast_features': ast_features,
                'ast_sequence': ast_sequence,
                'processing_time': processing_time,
                'timestamp': datetime.now().isoformat()
            }
            
            # Update statistics
            self.processing_stats['file_sizes'].append(len(source_code))
            self.processing_stats['ast_sizes'].append(len(ast_sequence))
            self.processing_stats['processing_times'].append(processing_time)
            
            return result
            
        except Exception as e:
            print(f"Error processing {file_info['relative_path']}: {str(e)}")
            return None
    
    def process_batch(self, cpp_files: List[Dict[str, Any]], batch_size: int = 1000) -> List[Dict[str, Any]]:
        """Process a batch of C++ files."""
        print(f"Starting batch processing of {len(cpp_files)} C++ files...")
        
        self.processing_stats['total_files'] = len(cpp_files)
        self.processing_stats['start_time'] = datetime.now()
        
        results = []
        failed_files = []
        
        # Process files with progress bar
        for file_info in tqdm(cpp_files, desc="Processing C++ files"):
            result = self.process_file(file_info)
            
            if result:
                results.append(result)
                self.processing_stats['successful_parses'] += 1
            else:
                failed_files.append(file_info)
                self.processing_stats['failed_parses'] += 1
        
        self.processing_stats['end_time'] = datetime.now()
        
        # Save results in batches to manage memory
        self._save_results(results, failed_files)
        
        return results
    
    def _save_results(self, results: List[Dict[str, Any]], failed_files: List[Dict[str, Any]]):
        """Save processing results to files."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save successful results
        results_file = self.output_dir / f"cpp_ast_dataset_{timestamp}.pkl"
        with open(results_file, 'wb') as f:
            pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
        
        # Save metadata
        metadata = {
            'total_files': len(results),
            'processing_stats': self.processing_stats,
            'parser_stats': self.parser.get_stats(),
            'timestamp': timestamp
        }
        
        metadata_file = self.output_dir / f"metadata_{timestamp}.json"
        with open(metadata_file, 'w') as f:
            # Convert datetime objects to strings for JSON serialization
            metadata_copy = metadata.copy()
            if metadata_copy['processing_stats']['start_time']:
                metadata_copy['processing_stats']['start_time'] = str(metadata_copy['processing_stats']['start_time'])
            if metadata_copy['processing_stats']['end_time']:
                metadata_copy['processing_stats']['end_time'] = str(metadata_copy['processing_stats']['end_time'])
            json.dump(metadata_copy, f, indent=2)
        
        # Save sample data for inspection
        sample_size = min(5, len(results))
        if sample_size > 0:
            sample_data = []
            for result in results[:sample_size]:
                sample = {
                    'file_info': result['file_info'],
                    'ast_features': result['ast_features'],
                    'ast_sequence_length': len(result['ast_sequence']),
                    'ast_sequence_sample': result['ast_sequence'][:20],
                    'processing_time': result['processing_time']
                }
                sample_data.append(sample)
            
            sample_file = self.output_dir / f"sample_results_{timestamp}.json"
            with open(sample_file, 'w') as f:
                json.dump(sample_data, f, indent=2)
        
        # Save failed files list
        if failed_files:
            failed_file = self.output_dir / f"failed_files_{timestamp}.txt"
            with open(failed_file, 'w') as f:
                for file_info in failed_files:
                    f.write(f"{file_info['relative_path']}\\n")
        
        print(f"Results saved:")
        print(f"  Main dataset: {results_file}")
        print(f"  Metadata: {metadata_file}")
        if sample_size > 0:
            print(f"  Sample data: {sample_file}")
        if failed_files:
            print(f"  Failed files: {failed_file}")
    
    def print_summary(self):
        """Print processing summary."""
        stats = self.processing_stats
        
        # Handle duration calculation properly
        duration = None
        if stats['end_time'] and stats['start_time']:
            if isinstance(stats['end_time'], str):
                # Convert string back to datetime for calculation
                from datetime import datetime
                try:
                    end_time = datetime.fromisoformat(stats['end_time'].replace('Z', '+00:00'))
                    start_time = datetime.fromisoformat(stats['start_time'].replace('Z', '+00:00'))
                    duration = end_time - start_time
                except:
                    duration = None
            else:
                # Already datetime objects
                duration = stats['end_time'] - stats['start_time']
        
        print("\\nProcessing Summary:")
        print(f"Total files processed: {stats['total_files']}")
        print(f"Successful parses: {stats['successful_parses']}")
        print(f"Failed parses: {stats['failed_parses']}")
        
        if stats['total_files'] > 0:
            success_rate = (stats['successful_parses'] / stats['total_files']) * 100
            print(f"Success rate: {success_rate:.1f}%")
        
        if duration:
            print(f"Processing duration: {duration}")
        
        if stats['processing_times']:
            avg_time = np.mean(stats['processing_times'])
            print(f"Average processing time: {avg_time:.3f}s per file")
        
        if stats['file_sizes']:
            avg_size = np.mean(stats['file_sizes'])
            print(f"Average file size: {avg_size:.0f} characters")
        
        if stats['ast_sizes']:
            avg_ast_size = np.mean(stats['ast_sizes'])
            print(f"Average AST sequence length: {avg_ast_size:.0f} tokens")


# Initialize processor
processor = CppASTProcessor(OUTPUT_DIR)
print("C++ AST Processor initialized")

C++ AST Processor initialized


## Dataset Generation and Processing

Execute the full pipeline to convert all C++ files to AST dataset.

In [14]:
# Execute full dataset processing pipeline

print("Starting C++ to AST dataset generation...")
print(f"Processing {len(dataset_stats['cpp_files'])} C++ files")
print(f"Output directory: {OUTPUT_DIR}")

# Process all C++ files
results = processor.process_batch(dataset_stats['cpp_files'])

# Print processing summary
processor.print_summary()

print(f"\\nDataset generation completed!")
print(f"Generated AST representations for {len(results)} C++ files")
print(f"Results saved in: {OUTPUT_DIR}")

Starting C++ to AST dataset generation...
Processing 23586 C++ files
Output directory: /Users/onis2/NLP/TestVersion/cpp_ast_dataset
Starting batch processing of 23586 C++ files...


Processing C++ files: 100%|██████████| 23586/23586 [00:18<00:00, 1264.48it/s]

Results saved:
  Main dataset: /Users/onis2/NLP/TestVersion/cpp_ast_dataset/cpp_ast_dataset_20250921_164042.pkl
  Metadata: /Users/onis2/NLP/TestVersion/cpp_ast_dataset/metadata_20250921_164042.json
  Sample data: /Users/onis2/NLP/TestVersion/cpp_ast_dataset/sample_results_20250921_164042.json
  Failed files: /Users/onis2/NLP/TestVersion/cpp_ast_dataset/failed_files_20250921_164042.txt
\nProcessing Summary:
Total files processed: 23586
Successful parses: 172
Failed parses: 23414
Success rate: 0.7%
Processing duration: 0:00:18.653771
Average processing time: 0.001s per file
Average file size: 27 characters
Average AST sequence length: 193 tokens
\nDataset generation completed!
Generated AST representations for 172 C++ files
Results saved in: /Users/onis2/NLP/TestVersion/cpp_ast_dataset





## CodeBERT Dataset Preparation

Prepare the AST dataset for CodeBERT training with proper formatting and structure.

In [15]:
class CodeBERTDatasetFormatter:
    """Format AST dataset for CodeBERT training."""
    
    def __init__(self, output_dir: Path):
        self.output_dir = output_dir
    
    def format_for_codebert(self, ast_results: List[Dict[str, Any]], max_sequence_length: int = 512) -> List[Dict[str, Any]]:
        """Format AST data for CodeBERT input."""
        formatted_data = []
        
        print(f"Formatting {len(ast_results)} AST results for CodeBERT...")
        
        for result in tqdm(ast_results, desc="Formatting for CodeBERT"):
            # Truncate AST sequence to fit model constraints
            ast_sequence = result['ast_sequence'][:max_sequence_length]
            
            # Create CodeBERT-compatible format
            formatted_entry = {
                'id': f"{result['file_info']['course']}_{result['file_info']['assignment']}_{result['file_info']['student_id']}",
                'text': ' '.join(ast_sequence),
                'ast_sequence': ast_sequence,
                'metadata': {
                    'course': result['file_info']['course'],
                    'assignment': result['file_info']['assignment'],
                    'student_id': result['file_info']['student_id'],
                    'ast_features': result['ast_features'],
                    'original_sequence_length': len(result['ast_sequence']),
                    'truncated': len(result['ast_sequence']) > max_sequence_length,
                    'processing_time': result['processing_time']
                }
            }
            
            formatted_data.append(formatted_entry)
        
        return formatted_data
    
    def save_codebert_dataset(self, formatted_data: List[Dict[str, Any]]):
        """Save formatted dataset for CodeBERT."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save complete dataset
        dataset_file = self.output_dir / f"codebert_cpp_dataset_{timestamp}.json"
        with open(dataset_file, 'w') as f:
            json.dump(formatted_data, f, indent=2)
        
        # Create training data CSV for easy loading
        csv_data = []
        for entry in formatted_data:
            csv_data.append({
                'id': entry['id'],
                'text': entry['text'],
                'course': entry['metadata']['course'],
                'assignment': entry['metadata']['assignment'],
                'student_id': entry['metadata']['student_id'],
                'ast_nodes': entry['metadata']['ast_features']['total_nodes'],
                'ast_depth': entry['metadata']['ast_features']['max_depth'],
                'sequence_length': len(entry['ast_sequence'])
            })
        
        csv_file = self.output_dir / f"codebert_cpp_dataset_{timestamp}.csv"
        df = pd.DataFrame(csv_data)
        df.to_csv(csv_file, index=False)
        
        # Generate summary statistics
        summary = {
            'total_samples': len(formatted_data),
            'courses': list(df['course'].unique()),
            'assignments_per_course': df.groupby('course')['assignment'].nunique().to_dict(),
            'avg_sequence_length': df['sequence_length'].mean(),
            'sequence_length_stats': {
                'min': int(df['sequence_length'].min()),
                'max': int(df['sequence_length'].max()),
                'mean': float(df['sequence_length'].mean()),
                'std': float(df['sequence_length'].std())
            },
            'avg_ast_nodes': df['ast_nodes'].mean(),
            'avg_ast_depth': df['ast_depth'].mean(),
            'generation_timestamp': timestamp
        }
        
        summary_file = self.output_dir / f"dataset_summary_{timestamp}.json"
        with open(summary_file, 'w') as f:
            json.dump(summary, f, indent=2)
        
        print(f"CodeBERT dataset saved:")
        print(f"  JSON format: {dataset_file}")
        print(f"  CSV format: {csv_file}")
        print(f"  Summary: {summary_file}")
        
        return summary

# Format dataset for CodeBERT (only if results were generated)
if 'results' in locals() and results:
    formatter = CodeBERTDatasetFormatter(OUTPUT_DIR)
    codebert_data = formatter.format_for_codebert(results, max_sequence_length=512)
    dataset_summary = formatter.save_codebert_dataset(codebert_data)
    
    print("\\nDataset Summary:")
    print(f"Total samples: {dataset_summary['total_samples']}")
    print(f"Courses: {', '.join(dataset_summary['courses'])}")
    print(f"Average sequence length: {dataset_summary['avg_sequence_length']:.1f}")
    print(f"Average AST nodes: {dataset_summary['avg_ast_nodes']:.1f}")
    print(f"Average AST depth: {dataset_summary['avg_ast_depth']:.1f}")
else:
    print("No results available for formatting. Please run the processing step first.")

Formatting 172 AST results for CodeBERT...


Formatting for CodeBERT: 100%|██████████| 172/172 [00:00<00:00, 305945.84it/s]

CodeBERT dataset saved:
  JSON format: /Users/onis2/NLP/TestVersion/cpp_ast_dataset/codebert_cpp_dataset_20250921_164042.json
  CSV format: /Users/onis2/NLP/TestVersion/cpp_ast_dataset/codebert_cpp_dataset_20250921_164042.csv
  Summary: /Users/onis2/NLP/TestVersion/cpp_ast_dataset/dataset_summary_20250921_164042.json
\nDataset Summary:
Total samples: 172
Courses: B2016, B2017
Average sequence length: 189.1
Average AST nodes: 85.4
Average AST depth: 7.1





## Dataset Validation and Analysis

Validate the generated dataset and provide comprehensive analysis.

In [16]:
class DatasetValidator:
    def __init__(self, dataset_dir):
        self.dataset_dir = dataset_dir
        self.validation_results = {}
    
    def validate_dataset(self):
        """Validate the generated dataset"""
        print(f"Validating dataset in: {self.dataset_dir}")
        print("="*60)
        
        # Check dataset structure
        self._validate_structure()
        self._validate_files()
        self._validate_ast_quality()
        self._generate_statistics()
        
        return self.validation_results
    
    def _validate_structure(self):
        """Validate dataset directory structure"""
        print("Validating dataset structure...")
        
        required_files = ['metadata.json', 'samples.json']
        required_dirs = ['individual_asts', 'processed_files']
        
        structure_valid = True
        
        # Check required files
        for file in required_files:
            file_path = os.path.join(self.dataset_dir, file)
            if os.path.exists(file_path):
                print(f"✓ {file} exists")
            else:
                print(f"✗ {file} missing")
                structure_valid = False
        
        # Check required directories
        for dir_name in required_dirs:
            dir_path = os.path.join(self.dataset_dir, dir_name)
            if os.path.exists(dir_path):
                file_count = len(os.listdir(dir_path))
                print(f"✓ {dir_name} exists ({file_count} files)")
            else:
                print(f"✗ {dir_name} missing")
                structure_valid = False
        
        self.validation_results['structure_valid'] = structure_valid
        print()
    
    def _validate_files(self):
        """Validate individual files"""
        print("Validating individual files...")
        
        metadata_path = os.path.join(self.dataset_dir, 'metadata.json')
        samples_path = os.path.join(self.dataset_dir, 'samples.json')
        
        file_validation = {}
        
        # Validate metadata.json
        if os.path.exists(metadata_path):
            try:
                with open(metadata_path, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)
                    required_keys = ['total_files', 'successful_conversions', 'success_rate', 'created_at']
                    
                    metadata_valid = all(key in metadata for key in required_keys)
                    file_validation['metadata'] = {
                        'valid': metadata_valid,
                        'content': metadata
                    }
                    print(f"✓ metadata.json is valid")
            except Exception as e:
                file_validation['metadata'] = {'valid': False, 'error': str(e)}
                print(f"✗ metadata.json validation failed: {e}")
        
        # Validate samples.json
        if os.path.exists(samples_path):
            try:
                with open(samples_path, 'r', encoding='utf-8') as f:
                    samples = json.load(f)
                    
                    samples_valid = isinstance(samples, list) and len(samples) > 0
                    if samples_valid and len(samples) > 0:
                        # Check first sample structure
                        first_sample = samples[0]
                        required_keys = ['file_path', 'file_size', 'ast_sequence']
                        samples_valid = all(key in first_sample for key in required_keys)
                    
                    file_validation['samples'] = {
                        'valid': samples_valid,
                        'count': len(samples) if isinstance(samples, list) else 0
                    }
                    print(f"✓ samples.json is valid ({len(samples)} samples)")
            except Exception as e:
                file_validation['samples'] = {'valid': False, 'error': str(e)}
                print(f"✗ samples.json validation failed: {e}")
        
        self.validation_results['file_validation'] = file_validation
        print()
    
    def _validate_ast_quality(self):
        """Validate AST quality"""
        print("Validating AST quality...")
        
        samples_path = os.path.join(self.dataset_dir, 'samples.json')
        
        if not os.path.exists(samples_path):
            print("✗ Cannot validate AST quality - samples.json not found")
            return
        
        try:
            with open(samples_path, 'r', encoding='utf-8') as f:
                samples = json.load(f)
            
            quality_metrics = {
                'total_samples': len(samples),
                'avg_sequence_length': 0,
                'min_sequence_length': float('inf'),
                'max_sequence_length': 0,
                'empty_sequences': 0,
                'valid_sequences': 0
            }
            
            sequence_lengths = []
            
            for sample in samples:
                if 'ast_sequence' in sample:
                    seq_len = len(sample['ast_sequence'])
                    sequence_lengths.append(seq_len)
                    
                    if seq_len == 0:
                        quality_metrics['empty_sequences'] += 1
                    else:
                        quality_metrics['valid_sequences'] += 1
                        quality_metrics['min_sequence_length'] = min(quality_metrics['min_sequence_length'], seq_len)
                        quality_metrics['max_sequence_length'] = max(quality_metrics['max_sequence_length'], seq_len)
            
            if sequence_lengths:
                quality_metrics['avg_sequence_length'] = sum(sequence_lengths) / len(sequence_lengths)
                quality_metrics['median_sequence_length'] = sorted(sequence_lengths)[len(sequence_lengths)//2]
            
            if quality_metrics['min_sequence_length'] == float('inf'):
                quality_metrics['min_sequence_length'] = 0
            
            print(f"✓ AST Quality Analysis:")
            print(f"  Total samples: {quality_metrics['total_samples']}")
            print(f"  Valid sequences: {quality_metrics['valid_sequences']}")
            print(f"  Empty sequences: {quality_metrics['empty_sequences']}")
            print(f"  Avg sequence length: {quality_metrics['avg_sequence_length']:.1f}")
            print(f"  Min sequence length: {quality_metrics['min_sequence_length']}")
            print(f"  Max sequence length: {quality_metrics['max_sequence_length']}")
            
            self.validation_results['ast_quality'] = quality_metrics
            
        except Exception as e:
            print(f"✗ AST quality validation failed: {e}")
            self.validation_results['ast_quality'] = {'error': str(e)}
        
        print()
    
    def _generate_statistics(self):
        """Generate comprehensive statistics"""
        print("Generating comprehensive statistics...")
        
        try:
            metadata_path = os.path.join(self.dataset_dir, 'metadata.json')
            
            if os.path.exists(metadata_path):
                with open(metadata_path, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)
                
                print(f"Dataset Statistics:")
                print(f"  Created: {metadata.get('created_at', 'Unknown')}")
                print(f"  Total C++ files processed: {metadata.get('total_files', 0):,}")
                print(f"  Successful conversions: {metadata.get('successful_conversions', 0):,}")
                print(f"  Success rate: {metadata.get('success_rate', 0):.1f}%")
                print(f"  Failed conversions: {metadata.get('total_files', 0) - metadata.get('successful_conversions', 0):,}")
                
                # Calculate dataset size
                dataset_size = 0
                for root, dirs, files in os.walk(self.dataset_dir):
                    for file in files:
                        dataset_size += os.path.getsize(os.path.join(root, file))
                
                print(f"  Dataset size: {dataset_size / (1024*1024):.1f} MB")
                
                self.validation_results['statistics'] = {
                    'metadata': metadata,
                    'dataset_size_mb': dataset_size / (1024*1024)
                }
        
        except Exception as e:
            print(f"✗ Statistics generation failed: {e}")
        
        print()
    
    def export_validation_report(self):
        """Export validation report"""
        report_path = os.path.join(self.dataset_dir, 'validation_report.json')
        
        try:
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(self.validation_results, f, indent=2, ensure_ascii=False)
            
            print(f"✓ Validation report exported to: {report_path}")
            return report_path
        
        except Exception as e:
            print(f"✗ Failed to export validation report: {e}")
            return None

## Execute Complete Pipeline

Run the complete pipeline to process all C++ files and generate the final dataset.

In [None]:
def main_pipeline():
    """Execute the complete C++ AST dataset generation pipeline"""
    
    print("Starting C++ AST Dataset Generation Pipeline")
    print("="*60)
    
    # Configuration
    plagiarism_dataset_path = Path("/Users/onis2/Downloads/Plagiarism Dataset/src")
    output_dir = Path("/Users/onis2/NLP/TestVersion/cpp_ast_dataset")
    
    # Step 1: Analyze dataset
    print("\n🔍 Step 1: Analyzing dataset...")
    analyzer = DatasetAnalyzer(plagiarism_dataset_path)
    dataset_stats = analyzer.analyze_structure()
    cpp_files = dataset_stats['cpp_files']
    
    print(f"Found {len(cpp_files)} C++ files for processing")
    
    if len(cpp_files) == 0:
        print("❌ No C++ files found. Exiting...")
        return
    
    # Step 2: Process C++ files to AST
    print(f"\n🔄 Step 2: Processing {len(cpp_files)} C++ files...")
    processor = CppASTProcessor(output_dir)
    processing_results = processor.process_batch(cpp_files)
    
    # Step 3: Format for CodeBERT
    print(f"\n📊 Step 3: Formatting dataset for CodeBERT...")
    formatter = CodeBERTDatasetFormatter(output_dir)
    codebert_data = formatter.format_for_codebert(processing_results, max_sequence_length=512)
    dataset_summary = formatter.save_codebert_dataset(codebert_data)
    
    # Step 4: Validate dataset
    print(f"\n✅ Step 4: Validating generated dataset...")
    validator = DatasetValidator(output_dir)
    validation_results = validator.validate_dataset()
    validator.export_validation_report()
    
    # Summary
    print("\n" + "="*60)
    print("PIPELINE COMPLETION SUMMARY")
    print("="*60)
    
    print(f"📁 Dataset location: {output_dir}")
    print(f"📈 Total C++ files found: {len(cpp_files):,}")
    print(f"✅ Successfully processed: {len(processing_results):,}")
    print(f"❌ Failed conversions: {len(cpp_files) - len(processing_results):,}")
    
    if len(cpp_files) > 0:
        success_rate = (len(processing_results) / len(cpp_files)) * 100
        print(f"📊 Success rate: {success_rate:.1f}%")
    
    if 'ast_quality' in validation_results:
        quality = validation_results['ast_quality']
        if 'avg_sequence_length' in quality:
            print(f"📏 Average AST sequence length: {quality['avg_sequence_length']:.1f}")
    
    print(f"\n🎯 Dataset ready for CodeBERT fine-tuning!")
    print(f"📋 Check validation_report.json for detailed analysis")
    
    return {
        'dataset_path': output_dir,
        'processing_results': processing_results,
        'validation_results': validation_results,
        'total_files': len(cpp_files)
    }

# Execute the pipeline
if __name__ == "__main__":
    results = main_pipeline()

Starting C++ AST Dataset Generation Pipeline

🔍 Step 1: Analyzing dataset...


AttributeError: 'DatasetAnalyzer' object has no attribute 'analyze_dataset'