In [17]:
## For C language 
import os

def clean_tokenized_code(tokenized_code):
    """Join a sequence of token strings into one space-separated string.

    No token is dropped. An earlier version compared each token against an
    allow-list but appended the token in both branches of the check, so the
    allow-list never had any effect. That dead branching has been removed;
    the returned string is the same as before.

    Args:
        tokenized_code: Iterable of token strings. The caller in this file
            passes the result of ``str.split()`` on a file's contents.

    Returns:
        str: The tokens joined by single spaces; ``''`` for empty input.
    """
    # A plain join is all the old loop ever did: every token was appended
    # unconditionally and then joined with a single space.
    return ' '.join(tokenized_code)

def clean_files_in_folder(input_folder, output_folder):
    """Clean every ``.txt`` token file in a folder and save the results.

    Each file in ``input_folder`` whose name ends in ``.txt`` is read as
    UTF-8, split on whitespace, passed through ``clean_tokenized_code`` and
    written (UTF-8) under the same file name in ``output_folder``. A
    completion message is printed once the loop finishes.

    Args:
        input_folder: Directory containing the tokenised ``.txt`` files.
        output_folder: Directory that receives the cleaned files; it is
            created if it does not exist yet.
    """
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        # Anything that is not a .txt file is skipped.
        if not entry.endswith(".txt"):
            continue

        source_path = os.path.join(input_folder, entry)
        target_path = os.path.join(output_folder, entry)

        with open(source_path, 'r', encoding='utf-8') as src:
            tokens = src.read().split()

        # Clean before opening the target so a failure here cannot leave a
        # truncated output file behind.
        cleaned_text = clean_tokenized_code(tokens)

        with open(target_path, 'w', encoding='utf-8') as dst:
            dst.write(cleaned_text)

    print("Task complete: All files have been cleaned and saved.")

# Example usage: run the whitespace-token cleaner over one folder of .txt files.
# NOTE(review): this cell is headed "For C language" but both paths below point
# at C++ folders — confirm which data set this cell is meant to process.
# NOTE(review): these are absolute, machine-specific Windows paths; the call
# raises if the input folder does not exist on the current machine.
input_folder = r"E:\codes\sumit\SEM_PROJ_4\Token_output\C++_tokenise_data\C++"  # Use raw string
output_folder = r"E:\codes\sumit\SEM_PROJ_4\Cleaned_Tokenise_data\C++_Cleaned_Tokenise_data\C++"  # Use raw string
clean_files_in_folder(input_folder, output_folder)


In [None]:
## For C++ language 

import os
import re

# Token type names whose lines are kept.
# NOTE(review): these look like Pygments token type reprs — confirm against
# the tokeniser that produced the input files.
_KEPT_TOKEN_TYPES = (
    'Token.Comment.Preproc', 'Token.Comment.Single', 'Token.Text.Whitespace',
    'Token.Comment.PreprocFile', 'Token.Keyword', 'Token.Name.Namespace',
    'Token.Punctuation', 'Token.Keyword.Type', 'Token.Name.Function',
    'Token.Name', 'Token.Operator', 'Token.Literal.Number.Integer',
    'Token.Literal.String', 'Token.Literal.Number.Float', 'Token.Name.Class',
)

# Compiled once at definition time. Each name is escaped so that "." matches
# a literal dot rather than acting as the regex "any character" wildcard.
_KEPT_TOKEN_RE = re.compile('|'.join(re.escape(name) for name in _KEPT_TOKEN_TYPES))


def clean_tokenized_code_via_regex(tokenized_code, verbose=False):
    """Keep only the lines that mention a known token type and join them.

    A line is kept when any of the names in ``_KEPT_TOKEN_TYPES`` occurs
    anywhere inside it (unanchored search). Lines without such a name,
    including empty lines, are dropped.

    Args:
        tokenized_code: Iterable of strings, one per line of a tokenised file.
        verbose: When true, print a debug message for every kept line.
            Defaults to ``False`` so large files do not flood the output.

    Returns:
        str: The kept lines joined by single spaces; ``''`` if none matched.
    """
    kept_lines = []

    for line in tokenized_code:
        if _KEPT_TOKEN_RE.search(line):
            kept_lines.append(line)
            if verbose:
                # Debug print to confirm matching tokens
                print(f'Adding matched token: {line}')

    return ' '.join(kept_lines)

def clean_files_in_folder_via_regex(input_folder, output_folder):
    """Filter every ``.txt`` token file in a folder and save the results.

    Each file in ``input_folder`` whose name ends in ``.txt`` is read as
    UTF-8, split on newline characters, passed through
    ``clean_tokenized_code_via_regex`` and written (UTF-8) under the same
    file name in ``output_folder``. Progress lines are printed per file and
    a completion message is printed at the end.

    Args:
        input_folder: Directory containing the tokenised ``.txt`` files.
        output_folder: Directory that receives the cleaned files; it is
            created if it does not exist yet.
    """
    os.makedirs(output_folder, exist_ok=True)

    for name in os.listdir(input_folder):
        # Anything that is not a .txt file is skipped.
        if not name.endswith(".txt"):
            continue

        src_path = os.path.join(input_folder, name)
        dst_path = os.path.join(output_folder, name)

        with open(src_path, 'r', encoding='utf-8') as src:
            # split('\n') rather than splitlines(): the reported line count
            # includes the trailing empty entry after a final newline.
            token_lines = src.read().split('\n')

        print(f'Reading file: {name}, with {len(token_lines)} lines of tokens')

        cleaned_text = clean_tokenized_code_via_regex(token_lines)

        print(f'Writing cleaned code with {len(cleaned_text.split())} tokens to file: {dst_path}')

        with open(dst_path, 'w', encoding='utf-8') as dst:
            dst.write(cleaned_text)

    print("Task complete: All files have been cleaned and saved.")

# Example usage: run the regex-based cleaner over one folder of .txt files.
# These assignments rebind the module-level names input_folder / output_folder
# that the first cell also assigns, so their values depend on which cell ran last.
# NOTE(review): these are absolute, machine-specific Windows paths; the call
# raises if the input folder does not exist on the current machine.
input_folder = r"E:\codes\sumit\SEM_PROJ_4\Token_output\C++_tokenise_data\C++ 7"  # Use raw string
output_folder = r"E:\codes\sumit\SEM_PROJ_4\Cleaned_Tokenise_data\C++_Cleaned_Tokenise_data\C++ 7"  # Use raw string
clean_files_in_folder_via_regex(input_folder, output_folder)
