In [10]:
import os
from pygments import lex
from pygments.lexers import PythonLexer
from pygments.token import Token

def tokenize_python_file(filepath):
    """Lex a Python source file and describe each meaningful token as text.

    The file is read as UTF-8 and run through Pygments' ``PythonLexer``.

    Args:
        filepath: Path of the Python source file to read.

    Returns:
        A list of strings, one per kept token, formatted as
        ``"<value> (<token type>)"`` in the order the lexer produced them.
        Tokens are dropped when their type falls under
        ``Token.Text.Whitespace`` or when their value is empty once
        stripped. Kept values are written verbatim, so a token spanning
        several lines yields an entry with embedded newlines.
    """
    with open(filepath, 'r', encoding='utf-8') as source:
        text = source.read()

    whitespace = Token.Text.Whitespace
    return [
        f"{value} ({kind})"
        for kind, value in lex(text, PythonLexer())
        # Skip whitespace tokens
        if kind not in whitespace and value.strip()
    ]

def tokenize_files_in_folder(input_folder_path, output_folder_path):
    """Tokenize each ``.py`` file directly inside a folder into its own text file.

    For every regular file in ``input_folder_path`` whose name ends with
    ``.py``, the tokens returned by ``tokenize_python_file`` are written one
    per line to ``<output_folder_path>/<filename>_tokens.txt`` (the original
    extension is kept, e.g. ``a.py`` -> ``a.py_tokens.txt``). Sub-folders are
    not searched.

    Args:
        input_folder_path: Folder that is listed for Python source files.
        output_folder_path: Folder that receives the token files; it is
            created, including missing parents, when it does not exist.

    Returns:
        None. A message is printed for each file that yields no tokens, and
        no output file is created for it.
    """
    # exist_ok replaces the exists()-then-makedirs() pair, which could fail
    # if the folder appeared between the check and the creation.
    os.makedirs(output_folder_path, exist_ok=True)

    supported_extensions = ('.py',)

    for filename in os.listdir(input_folder_path):
        if not filename.endswith(supported_extensions):
            continue

        filepath = os.path.join(input_folder_path, filename)
        # listdir also returns directories; one named like "pkg.py" cannot be
        # opened as a source file, so skip anything that is not a regular file.
        if not os.path.isfile(filepath):
            continue

        tokens = tokenize_python_file(filepath)
        if not tokens:
            print(f"No tokens generated for {filename}")
            continue

        output_filepath = os.path.join(output_folder_path, f"{filename}_tokens.txt")
        with open(output_filepath, 'w', encoding='utf-8') as output_file:
            output_file.writelines(f"{token}\n" for token in tokens)

# Example usage: tokenize every .py file in one input folder and write the
# token listings to a separate output folder.
# NOTE: both paths are machine-specific absolute Windows paths; change them
# before running this anywhere else.
input_folder_path = r"E:\codes\sumit\SEM_PROJ_4\Mini_Project_CodeNet\data\p04030\Python"
output_folder_path = r"E:\codes\sumit\SEM_PROJ_4\Token_output\Python_tokenise_data\python 7"
# Runs immediately when this cell/script executes (there is no __main__ guard).
tokenize_files_in_folder(input_folder_path, output_folder_path)
# Printed unconditionally, whether or not any token file was actually written.
print(f"Tokenized files are saved in: {output_folder_path}")


Tokenized files are saved in: E:\codes\sumit\SEM_PROJ_4\Token_output\Python_tokenise_data\python 7
