In [None]:
pip install transformers torch



In [None]:
import os
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ---- Configuration ----

# Directory holding the caption .txt files; update with the path to your folder.
# NOTE(review): Colab normally mounts Drive at /content/drive/MyDrive
# (capital M and D) -- confirm this lowercase path actually exists.
folder_path = "/content/drive/mydrive/trimmed_captions"

# Checkpoint used for entity detection; replace with a model suitable for your needs.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"

# Entity group reported by the NER pipeline -> generic label written into the text.
classification_map = {
    "PER": "Player",  # person names are assumed to be players
    "ORG": "Team",    # organization names are assumed to be teams
}

# ---- Model setup ----

# Tokenizer and token-classification weights both come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# The "simple" aggregation strategy yields grouped results carrying an
# "entity_group" key, which process_file relies on.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Function to process a single file
def process_file(file_path):
    """Return the content of a text file with mapped entities replaced by labels.

    Runs ``ner_pipeline`` over the file's text and substitutes every detected
    entity whose ``entity_group`` is a key of ``classification_map`` with the
    mapped label.

    Entities are replaced at the character offsets (``start``/``end``) reported
    by the pipeline instead of by searching for the entity's ``word``. A global
    ``str.replace`` would also rewrite unrelated occurrences of the same
    characters (for example inside longer words or inside labels inserted
    earlier) and would miss entities whose ``word`` is not spelled exactly as
    in the raw text.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        The modified text. The file on disk is not changed.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Nothing to tag in an empty/whitespace-only file; skip the model call.
    if not text.strip():
        return text

    entities = ner_pipeline(text)

    # Keep only the entity types we know how to relabel.
    mapped = [e for e in entities if e["entity_group"] in classification_map]

    # Offsets may be missing (e.g. tokenizers that cannot report them). In that
    # case fall back to the word-based replacement so the function still works.
    if any(e.get("start") is None or e.get("end") is None for e in mapped):
        modified_text = text
        for entity in mapped:
            replacement = classification_map[entity["entity_group"]]
            modified_text = modified_text.replace(entity["word"], replacement)
        return modified_text

    # Rebuild the text left to right, copying untouched spans verbatim and
    # emitting the label in place of each entity span.
    pieces = []
    cursor = 0
    for entity in sorted(mapped, key=lambda e: e["start"]):
        start, end = entity["start"], entity["end"]
        if start < cursor:
            # Overlaps a span that was already replaced; skip it.
            continue
        pieces.append(text[cursor:start])
        pieces.append(classification_map[entity["entity_group"]])
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces)

# Process all text files in the folder
# Outputs are written next to the inputs with this suffix. They must be
# excluded from the input set, otherwise a second run would reprocess them and
# produce "*_modified_modified.txt" files.
output_suffix = "_modified.txt"

# Sorted so the processing order (and the log output) is reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):  # Process only .txt files
        continue
    if filename.endswith(output_suffix):  # Skip results of a previous run
        continue

    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):  # e.g. a directory whose name ends in .txt
        continue

    # Process the file and get modified content
    modified_content = process_file(file_path)

    # Save the modified content to a new file alongside the original
    new_file_path = os.path.join(
        folder_path, f"{os.path.splitext(filename)[0]}{output_suffix}"
    )
    with open(new_file_path, "w", encoding="utf-8") as new_file:
        new_file.write(modified_content)

    print(f"Processed and saved: {new_file_path}")
