In [3]:
import spacy
import os
import pandas as pd

# Preprocess every "*.cleaned.txt" file under base_dir/<subdomain>/ with spaCy
# and write three sibling files next to it:
#   <stem>.tokens.txt    - space-joined tokens (stop words, punctuation and
#                          whitespace tokens removed)
#   <stem>.sentences.txt - one sentence per line
#   <stem>.entities.csv  - named entities with columns "entity", "label"
nlp = spacy.load("en_core_web_sm")
base_dir = "../data/subdomains/"

# Keep only real sub-directories: a stray file in base_dir (e.g. .DS_Store or
# a README) would otherwise make os.listdir(folder) raise NotADirectoryError.
# Sorted so the processing order does not depend on the filesystem.
subdomains = sorted(
    name for name in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, name))
)

CLEANED_SUFFIX = ".cleaned.txt"

for subdomain in subdomains:
    folder = os.path.join(base_dir, subdomain)
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(CLEANED_SUFFIX):
            continue
        file_path = os.path.join(folder, fname)
        # Strip the suffix from the end of the file name only. str.replace on
        # the full path would also rewrite any ".cleaned.txt" that happens to
        # appear in a directory name or earlier in the file name.
        stem_path = os.path.join(folder, fname[: -len(CLEANED_SUFFIX)])

        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        doc = nlp(text)

        # spaCy emits whitespace-only tokens (newlines, runs of spaces); they
        # are neither stop words nor punctuation, so filter them explicitly to
        # keep stray line breaks out of the space-joined tokens file.
        tokens = [
            token.text
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)
        ]
        sentences = [sent.text.strip() for sent in doc.sents]
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Save tokens
        with open(stem_path + ".tokens.txt", "w", encoding="utf-8") as f:
            f.write(" ".join(tokens))
        # Save sentences
        with open(stem_path + ".sentences.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(sentences))
        # Save entities
        pd.DataFrame(entities, columns=["entity", "label"]).to_csv(
            stem_path + ".entities.csv", index=False
        )


In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Compute one sentence-transformer embedding per "*.cleaned.txt" file and save
# it next to the text as "<stem>.embedding.npy".
model = SentenceTransformer('all-MiniLM-L6-v2')
for subdomain in subdomains:
    folder = os.path.join(base_dir, subdomain)
    # Skip stray files that sit directly in base_dir; calling os.listdir on
    # them would raise NotADirectoryError.
    if not os.path.isdir(folder):
        continue
    # Sorted so the processing order does not depend on the filesystem.
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(".cleaned.txt"):
            continue
        with open(os.path.join(folder, fname), "r", encoding="utf-8") as f:
            text = f.read()
        # encode() is given a one-element list, so [0] picks out the vector
        # for this document.
        # NOTE(review): the model presumably truncates inputs beyond its
        # maximum sequence length, so long files may be represented by their
        # beginning only - confirm against the model card.
        embedding = model.encode([text])[0]
        # Swap the suffix at the end of the name only (not every occurrence).
        embedding_name = fname[: -len(".cleaned.txt")] + ".embedding.npy"
        np.save(os.path.join(folder, embedding_name), embedding)


In [5]:
# Build one "train.csv" per subdomain folder. Each row pairs the text of a
# "*.cleaned.txt" file with the path of its saved embedding and uses the
# subdomain (folder) name as the label.
for subdomain in subdomains:
    folder = os.path.join(base_dir, subdomain)
    # Skip stray files that sit directly in base_dir; calling os.listdir on
    # them would raise NotADirectoryError.
    if not os.path.isdir(folder):
        continue
    data = []
    # Sorted so the row order of train.csv is reproducible across runs.
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(".embedding.npy"):
            continue
        # Map "<stem>.embedding.npy" back to the text it was computed from.
        text_file = fname[: -len(".embedding.npy")] + ".cleaned.txt"
        text_path = os.path.join(folder, text_file)
        embedding_path = os.path.join(folder, fname)
        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read()
        data.append({
            "text": text,
            "embedding_path": embedding_path,
            "label": subdomain
        })
    # Explicit columns keep the header present even when the folder has no
    # embeddings; without them an empty list yields a CSV with no header that
    # pd.read_csv cannot parse.
    pd.DataFrame(data, columns=["text", "embedding_path", "label"]).to_csv(
        os.path.join(folder, "train.csv"), index=False
    )