In [None]:
import os
import sys
import logging

# Configure the root logger to emit DEBUG-level (and above) records for this session.
logging.basicConfig(level=logging.DEBUG)

# __file__ is not defined in notebooks, so the src directory is located relative
# to the current working directory instead.
SRC_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "src"))

# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [None]:
from models.nlp_models.underthesea import UndertheseaModel
from models.config import NLPConfig

# Build the configuration object (constructed with no arguments) first, then
# hand it to the model wrapper used by the following cells.
nlp_config = NLPConfig()
model = UndertheseaModel(nlp_config)

In [None]:
# Sample Vietnamese query that is passed to each model method in turn.
query = "Thống kê số trạm Smallcell của Viettel tại Khu vực 1 năm 2025"

# (printed label, model method name) pairs, listed in output order.
demo_calls = (
    ("Tokenize", "tokenize"),
    ("POS Tag", "pos_tag"),
    ("Extract Entities", "extract_entities"),
    ("Analyze Sentiment", "analyze_sentiment"),
    ("Parse Dependencies", "parse_dependencies"),
)

for label, method_name in demo_calls:
    # Look each method up only when it is about to be called, so every result
    # is printed before the next method is touched.
    outcome = getattr(model, method_name)(query)
    print(f"{label}: {outcome}")

In [None]:
# NOTE: the code under test is imported from `models.nlp_processor` (see the import below), not from a standalone `netmind_nlp.py` file.

from pathlib import Path
from models.nlp_processor import NLPProcessor, NLPTechnique

# --- Test Functions ---

def test_nlp_processor(model_config_path=None):
    """
    Smoke-test the NLPProcessor class end to end, printing every result.

    The function runs five stages in order: supported languages/techniques,
    English text processing, Vietnamese text processing, comprehensive
    analysis, and batch processing. Nothing is asserted; the printed output
    is meant to be inspected by the reader.

    Args:
        model_config_path: Path of the NLP YAML config passed to
            ``NLPProcessor``. When omitted, the relative path
            ``../config/nlp.yaml`` is used, assembled with ``pathlib`` so the
            separator matches the host operating system.

    Returns:
        None. If the processor cannot be constructed, the error is printed
        and the function returns before running any stage.
    """
    if model_config_path is None:
        # Build the path from its parts: a hard-coded backslash path only
        # resolves on Windows, while this yields the right separator everywhere.
        model_config_path = str(Path("..") / "config" / "nlp.yaml")

    print("--- Initializing NLPProcessor ---")
    try:
        # Initialize the processor with the config file
        processor = NLPProcessor(model_config_path=model_config_path)
        print("✅ Processor initialized successfully.")
    except Exception as e:
        # Best-effort demo: report the failure and stop, since every later
        # stage needs a working processor.
        print(f"❌ Initialization failed: {e}")
        return

    # --- Test 1: Supported Languages & Techniques ---
    print("\n--- Test 1: Supported Languages & Techniques ---")
    supported_langs = processor.get_supported_languages()
    available_techniques = processor.get_available_techniques()
    print(f"Supported Languages: {supported_langs}")
    print(f"Available Techniques: {available_techniques}")

    # --- Test 2: English Text Processing ---
    print("\n--- Test 2: English Text Processing ---")
    english_text = "Google's Gemini is a powerful large language model. It's revolutionizing the way we interact with information."
    print(f"Input Text: '{english_text}'")

    english_result = processor.process_text(
        english_text,
        techniques=[
            NLPTechnique.TOKENIZATION,
            NLPTechnique.POS_TAGGING,
            NLPTechnique.SENTIMENT_ANALYSIS,
            NLPTechnique.NAMED_ENTITY_RECOGNITION
        ]
    )

    print(f"Detected Language: {english_result.language}")
    print(f"Processing Time: {english_result.processing_time:.4f}s")
    print("Tokens:", english_result.tokens)
    print("POS Tags:", english_result.pos_tags)
    print("Sentiment:", english_result.sentiment)
    print("Entities:", english_result.entities)

    # --- Test 3: Vietnamese Text Processing ---
    print("\n--- Test 3: Vietnamese Text Processing ---")
    vietnamese_text = "Hôm nay, thời tiết ở Hà Nội rất đẹp. Tôi muốn đi chơi."
    print(f"Input Text: '{vietnamese_text}'")

    vietnamese_result = processor.process_text(
        vietnamese_text,
        techniques=[
            NLPTechnique.TOKENIZATION,
            NLPTechnique.LEMMATIZATION,
            NLPTechnique.NAMED_ENTITY_RECOGNITION
        ]
    )
    print(f"Detected Language: {vietnamese_result.language}")
    print(f"Processing Time: {vietnamese_result.processing_time:.4f}s")
    print("Tokens:", vietnamese_result.tokens)
    print("Lemmas:", vietnamese_result.lemmas)
    print("Entities:", vietnamese_result.entities)

    # --- Test 4: Comprehensive Analysis ---
    print("\n--- Test 4: Comprehensive Text Analysis ---")
    analysis_text = "The company's new AI product, 'QuantumSphere,' achieved record sales in Q3."
    analysis_result = processor.analyze_text_comprehensive(analysis_text)

    print(f"Input Text: '{analysis_text}'")
    print(f"Language: {analysis_result.language}")
    print(f"Word Count: {analysis_result.word_count}")
    print(f"Average Word Length: {analysis_result.avg_word_length:.2f}")
    print(f"Sentiment Score: {analysis_result.sentiment_score:.2f}")
    print(f"Keywords: {analysis_result.keywords}")
    print(f"Entities: {analysis_result.entities}")
    print(f"Topics: {analysis_result.topics}")
    print(f"Grammar Issues: {analysis_result.grammar_issues}")

    # --- Test 5: Batch Processing ---
    print("\n--- Test 5: Batch Processing ---")
    batch_texts = [
        "This is the first sentence.",
        "And this is the second one.",
        "Đây là câu thứ ba trong danh sách."
    ]
    batch_results = processor.batch_process(batch_texts)

    # Number the results from 1 for display.
    for position, result in enumerate(batch_results, start=1):
        print(f"\nResult {position}:")
        print(f"Text: '{result.text}'")
        print(f"Language: {result.language}")
        print(f"Tokens: {result.tokens}")

    print("\n--- All tests completed. ---")

# --- Run the tests ---
# Invoke the smoke test only when this code executes as the top-level program.
# NOTE(review): a notebook kernel normally runs cells with __name__ == "__main__",
# so this guard is expected to pass when the cell is executed — confirm.
if __name__ == "__main__":
    test_nlp_processor()

In [None]:
from models import nlp_processor

In [None]:
# Noisy input: a Vietnamese sentence with a run of punctuation and symbol
# characters embedded in the middle.
# NOTE(review): `nlp_processor` here is the module imported from `models`, not
# an NLPProcessor instance — confirm the module exposes a module-level
# `process_text` function.
noisy_text = "Thông tin thuê bao ()*-25-890248598^&(Q#^%(&^$*(%^*(&)*#%&) Offline chặn cắt tại Trung tâm Quỳnh Lưu tỉnh Nghệ An trong cơn bão số 5"
nlp_processor.process_text(noisy_text)