# Project: Term Extraction from Russian Texts

This project aims to extract key terms from Russian texts using NLP techniques
and a neural network model.

In [None]:
# Install necessary libraries
!pip install nltk spacy scikit-learn tensorflow pandas matplotlib seaborn wordcloud pymorphy2

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Download spaCy Russian language model
!python -m spacy download ru_core_news_sm

print("Setup Complete. Libraries installed and resources downloaded.")

## Text Preprocessing Functions

In [None]:
# Import necessary libraries for preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pymorphy2
import string

# Initialize MorphAnalyzer for Russian
# Created once at module level; lemmatize_tokens() and get_morphological_analysis()
# both read this shared instance.
morph = pymorphy2.MorphAnalyzer()

# Russian stopwords and punctuation
# Both names are read by remove_stopwords(). `punctuations` is a list of the
# individual characters of string.punctuation (ASCII punctuation only).
russian_stopwords = stopwords.words('russian')
punctuations = list(string.punctuation)

# --- Tokenization ---
def tokenize_text(text):
    """Split Russian text into a list of lowercase tokens.

    The text is lowercased first and then handed to NLTK's ``word_tokenize``
    with the Russian language setting.
    """
    lowered = text.lower()
    return word_tokenize(lowered, language='russian')

# --- Lemmatization ---
def lemmatize_tokens(tokens):
    """Return the normal form of each token, taken from its first pymorphy2 parse."""
    normal_forms = []
    for word in tokens:
        first_parse = morph.parse(word)[0]
        normal_forms.append(first_parse.normal_form)
    return normal_forms

# --- Stop Word Removal ---
def remove_stopwords(tokens):
    """Removes Russian stopwords, punctuation and blank strings from a list of tokens.

    A token is treated as punctuation when it is listed in ``punctuations`` or
    when every one of its characters is either listed there or belongs to a
    Unicode punctuation category. This also drops multi-character and
    non-ASCII punctuation tokens (e.g. '...', '«', '»', '—') that a plain
    membership test against ``string.punctuation`` lets through.

    Args:
        tokens (list): Token strings, typically lowercased and lemmatized.

    Returns:
        list: The tokens that are neither stopwords, punctuation nor blank,
        in their original order.
    """
    import unicodedata  # local import so the cell does not depend on a new top-level import

    # Sets give constant-time membership tests; built per call from the module-level lists.
    stopword_set = set(russian_stopwords)
    punctuation_set = set(punctuations)

    def is_punctuation(token):
        # True when the token consists solely of punctuation characters.
        return all(ch in punctuation_set or unicodedata.category(ch).startswith('P')
                   for ch in token)

    return [token for token in tokens
            if token.strip()  # skip empty / whitespace-only strings first
            and token not in stopword_set
            and token not in punctuation_set
            and not is_punctuation(token)]

# --- Morphological Analysis ---
def get_morphological_analysis(tokens):
    """Pair each token with the grammatical tag of its first pymorphy2 parse.

    Returns a list of ``{'token': ..., 'tag': ...}`` dicts in input order.
    """
    return [{'token': word, 'tag': morph.parse(word)[0].tag} for word in tokens]

print("Text preprocessing and morphological analysis functions defined.")

### Example Usage of Preprocessing Functions

In [None]:
# Example Usage
# Walks one sentence through the preprocessing steps in order:
# tokenize -> lemmatize -> remove stopwords/punctuation, printing each stage.
sample_text = "Пример текста для демонстрации предобработки данных, включая знаки препинания!"
print(f"Original text: {sample_text}")

# Tokenize
tokens = tokenize_text(sample_text)
print(f"\nTokens: {tokens}")

# Lemmatize
lemmatized = lemmatize_tokens(tokens)
print(f"\nLemmatized: {lemmatized}")

# Remove Stopwords and Punctuation
# Filtering is applied to the lemmas, not to the raw tokens.
no_stopwords = remove_stopwords(lemmatized)
print(f"\nWithout stopwords and punctuation: {no_stopwords}")

# Second example: a short sentence containing a standalone hyphen and a final period.
# Same three steps as above, but only the final filtered result is printed.
sample_text_2 = "Ещё один пример - очень короткий."
print(f"\nOriginal text 2: {sample_text_2}")
tokens_2 = tokenize_text(sample_text_2)
lemmatized_2 = lemmatize_tokens(tokens_2)
no_stopwords_2 = remove_stopwords(lemmatized_2)
print(f"Without stopwords (Text 2): {no_stopwords_2}")

### Example Usage of Morphological Analysis Function

In [None]:
# Example Usage for Morphological Analysis
# Part 1: analyse a hand-written list of inflected word forms.
sample_tokens_for_morph = ['текста', 'для', 'демонстрации', 'обработки'] # Hand-picked inflected forms, similar to the previous example
print(f"Sample tokens for morphological analysis: {sample_tokens_for_morph}")
morphological_data = get_morphological_analysis(sample_tokens_for_morph)
for item in morphological_data:
    print(f"Token: {item['token']}, Morphology: {item['tag']}")

print("\n---")

# Example integrating with tokenize_text
# Part 2: tokenize a full sentence and analyse every resulting token
# (no stopword/punctuation filtering is applied here).
sample_text_morph = "Красивая мама мыла раму."
print(f"\nOriginal text for morph analysis: {sample_text_morph}")

tokens_morph = tokenize_text(sample_text_morph)
print(f"Tokens: {tokens_morph}")

morphological_data_full = get_morphological_analysis(tokens_morph)
print("\nMorphological analysis of tokenized text:")
for item in morphological_data_full:
    print(f"Token: {item['token']}, Morphology: {item['tag']}")

print("\n---")
# Part 3: repeat the analysis on the lemmas of the same tokens, for comparison with Part 2.
print("\nExample with lemmatized tokens (to see how tags might change or simplify):")
lemmatized_tokens_morph = lemmatize_tokens(tokens_morph) # Lemmatize first
print(f"Lemmatized Tokens: {lemmatized_tokens_morph}")
morphological_data_lemmatized = get_morphological_analysis(lemmatized_tokens_morph)
for item in morphological_data_lemmatized:
    print(f"Token: {item['token']}, Morphology: {item['tag']}")

## Statistical Analysis Functions

In [None]:
# Import necessary libraries for statistical analysis
import nltk
from nltk.probability import FreqDist

# --- Term Frequency (TF) Calculation ---
def calculate_tf(tokens):
    """Count how many times each token occurs.

    Args:
        tokens (list): Processed tokens (ideally lemmatized, stopwords removed).

    Returns:
        nltk.probability.FreqDist: Frequency distribution built from ``tokens``.
    """
    return FreqDist(tokens)

# --- Display Most Frequent Terms ---
def display_most_frequent(fdist, top_n=10):
    """Displays the most frequent terms from a frequency mapping.

    Args:
        fdist (nltk.probability.FreqDist | dict): A FreqDist object or a plain
            dictionary mapping tokens to counts.
        top_n (int): The number of most frequent terms to display.

    Prints a header line followed by one ``term: count`` line per term, most
    frequent first. Returns nothing.
    """
    from collections import Counter  # local import keeps the cell self-contained

    # A plain dict has no most_common(); wrap it so the documented dict input
    # works instead of raising AttributeError. FreqDist already provides it.
    if not hasattr(fdist, 'most_common'):
        fdist = Counter(fdist)

    print(f"Top {top_n} most frequent terms:")
    for word, count in fdist.most_common(top_n):
        print(f"{word}: {count}")

print("Statistical analysis functions defined.")

### Example Usage of Statistical Analysis Functions

In [None]:
# Example Usage of Statistical Analysis Functions
# Runs the full preprocessing chain on one paragraph and then counts term frequencies.
sample_text_stats = "Это пример текста, который является примером и используется для демонстрации статистического анализа текста. Анализ текста важен для понимания текста."
print(f"Original text for statistical analysis: {sample_text_stats}")

# Preprocess the text (Tokenize, Lemmatize, Remove Stopwords)
tokens_stats = tokenize_text(sample_text_stats)
print(f"\nTokens: {tokens_stats}")

lemmatized_stats = lemmatize_tokens(tokens_stats)
print(f"\nLemmatized Tokens: {lemmatized_stats}")

no_stopwords_stats = remove_stopwords(lemmatized_stats)
print(f"\nProcessed (no stopwords) tokens for stats: {no_stopwords_stats}")

# Calculate TF
# Counting happens on the filtered lemmas, so inflected forms of one word share a count.
term_frequencies = calculate_tf(no_stopwords_stats)
print(f"\nTerm Frequencies (FreqDist): {term_frequencies}") # Prints the FreqDist object's own string form
# To see it as a dictionary:
# print(f"Term Frequencies (dict): {dict(term_frequencies)}")

# Display most frequent terms
display_most_frequent(term_frequencies, top_n=5)

print("\n---")
print("Note: For more advanced term importance, especially across multiple documents, TF-IDF (Term Frequency-Inverse Document Frequency) would be a next step.")

## Semantic Analysis Functions (spaCy)

In [None]:
# Import spacy and numpy (for potential vector operations if needed)
import spacy
import numpy as np # Usually not directly needed for basic similarity, but good to have for vector math

# Load the spaCy model for Russian.
# Note: 'ru_core_news_sm' is small and has basic vectors.
# For better quality word vectors, consider using larger models:
# !python -m spacy download ru_core_news_md
# nlp_spacy = spacy.load('ru_core_news_md')
# OR
# !python -m spacy download ru_core_news_lg
# nlp_spacy = spacy.load('ru_core_news_lg')
# Load the Russian pipeline; if it is not installed, fall back to a stub so the
# function definitions and examples below can still run without raising.
try:
    nlp_spacy = spacy.load('ru_core_news_sm')
except OSError:
    print('spaCy Russian model not found. Please run in the first cell:\n!python -m spacy download ru_core_news_sm')
    # Create a dummy nlp_spacy object to avoid errors in function definitions if model isn't downloaded yet
    class DummySpacyNLP:
        """Callable stand-in for a spaCy pipeline: returns a DummyDoc for any text."""
        def __call__(self, text):
            class DummyDoc:
                """Minimal document stub exposing the attributes the helpers below read."""
                def __init__(self, text):
                    self.text = text
                    self.vector = np.zeros((1,)) # Dummy vector
                    # False sends the similarity helpers down their "no vector" branches.
                    self.has_vector = False
                def similarity(self, other_doc):
                    # The stub has no vectors, so every comparison scores 0.0.
                    return 0.0
            return DummyDoc(text)
    nlp_spacy = DummySpacyNLP()

# --- Get Word Vector ---
def get_word_vector(token_text, nlp_model):
    """Run ``token_text`` through ``nlp_model`` and return the result's ``vector``."""
    processed = nlp_model(token_text)
    return processed.vector

# --- Calculate Similarity Between Two Tokens ---
def calculate_similarity_between_tokens(token1_text, token2_text, nlp_model):
    """Return the model's similarity score for two tokens.

    Both inputs are converted to lowercase strings before being processed by
    ``nlp_model``. If either processed result reports no vector, 0.0 is returned.
    """
    first, second = (nlp_model(str(raw).lower()) for raw in (token1_text, token2_text))
    if not (first.has_vector and second.has_vector):
        return 0.0
    return first.similarity(second)

# --- Find Most Similar Tokens from a List ---
def find_most_similar_from_list(target_token_text, candidate_tokens_list, nlp_model, top_n=3):
    """Rank candidates by similarity to a target token and return the best ``top_n``.

    Target and candidates are lowercased strings processed by ``nlp_model``.
    Candidates without a vector, or whose text equals the target's text, are
    skipped. Returns a list of ``(text, score)`` tuples, highest score first;
    an empty list (after printing a warning) if the target has no vector.
    """
    target = nlp_model(str(target_token_text).lower())
    if not target.has_vector:
        print(f"Warning: No vector for target token '{target.text}' in the model.")
        return []

    # Lazily process each candidate, then score only the usable ones.
    processed = (nlp_model(str(raw).lower()) for raw in candidate_tokens_list)
    scored = [(doc.text, target.similarity(doc))
              for doc in processed
              if doc.has_vector and doc.text != target.text]

    # Stable descending sort: equal scores keep their candidate-list order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

print("Semantic analysis functions (spaCy) defined.")

### Example Usage of Semantic Analysis Functions

In [None]:
# Example Usage of Semantic Analysis Functions

# Ensure nlp_spacy is a real spaCy pipeline (it should be, from the cell above)
# and not the fallback stub. The check uses the generic Language base class:
# `spacy.lang.ru` is only available as an attribute once the Russian model has
# been loaded, so referencing it here would raise AttributeError in exactly the
# fallback case this branch is meant to report.
if isinstance(nlp_spacy, spacy.language.Language):
    print(f"spaCy model '{nlp_spacy.meta['name']}' loaded successfully for semantic analysis.\n")
else:
    print("spaCy model was not loaded successfully. Semantic analysis examples might not work.\n")

# Sample paragraph used as the source of candidate words in step 3 below.
sample_text_semantic = "Компьютер это мощное устройство для вычислений и обработки данных. Телефон также является устройством связи."
print(f"Sample text for semantic analysis: {sample_text_semantic}")

# Tokenize the text (using existing function for consistency, though spaCy can tokenize directly)
raw_tokens_semantic = tokenize_text(sample_text_semantic) 
print(f"\nRaw tokens from NLTK tokenizer: {raw_tokens_semantic}")

# For spaCy similarity, it's often good to work with lemmas or lowercase individual words.
# We will use lowercase versions of the NLTK-tokenized words for this example.

# 1. Demonstrate getting a word vector
example_token_for_vector = "компьютер" # Using lowercase directly
# Guard on has_vector so the fallback stub (which reports no vector) prints a message instead.
if nlp_spacy(example_token_for_vector).has_vector:
    vector = get_word_vector(example_token_for_vector, nlp_spacy)
    print(f"\nVector for '{example_token_for_vector}' (first 5 elements): {vector[:5]}...")
else:
    print(f"\nNo vector found for '{example_token_for_vector}' in the current spaCy model.")

# 2. Demonstrate similarity calculation between two tokens
# The same anchor word (token_a_text) is compared against three different words.
token_a_text = "компьютер"
token_b_text = "устройство"
similarity_score = calculate_similarity_between_tokens(token_a_text, token_b_text, nlp_spacy)
print(f"\nSimilarity between '{token_a_text}' and '{token_b_text}': {similarity_score:.4f}")

token_c_text = "телефон"
similarity_score_2 = calculate_similarity_between_tokens(token_a_text, token_c_text, nlp_spacy)
print(f"Similarity between '{token_a_text}' and '{token_c_text}': {similarity_score_2:.4f}")

token_d_text = "книга"
similarity_score_3 = calculate_similarity_between_tokens(token_a_text, token_d_text, nlp_spacy)
print(f"Similarity between '{token_a_text}' and '{token_d_text}': {similarity_score_3:.4f}")

# 3. Demonstrate finding most similar tokens from a predefined list
target_word_semantic = "обработка"
# Use unique, lemmatized, and stopword-removed tokens from the sample text as candidates, or a custom list
lemmatized_semantic = lemmatize_tokens(raw_tokens_semantic)
processed_tokens_semantic = remove_stopwords(lemmatized_semantic) # These are already lowercase
# set() removes duplicates but discards order, so the printed candidate order may vary between runs.
candidate_list_semantic = list(set(processed_tokens_semantic)) # Unique tokens from the sample text
print(f"\nFinding words similar to '{target_word_semantic}' from candidates: {candidate_list_semantic}")

similar_tokens_list = find_most_similar_from_list(target_word_semantic, candidate_list_semantic, nlp_spacy, top_n=3)

# An empty result means either no usable candidates or no vector for the target.
if similar_tokens_list:
    print(f"Most similar tokens to '{target_word_semantic}':")
    for token, score in similar_tokens_list:
        print(f"- {token}: {score:.4f}")
else:
    print(f"Could not find similar tokens for '{target_word_semantic}' from the list or it has no vector.")

print("\nNote: Similarity scores depend heavily on the quality of word vectors in the loaded spaCy model.")
print("Using 'ru_core_news_md' or 'ru_core_news_lg' would likely provide more nuanced results.")

## Neural Network for Term Extraction (BiLSTM Model)

This section outlines the basic structure for a Bidirectional LSTM model using TensorFlow/Keras for sequence tagging, which is a common approach for term extraction. This is a placeholder and requires real annotated data for training and meaningful results.

In [None]:
# --- Imports and Setup for Neural Network ---
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# --- Constants and Mappings (Placeholders) ---
# Hyperparameters shared by the data-preparation, model-building and prediction code below.
MAX_SEQ_LEN = 50      # Max length of a sequence (sentence)
EMBEDDING_DIM = 100   # Dimension of word embeddings
LSTM_UNITS = 64       # Number of units in LSTM layer
VOCAB_SIZE = 1000     # Placeholder only: overwritten below with len(word_to_idx_nn)
NUM_TAGS = 3          # Example: 0 for O (Outside), 1 for B-TERM (Begin), 2 for I-TERM (Inside)

# Dummy mappings (replace with actual mappings from your data)
# Index 0 is reserved for '<pad>' and index 1 for '<UNK>'; real words start at index 2.
word_to_idx_nn = {'<pad>': 0, '<UNK>': 1, 'слово': 2, 'термин': 3, 'анализ': 4, 'данные': 5, 'текст': 6} # Extended for dummy data + UNK
tag_to_idx_nn = {'O': 0, 'B-TERM': 1, 'I-TERM': 2}
# Reverse lookup (tag index -> tag label), used when decoding predictions.
idx_to_tag_nn = {v: k for k, v in tag_to_idx_nn.items()}

# Update VOCAB_SIZE based on the dummy mapping for this example
VOCAB_SIZE = len(word_to_idx_nn)

# --- 1. Data Preparation Placeholder Function ---
def prepare_nn_data(texts_tokens_list, tags_list, word_to_idx_map, tag_to_idx_map, max_seq_len, num_tags_val, seed=None):
    """Generates dummy sequences of word indices and tag indices.

    ``texts_tokens_list`` and ``tags_list`` are currently ignored; the data is
    random. In a real scenario:
    1. Convert lists of tokens (texts_tokens_list) to sequences of integers using word_to_idx_map.
    2. Convert lists of tags (tags_list) to sequences of integers using tag_to_idx_map.
    3. Pad both token and tag sequences to max_seq_len.
    4. One-hot encode the tag sequences if using 'categorical_crossentropy' loss.

    Args:
        texts_tokens_list: Placeholder for tokenized sentences (unused).
        tags_list: Placeholder for tag sequences (unused).
        word_to_idx_map (dict): Word -> index map; indices 0 and 1 are assumed
            to be '<pad>' and '<UNK>', so random words are drawn from index 2 upward.
        tag_to_idx_map (dict): Tag -> index map; the 'O' entry is used for padding.
        max_seq_len (int): Length every sequence is padded to.
        num_tags_val (int): Number of tag classes.
        seed (int | None): When given, a private RandomState seeded with this
            value is used so the dummy data is reproducible. When None, the
            global NumPy random state is used, as before.

    Returns:
        tuple: ``(X_padded, y_padded_one_hot)`` with shapes
        (samples, max_seq_len) and (samples, max_seq_len, num_tags_val).
    """
    print("\n--- Generating Dummy Data for NN --- ")
    num_samples = 5 # Number of dummy samples to generate

    # Random unpadded lengths normally range over [5, max_seq_len - 5]. Clamp both
    # bounds so short max_seq_len values (< 10) still give a valid, non-empty range
    # instead of making randint raise ValueError.
    max_len = max(1, max_seq_len - 5)
    min_len = min(5, max_len)

    # np.random and RandomState both expose randint(low, high, size).
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Generate dummy word index sequences, each with its own random length.
    X_dummy_sequences = []
    for _ in range(num_samples):
        seq_len = rng.randint(min_len, max_len + 1)
        # Start from index 2 to avoid generating <pad> and <UNK>.
        sequence = rng.randint(2, len(word_to_idx_map), size=seq_len)
        X_dummy_sequences.append(sequence)

    X_padded = pad_sequences(X_dummy_sequences, maxlen=max_seq_len, padding='post', value=word_to_idx_map.get('<pad>', 0))

    # Generate dummy tag index sequences matching the unpadded length of each X sequence.
    y_dummy_indices_list = []
    for seq in X_dummy_sequences:
        tag_sequence = rng.randint(0, num_tags_val, size=len(seq))
        y_dummy_indices_list.append(tag_sequence)

    y_padded_indices = pad_sequences(y_dummy_indices_list, maxlen=max_seq_len, padding='post', value=tag_to_idx_map.get('O', 0)) # Pad with 'O' tag
    y_padded_one_hot = to_categorical(y_padded_indices, num_classes=num_tags_val)

    print(f"Generated {num_samples} dummy samples.")
    print(f"X_padded shape: {X_padded.shape} (Samples, Max Sequence Length)")
    print(f"y_padded_one_hot shape: {y_padded_one_hot.shape} (Samples, Max Sequence Length, Num Tags)")
    print("Note: This is DUMMY data. Real data requires careful preprocessing and annotation.")
    return X_padded, y_padded_one_hot

# --- 2. Build BiLSTM Model Function ---
def build_bilstm_model(max_seq_len_val, vocab_size_val, embedding_dim_val, lstm_units_val, num_tags_val):
    """Builds and compiles a BiLSTM model for sequence tagging.

    Architecture: Input -> Embedding (index 0 masked) -> Bidirectional LSTM
    (returning the full sequence) -> per-timestep softmax over the tags.

    Args:
        max_seq_len_val (int): Length of each (padded) input sequence.
        vocab_size_val (int): Number of distinct word indices (Embedding input_dim).
        embedding_dim_val (int): Size of each word embedding vector.
        lstm_units_val (int): Units in each direction of the LSTM.
        num_tags_val (int): Number of output tag classes.

    Returns:
        A Keras Model compiled with the adam optimizer, categorical
        cross-entropy loss and an accuracy metric.
    """
    # Input layer: one integer word index per time step.
    input_layer = Input(shape=(max_seq_len_val,))

    # Embedding layer. The sequence length is already fixed by the Input layer,
    # so the Embedding's `input_length` argument (deprecated in Keras 3) is not
    # passed. mask_zero=True makes downstream layers skip positions padded with 0.
    embedding_layer = Embedding(input_dim=vocab_size_val,
                                output_dim=embedding_dim_val,
                                mask_zero=True)(input_layer)

    # BiLSTM layer; return_sequences=True keeps one output per time step.
    bilstm_layer = Bidirectional(LSTM(units=lstm_units_val,
                                      return_sequences=True,
                                      recurrent_dropout=0.1))(embedding_layer)

    # TimeDistributed Dense output layer:
    # applies the same softmax classifier to each time step of the BiLSTM output.
    output_layer = TimeDistributed(Dense(num_tags_val, activation="softmax"))(bilstm_layer)

    # Create the Keras Model
    model = Model(inputs=input_layer, outputs=output_layer)

    # Compile the model (labels are expected one-hot encoded for this loss).
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

# --- 3. Training Placeholder Function ---
def train_nn_model(model, X_train, y_train, epochs_val=1, batch_size_val=2): # Small defaults suit the dummy run
    """Fit ``model`` on the given data and return it.

    When either ``X_train`` or ``y_train`` is None, fitting is skipped and the
    model is returned unchanged. Progress messages are printed either way.
    """
    print("\n--- Training Neural Network Model (Placeholder) ---")
    has_data = X_train is not None and y_train is not None
    if has_data:
        print(f"Starting 'training' for {epochs_val} epoch(s) with batch size {batch_size_val}...")
        # A real run would use more epochs, a validation split, callbacks, etc.
        model.fit(X_train, y_train, epochs=epochs_val, batch_size=batch_size_val, verbose=1)
        print("Placeholder 'training' complete. With real data, this would involve more extensive training.")
    else:
        print("No training data provided. Skipping actual training call.")
    return model

# --- 4. Prediction Function ---
def predict_terms_nn(model, text_sequence_padded_idx, idx_to_tag_map):
    """Predicts tag labels for a given preprocessed text sequence.

    Args:
        model: Object with a Keras-style ``predict(x, verbose=...)`` method, or
            None. When None, a throwaway dummy model is built and 'trained' on
            dummy data so the call still works regardless of cell execution
            order (slow; intended for demonstration only).
        text_sequence_padded_idx: Padded word indices, either a flat sequence
            of length max_seq_len or a batch of shape (1, max_seq_len). Lists
            and NumPy arrays are both accepted.
        idx_to_tag_map (dict): Tag index -> tag label.

    Returns:
        list: One tag label per time step of the FIRST sequence in the batch;
        indices missing from ``idx_to_tag_map`` are reported as 'UNK'.
    """
    if model is None:
        print("NN model not found. Building and 'training' a dummy one for pipeline demonstration.")
        temp_X, temp_y = prepare_nn_data(None, None, word_to_idx_nn, tag_to_idx_nn, MAX_SEQ_LEN, NUM_TAGS)
        prediction_model = build_bilstm_model(MAX_SEQ_LEN, VOCAB_SIZE, EMBEDDING_DIM, LSTM_UNITS, NUM_TAGS)
        # The temporary model stays local; pass a trained model to avoid rebuilding on every call.
        prediction_model = train_nn_model(prediction_model, temp_X, temp_y, epochs_val=1, batch_size_val=1)
    else:
        prediction_model = model

    # Accept plain lists as well as arrays, and make sure the model receives a
    # batch: a flat sequence becomes shape (1, max_seq_len).
    sequence_batch = np.asarray(text_sequence_padded_idx)
    if sequence_batch.ndim == 1:
        sequence_batch = np.expand_dims(sequence_batch, axis=0)

    # Get tag probabilities from the model (verbose=0 keeps pipeline output quiet).
    tag_probabilities = prediction_model.predict(sequence_batch, verbose=0)

    # Pick the most probable tag index at each time step.
    predicted_tag_indices = np.argmax(tag_probabilities, axis=-1)

    # Decode the first sequence only; int() turns NumPy integers into plain dict keys.
    return [idx_to_tag_map.get(int(idx), 'UNK') for idx in predicted_tag_indices[0]]

print("Neural Network (BiLSTM) functions defined.")

### Example Usage of Neural Network Model (BiLSTM)

In [None]:
# --- Example Usage / Demonstration of BiLSTM ---
# End-to-end smoke run on random data: generate -> build -> fit -> predict.
print("\n--- Starting Neural Network (BiLSTM) Demonstration ---")

# 1. Generate Dummy Data
# In a real scenario, 'texts_tokens_list' and 'tags_list' would come from your annotated dataset.
X_train_dummy, y_train_dummy_one_hot = prepare_nn_data(
    texts_tokens_list=None, # Placeholder for actual tokenized sentences
    tags_list=None,         # Placeholder for actual tag sequences
    word_to_idx_map=word_to_idx_nn,
    tag_to_idx_map=tag_to_idx_nn,
    max_seq_len=MAX_SEQ_LEN,
    num_tags_val=NUM_TAGS
)

# 2. Build the Model
print("\n--- Building BiLSTM Model ---")
nn_model = build_bilstm_model(
    max_seq_len_val=MAX_SEQ_LEN, 
    vocab_size_val=VOCAB_SIZE, # Make sure VOCAB_SIZE is updated if word_to_idx_nn changes
    embedding_dim_val=EMBEDDING_DIM, 
    lstm_units_val=LSTM_UNITS, 
    num_tags_val=NUM_TAGS
)

if nn_model:
    nn_model.summary() # Print model architecture
else:
    print("Model building returned None.")

# 3. "Train" the Model (on dummy data for demonstration)
if nn_model:
    nn_model = train_nn_model(nn_model, X_train_dummy, y_train_dummy_one_hot, epochs_val=1, batch_size_val=2)
else:
    print("Skipping training as model was not built.")

# 4. Predict on a Sample Test Sequence (dummy)
# Note: the sample is taken from the training data itself, so this only checks the plumbing.
print("\n--- Predicting with BiLSTM Model (Dummy Example) ---")
if nn_model and X_train_dummy.shape[0] > 0:
    # Take the first dummy sequence from the generated training data as a sample for prediction
    sample_test_sequence_idx = X_train_dummy[0:1] # Slice (not index) keeps the batch axis: shape (1, MAX_SEQ_LEN)
    original_input_tokens_indices = sample_test_sequence_idx[0]

    predicted_tag_labels = predict_terms_nn(nn_model, sample_test_sequence_idx, idx_to_tag_nn)
    
    print(f"\nSample Input (dummy word indices, first 15): {original_input_tokens_indices[:15]}...")
    # For clarity, let's map some input indices back to words (if they exist in our tiny dummy map)
    idx_to_word_nn_rev = {v: k for k,v in word_to_idx_nn.items()} # Reverse map: word index -> word
    sample_input_words = [idx_to_word_nn_rev.get(idx, '?') for idx in original_input_tokens_indices[:15]]
    print(f"Sample Input (dummy words, first 15):       {sample_input_words}...")
    print(f"Predicted Tags (first 15):                {predicted_tag_labels[:15]}...")
    
    # Show actual (dummy) Y for comparison for the first sample
    actual_tags_one_hot = y_train_dummy_one_hot[0]
    # argmax over the last axis turns each one-hot row back into its tag index.
    actual_tags_indices = np.argmax(actual_tags_one_hot, axis=-1)
    actual_labels = [idx_to_tag_nn.get(idx, 'UNK') for idx in actual_tags_indices[:15]]
    print(f"Actual Tags (dummy, first 15):            {actual_labels}...")
else:
    print("Skipping prediction as model or dummy data is not available.")

print("\n--- Neural Network (BiLSTM) Demonstration Complete ---")
print("Reminder: This setup uses DUMMY data and MAPPINGS. For real term extraction, use annotated text data.")

## Glossary Management

In [None]:
# --- Glossary Management ---
glossary = [] # Initialize the global glossary list

# --- Add Term to Glossary ---
def add_to_glossary(term_text, source_method, details_dict=None):
    """Append a term to the global glossary unless it is already present.

    Presence is decided by comparing lowercased term texts. A message is
    printed for both outcomes; a duplicate leaves the glossary untouched.
    """
    global glossary
    wanted = term_text.lower()

    # First existing entry whose term matches case-insensitively, if any.
    existing = next((item for item in glossary if item['term'].lower() == wanted), None)
    if existing is not None:
        print(f"Term '{term_text}' (as '{existing['term']}') already in glossary. (Original source: {existing['source']}). Not adding again.")
        # Possible extension: merge details or record the additional source here.
        return

    details = {} if details_dict is None else details_dict
    glossary.append({'term': term_text, 'source': source_method, 'details': details})
    print(f"Term '{term_text}' added to glossary from {source_method}.")

# --- Display Glossary ---
def display_glossary():
    """Print every glossary entry as a numbered term / source / details block."""
    global glossary
    print("\n--- Glossary ---")
    if not glossary:
        print("The glossary is currently empty.")
        return

    for position, item in enumerate(glossary, start=1):
        # Empty details are shown as a placeholder instead of an empty dict.
        details_text = item['details'] if item['details'] else "N/A"
        print(f"{position}. Term: {item['term']}\n"
              f"   Source: {item['source']}\n"
              f"   Details: {details_text}")

print("Glossary management functions defined.")

### Example Usage of Glossary Management

In [None]:
# --- Testing Glossary Management ---
# Exercises add_to_glossary()/display_glossary(): new terms, case-insensitive
# duplicates, bulk insertion from a list, and a term without details.
print("\n--- Testing Glossary Management ---")

# Re-initialize glossary for clean testing if this cell is run multiple times
glossary = [] 
print("Glossary reset for this test run.")

add_to_glossary("машинное обучение", "manual", {'definition': 'Раздел ИИ, изучающий методы построения алгоритмов, способных обучаться.'})
add_to_glossary("анализ данных", "statistical", {'frequency': 15, 'document_id': 'doc_A01'})
add_to_glossary("нейронная сеть", "NN_placeholder", {'confidence': 0.88, 'layers': 3})
add_to_glossary("Естественный язык", "semantic_similarity", {'related_to': 'обработка текста', 'score': 0.92})

# Attempt to add a duplicate term (case-insensitive)
add_to_glossary("Анализ Данных", "manual") # Should be detected as duplicate
add_to_glossary("нейронная Сеть", "other_method") # Another duplicate test

display_glossary()

print("\n--- Adding terms from a hypothetical list ---")
# Each tuple is (term, source, details), unpacked in the loop below.
hypothetical_terms = [
    ("обработка текста", "statistical", {"score": 0.75, "co_occurrence_with": "алгоритм"}),
    ("ключевое слово", "semantic", {"related_to": "термин", "similarity": 0.85}),
    ("Машинное Обучение", "review", {"notes": "Needs verification by expert"}) # Duplicate check
]

for term, source, details in hypothetical_terms:
    add_to_glossary(term, source, details)

display_glossary()

# Test adding a term with no details
# details_dict is omitted here, so the entry is stored with an empty details dict.
add_to_glossary("Простой Термин", "manual_input")
display_glossary()

## Visualization Functions

In [None]:
# --- Visualization Imports---
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# NLTK FreqDist is used by plot_term_frequencies, ensure it's available
# It's imported in the 'Statistical Analysis Functions' section, so it should be in scope
# from nltk.probability import FreqDist 

# --- Generate Word Cloud ---
def generate_word_cloud(tokens_list, font_path='/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', width=800, height=400, background_color='white'):
    """Generates and displays a word cloud from a list of tokens.

    Args:
        tokens_list (list): Token strings; they are joined with spaces.
        font_path (str): Path to a TrueType font used for rendering. It must
            support Cyrillic for Russian tokens to display correctly.
        width (int): Image width in pixels.
        height (int): Image height in pixels.
        background_color (str): Background colour of the image.

    Returns nothing. Problems are reported with a printed message instead of
    an exception so a notebook run can continue.
    """
    text = " ".join(tokens_list)
    if not text.strip(): # Nothing to draw for empty or whitespace-only input
        print("Cannot generate word cloud from empty text.")
        return

    try:
        wordcloud_obj = WordCloud(width=width, height=height,
                                  background_color=background_color,
                                  font_path=font_path,
                                  collocations=False).generate(text) # collocations=False: individual words only
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud_obj, interpolation='bilinear')
        plt.axis("off")
        plt.show()
    except (OSError, RuntimeError) as e:
        # A missing or unreadable font file surfaces as OSError from the font
        # loader, so it is caught here together with RuntimeError to make sure
        # the font guidance below is actually shown.
        print(f"Error generating word cloud (likely font issue or empty text): {e}")
        print("Please ensure a Cyrillic-supporting font is available at the specified font_path.")
        print("In Colab, DejaVuSans.ttf is usually available. If not, upload a .ttf font and update font_path, or install one system-wide.")
    except Exception as e: # Deliberate best-effort: report anything else and keep the notebook running
        print(f"An unexpected error occurred during word cloud generation: {e}")

# --- Plot Term Frequencies ---
def plot_term_frequencies(fdist, top_n=10, title='Top N Term Frequencies'):
    """Draw a bar chart of the ``top_n`` most frequent terms.

    ``fdist`` may be an NLTK FreqDist or a dictionary of counts. Invalid or
    empty input is reported with a printed message and nothing is plotted.
    """
    if not isinstance(fdist, (FreqDist, dict)):
        print("Invalid input: fdist must be an NLTK FreqDist object or a dictionary.")
        return
    if not fdist:
        print("Frequency distribution is empty. Cannot plot.")
        return

    # Dictionaries are wrapped in a FreqDist so most_common() is available.
    distribution = FreqDist(fdist) if isinstance(fdist, dict) else fdist
    top_items = distribution.most_common(top_n)
    if not top_items:
        print("No common words to plot (possibly after filtering or for low top_n)." )
        return

    # Split the (term, count) pairs into the two bar-chart axes.
    labels = tuple(term for term, _ in top_items)
    counts = tuple(count for _, count in top_items)

    plt.figure(figsize=(10, 6))
    plt.bar(labels, counts, color='skyblue')
    plt.xlabel('Terms', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.title(title, fontsize=14)
    plt.xticks(rotation=45, ha="right", fontsize=10)
    plt.yticks(fontsize=10)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout() # Keeps rotated labels from being clipped or overlapping
    plt.show()

print("Visualization functions (generate_word_cloud, plot_term_frequencies) defined.")

### Example Usage of Visualization Functions

In [None]:
# --- Testing Visualization Functions ---
# Preprocesses one paragraph, then exercises both plotting helpers, including
# their empty-input paths.
print("\n--- Testing Visualization Functions ---")

# Sample paragraph with deliberately repeated words so some terms clearly dominate.
sample_text_viz = "Это пример текста для визуализации частоты слов и облака слов. " \
                  "Повторяем слова: текст, слова, пример. Ещё немного текста про слова и анализ. " \
                  "Визуализация это важный этап анализа данных. Анализ, анализ, анализ!"

print(f"Original text for visualization: {sample_text_viz}")

# Preprocess the text (using existing functions)
tokens_viz = tokenize_text(sample_text_viz)
lemmatized_viz = lemmatize_tokens(tokens_viz)
no_stopwords_viz = remove_stopwords(lemmatized_viz)

print(f"\nProcessed tokens for visualization: {no_stopwords_viz}")

# Everything below, including the empty-input checks, runs only when preprocessing left some tokens.
if no_stopwords_viz:
    # Generate Word Cloud
    print("\nGenerating Word Cloud...")
    generate_word_cloud(no_stopwords_viz) # Using default font_path
    # generate_word_cloud(no_stopwords_viz, font_path='non_existent_font.ttf') # Test font error handling

    # Plot Term Frequencies
    print("\nPlotting Term Frequencies...")
    fdist_viz = calculate_tf(no_stopwords_viz) # Assumes calculate_tf is defined
    plot_term_frequencies(fdist_viz, top_n=7, title="Top 7 Frequent Terms in Sample Text")
    
    # Test with an empty list of tokens for word cloud
    # Expected to print a message and return without drawing.
    print("\nTesting word cloud with empty token list...")
    generate_word_cloud([])
    
    # Test with empty FreqDist for bar plot
    # Expected to print a message and return without drawing.
    print("\nTesting frequency plot with empty FreqDist...")
    plot_term_frequencies(FreqDist())
else:
    print("No tokens to visualize after preprocessing.")

## End-to-End Term Extraction Pipeline

In [None]:
# --- End-to-End Term Extraction Pipeline Function ---
def run_term_extraction_pipeline(raw_russian_text):
    """Run the full term-extraction workflow on one text, printing every stage.

    Stages, in the order they are printed:
      1. Preprocessing (tokenize, lemmatize, remove stopwords).
      2. Morphological analysis of the first five filtered tokens.
      3. Term-frequency statistics; the top three terms go into the glossary.
      4. Semantic-similarity demonstration based on the first filtered token.
      5. Simulated neural-network tagging; at most the first two tokens tagged
         B-TERM/I-TERM go into the glossary.
      6. Glossary display.
      7. Word cloud and term-frequency plot.

    Args:
        raw_russian_text: The raw text to run through the pipeline.

    Returns:
        None. Results are printed/plotted and recorded via ``add_to_glossary``.
        The function returns early (after printing a message) when no tokens
        survive preprocessing.

    Side effects:
        Rebinds the module-level ``glossary`` to a fresh empty list on every
        call, and binds the module-level ``nn_model`` when it is missing or None.

    Relies on names defined in other cells: the preprocessing helpers,
    ``get_morphological_analysis``, ``calculate_tf``, ``display_most_frequent``,
    ``add_to_glossary``, ``display_glossary``, the similarity helpers with
    ``nlp_spacy``, ``pad_sequences``, ``predict_terms_nn``, the NN mappings and
    constants (``word_to_idx_nn``, ``idx_to_tag_nn``, ``MAX_SEQ_LEN``), the two
    visualization functions, and - only for the fallback model -
    ``prepare_nn_data``, ``build_bilstm_model``, ``train_nn_model``,
    ``tag_to_idx_nn``, ``VOCAB_SIZE``, ``EMBEDDING_DIM``, ``LSTM_UNITS``, ``NUM_TAGS``.
    """
    global glossary  # rebound below so every run starts from an empty glossary
    global nn_model  # may be assigned by the fallback in the NN stage

    print("--- Starting Term Extraction Pipeline ---")
    print(f"Original Text:\n{raw_russian_text}\n")

    # Setup: start this run with an empty glossary.
    glossary = []
    print("Glossary reset for this pipeline run.")

    # Stage 1: preprocessing
    print("\n--- 1. Preprocessing Stage ---")
    tokens = tokenize_text(raw_russian_text)
    print(f"Tokens (first 10): {tokens[:10]}...")
    lemmatized_tokens = lemmatize_tokens(tokens)
    print(f"Lemmatized Tokens (first 10): {lemmatized_tokens[:10]}...")
    filtered_tokens = remove_stopwords(lemmatized_tokens)
    print(f"Filtered Tokens (first 10): {filtered_tokens[:10]}...")

    # Every later stage needs at least one token, so stop here if none remain.
    if not filtered_tokens:
        print("No tokens remaining after preprocessing. Pipeline cannot continue meaningfully.")
        return

    # Stage 2: morphological analysis (only a small sample to keep output short)
    print("\n--- 2. Morphological Analysis (First 5 Filtered Tokens) ---")
    morph_analysis_results = get_morphological_analysis(filtered_tokens[:5])
    for res in morph_analysis_results:
        print(f"Token: {res['token']}, Tag: {res['tag']}")

    # Stage 3: statistical analysis (term frequency)
    print("\n--- 3. Statistical Analysis ---")
    fdist = calculate_tf(filtered_tokens)
    display_most_frequent(fdist, top_n=5)
    print("Adding top 3 frequent terms to glossary:")
    for term, freq in fdist.most_common(3):
        add_to_glossary(term, "statistical (TF)", {'frequency': freq})

    # Stage 4: semantic analysis (demonstration)
    print("\n--- 4. Semantic Analysis (Demonstration) ---")
    if len(filtered_tokens) >= 2:
        token1_sem = filtered_tokens[0]
        # Fixed comparison words: one expected to be related, one unrelated.
        related_word_example = "данные" # (data)
        unrelated_word_example = "кошка" # (cat)

        print(f"Calculating similarity for '{token1_sem}':")
        sim1 = calculate_similarity_between_tokens(token1_sem, related_word_example, nlp_spacy)
        print(f"  Similarity between '{token1_sem}' and '{related_word_example}': {sim1:.4f}")
        sim2 = calculate_similarity_between_tokens(token1_sem, unrelated_word_example, nlp_spacy)
        print(f"  Similarity between '{token1_sem}' and '{unrelated_word_example}': {sim2:.4f}")

        # Candidates: the next few tokens, excluding the target itself.
        # dict.fromkeys removes repeats while keeping first-seen order, so the
        # same word is not scored (and listed) more than once.
        candidate_list_sem = list(dict.fromkeys(
            t for t in filtered_tokens[1:6] if t != token1_sem
        ))
        if candidate_list_sem:
            print(f"Finding terms similar to '{token1_sem}' from candidates: {candidate_list_sem}")
            similar_list = find_most_similar_from_list(token1_sem, candidate_list_sem, nlp_spacy, top_n=2)
            if similar_list:
                for term, score in similar_list:
                    print(f"  - {term}: {score:.4f}")
            else:
                print("  No similar tokens found in the candidate list or target has no vector.")
        else:
            print("  Not enough unique candidate tokens for 'find_most_similar_from_list' demo.")
    else:
        print("Not enough filtered tokens for semantic analysis demonstration.")

    # Stage 5: neural network term extraction (simulated)
    print("\n--- 5. Neural Network Term Extraction (SIMULATED) ---")
    print("NOTE: The following NN results are from an UNTRAINED model using DUMMY mappings & data.")
    print("This is for demonstrating pipeline flow ONLY.")

    # Map tokens to vocabulary indices; unknown words fall back to '<UNK>'.
    unk_index = word_to_idx_nn['<UNK>']
    indexed_tokens = [word_to_idx_nn.get(token, unk_index) for token in filtered_tokens]
    # truncating='post' keeps the FIRST MAX_SEQ_LEN tokens when the text is too
    # long. Assuming this is Keras' pad_sequences, its default is 'pre', which
    # would drop the beginning of the text and make predicted tag i refer to a
    # different token than filtered_tokens[i] in the pairing loop below.
    padded_sequence = pad_sequences(
        [indexed_tokens],
        maxlen=MAX_SEQ_LEN,
        padding='post',
        truncating='post',
        value=word_to_idx_nn['<pad>'],
    )

    # Fallback so the pipeline still runs when the NN example cell was skipped:
    # build and train a throwaway model from the notebook's global constants.
    if globals().get('nn_model') is None:
        print("NN model is not available. Building and training a dummy one for this run...")
        temp_X, temp_y = prepare_nn_data(None, None, word_to_idx_nn, tag_to_idx_nn, MAX_SEQ_LEN, NUM_TAGS)
        nn_model = build_bilstm_model(MAX_SEQ_LEN, VOCAB_SIZE, EMBEDDING_DIM, LSTM_UNITS, NUM_TAGS)
        nn_model = train_nn_model(nn_model, temp_X, temp_y, epochs_val=1, batch_size_val=1)
        print("Dummy NN model prepared.")

    predicted_nn_tags = predict_terms_nn(nn_model, padded_sequence, idx_to_tag_nn)

    print("Hypothetically extracted NN terms (B-TERM or I-TERM predictions):")
    nn_term_count = 0  # number of NN terms added to the glossary (capped at 2)
    # zip stops at the shorter sequence, so tokens beyond the predicted tags
    # (or tags beyond the tokens) are ignored rather than indexed out of range.
    for token_text, tag in zip(filtered_tokens, predicted_nn_tags):
        if tag in ('B-TERM', 'I-TERM'):
            print(f"  - Term: '{token_text}', Predicted Tag: {tag}")
            if nn_term_count < 2: # Only the first two hypothetical terms go to the glossary
                add_to_glossary(token_text, "NN (simulated)", {'predicted_tag': tag, 'confidence': 'N/A (dummy)'})
                nn_term_count += 1
    if nn_term_count == 0:
        print("  No terms tagged as B-TERM or I-TERM by the simulated NN.")

    # Stage 6: glossary display
    print("\n--- 6. Final Glossary ---")
    display_glossary()

    # Stage 7: visualizations (filtered_tokens is guaranteed non-empty here
    # because of the early return after preprocessing)
    print("\n--- 7. Visualizations ---")
    print("Generating Word Cloud for filtered tokens...")
    generate_word_cloud(filtered_tokens)
    print("Plotting Term Frequencies for filtered tokens...")
    plot_term_frequencies(fdist, top_n=10, title="Top 10 Frequent Terms in Processed Text")

    print("\n--- Term Extraction Pipeline Complete ---")

print("End-to-end term extraction pipeline function defined.")

### Example Usage of the Full Pipeline

Before running the pipeline example, ensure that the necessary global variables and models from previous sections are initialized. This includes:
- `morph` (Pymorphy2 MorphAnalyzer)
- `nlp_spacy` (spaCy Russian model)
- `word_to_idx_nn`, `tag_to_idx_nn`, `idx_to_tag_nn`, `MAX_SEQ_LEN`, `EMBEDDING_DIM`, `LSTM_UNITS`, `VOCAB_SIZE`, `NUM_TAGS` (for the NN part)
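- The `nn_model` itself (the dummy BiLSTM model built in the NN example section). If `nn_model` is missing or `None`, the pipeline falls back to building and training a dummy model on the spot; that fallback still needs `prepare_nn_data`, `build_bilstm_model`, and `train_nn_model` from the NN section to be defined, so it is better to run the NN example cell first.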

The `glossary` list will be reset at the beginning of each pipeline run.

In [None]:
# --- Example: Running the Full Term Extraction Pipeline ---
print("\n--- Example: Full Term Extraction Pipeline Run ---")

# Pre-flight check: warn (without stopping) about notebook-level names the pipeline
# reads that have not been created yet. 'tag_to_idx_nn' is included because the
# pipeline's fallback model builder passes it to prepare_nn_data when 'nn_model'
# is unavailable.
print("Checking for required global variables and models...")
required_globals = ['morph', 'nlp_spacy', 'word_to_idx_nn', 'tag_to_idx_nn', 'idx_to_tag_nn', 'MAX_SEQ_LEN',
                    'EMBEDDING_DIM', 'LSTM_UNITS', 'VOCAB_SIZE', 'NUM_TAGS', 'nn_model', 'glossary']
for var_name in required_globals:
    if var_name not in globals():
        print(f"Warning: Global variable or model '{var_name}' may not be initialized. Please run previous cells.")
    elif var_name == 'nn_model' and globals().get('nn_model') is None:
        # The name exists but holds no model; the pipeline handles this case itself.
        print("Warning: Global 'nn_model' is None. Pipeline will attempt to build a dummy one.")
print("Checks complete.")

# Longer sample: adjacent string literals inside the parentheses are joined
# into a single paragraph.
sample_russian_paragraph = (
    "Современные методы анализа данных играют ключевую роль в научных исследованиях и бизнес-аналитике. "
    "Машинное обучение, как одно из направлений искусственного интеллекта, предлагает мощные инструменты для обработки больших объемов информации. "
    "Нейронные сети и глубокое обучение позволяют выявлять сложные закономерности в данных. "
    "Для эффективного анализа текста важно использовать качественную предобработку, включая лемматизацию и удаление стоп-слов. "
    "Результаты такого анализа могут быть представлены в виде облака слов или частотных диаграмм."
)

run_term_extraction_pipeline(sample_russian_paragraph)

# Second run on a short text: the pipeline resets the glossary at the start of
# each call, so entries from the first run should not carry over.
print("\n--- Second Pipeline Run with Different Text (Demonstrating Glossary Reset) ---")
sample_short_text = "Искусственный интеллект это технология будущего. Анализ текста важен."
run_term_extraction_pipeline(sample_short_text)