# Chunking

# THIS BELOW CODE WILL RUN IN JUPYTER NOTEBOOK

import nltk
from nltk import pos_tag, word_tokenize, RegexParser

 # Download necessery NLTK data files (only need to this once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# sample text
text = "Full stack datascience, generative ai, agentic ai, llm model keep increse by different company"

# Tokenize the text
tokens = word_tokenize(text)

# Perform part-of-speech tagging
tagged_tokens = pos_tag(tokens)

# Define a chunk grammr
chunk_grammar = r"""
    NP: {<DT>?<JJ>*<NN>}  # Noun phrase
    VP: {<VB.*><NP|PP|CLAUSE>*}  # Verb phrase
    PP: {<IN><NP>}  # Prepositional phrase
"""

# Create a chunk parser
chunk_parser = RegexParser(chunk_grammar)

# Parse the tagged tokens
chunked = chunk_parser.parse(tagged_tokens)

# Print the chunked output
print(chunked)

# optinally , you cam visulize the chunks
chunked.draw()



In [9]:
# Install NLTK (only needed if not pre-installed)
!pip install nltk --quiet

import nltk
from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')  # For newer NLTK versions

# Sample text
text = "Full stack datascience, generative ai, agentic ai, llm model keep increase by different company"

# Tokenize the text
tokens = word_tokenize(text)

# POS tagging
tagged_tokens = pos_tag(tokens)

# Display POS-tagged tokens
print("POS Tagged Tokens:")
print(tagged_tokens)

# Define chunk grammar
chunk_grammar = r"""
    NP: {<DT>?<JJ>*<NN.*>}        # Noun Phrase
    VP: {<VB.*><NP|PP|CLAUSE>*}   # Verb Phrase
    PP: {<IN><NP>}                # Prepositional Phrase
"""

# Create a chunk parser
chunk_parser = RegexpParser(chunk_grammar)

# Parse the tagged tokens
chunked = chunk_parser.parse(tagged_tokens)

# Display parsed chunks
print("\nChunked Structure:")
print(chunked)

# NOTE: chunked.draw() won't work in Colab since it requires a GUI.
# To visualize chunks in Colab, you'd need to extract them programmatically instead.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


POS Tagged Tokens:
[('Full', 'NNP'), ('stack', 'NN'), ('datascience', 'NN'), (',', ','), ('generative', 'JJ'), ('ai', 'NN'), (',', ','), ('agentic', 'JJ'), ('ai', 'NN'), (',', ','), ('llm', 'JJ'), ('model', 'NN'), ('keep', 'VB'), ('increase', 'NN'), ('by', 'IN'), ('different', 'JJ'), ('company', 'NN')]

Chunked Structure:
(S
  (NP Full/NNP)
  (NP stack/NN)
  (NP datascience/NN)
  ,/,
  (NP generative/JJ ai/NN)
  ,/,
  (NP agentic/JJ ai/NN)
  ,/,
  (NP llm/JJ model/NN)
  (VP keep/VB (NP increase/NN))
  (PP by/IN (NP different/JJ company/NN)))


# LLM in chunking

In [16]:
!pip install transformers



In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load a pre-trained model and tokenizer
model_name = "gpt2"  # You can replace with any other LLM
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set pad_token_id to eos_token_id to avoid a warning
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id


def chunk_text(text, max_length=512):
    """Chunk text into smaller pieces."""
    tokens = tokenizer.encode(text, return_tensors='pt')[0]
    chunks = []

    for i in range(0, len(tokens), max_length):
        chunk = tokens[i:i + max_length]
        chunks.append(chunk)

    return chunks

def generate_responses(chunks):
    """Generate responses for each chunk using the LLM."""
    responses = []
    for chunk in chunks:
        input_ids = chunk.unsqueeze(0)  # Add batch dimension
        # Use max_new_tokens instead of max_length to control the length of the generated response
        output = model.generate(input_ids, max_new_tokens=100)  # Generate response
        responses.append(tokenizer.decode(output[0], skip_special_tokens=True))

    return responses

# Example long text
long_text = "Your long text goes here. " * 50  # Repeat to simulate long text

# Chunk the text
chunks = chunk_text(long_text)

# Generate responses for each chunk
responses = generate_responses(chunks)

# Print the responses
for i, response in enumerate(responses):
    print(f"Response for chunk {i+1}:\n{response}\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response for chunk 1:
Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text goes here. Your long text g