In [None]:
!pip install spacy



In [None]:
# Fetch spaCy's small English pipeline so that spacy.load("en_core_web_sm") works.
# As the command's own output notes, the kernel/runtime may need a restart afterwards
# before the freshly installed package can be loaded.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

In [None]:
# Load the small English pipeline installed by the download cell above.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Three-sentence sample text shared by the spaCy and NLTK examples below.
paragraph = (
    "Big data is transforming industries across the globe. "
    "It allows companies to gain insights from massive datasets. "
    "This leads to better decision-making and innovation."
)

In [None]:
# Run the spaCy pipeline on the paragraph; the cells below read sentence spans
# from doc.sents and iterate each span token by token.
doc = nlp(paragraph)

In [None]:

# Collect the plain text of every sentence span spaCy found in the document.
sentences = [span.text for span in doc.sents]

In [None]:

# Print each sentence on its own line, numbered from 1.
for number, sentence_text in enumerate(sentences, start=1):
    print(f"Sentence {number}: {sentence_text}")

Sentence 1: Big data is transforming industries across the globe.
Sentence 2: It allows companies to gain insights from massive datasets.
Sentence 3: This leads to better decision-making and innovation.


In [None]:
# Walk spaCy's sentence spans and list the tokens of each one.
for number, span in enumerate(doc.sents, start=1):
    print(f"\nSentence {number}: {span.text.strip()}")
    print("Tokens:")
    # Iterating a sentence span yields its tokens in order.
    for tok in span:
        print(f"  - {tok.text}")


Sentence 1: Big data is transforming industries across the globe.
Tokens:
  - Big
  - data
  - is
  - transforming
  - industries
  - across
  - the
  - globe
  - .

Sentence 2: It allows companies to gain insights from massive datasets.
Tokens:
  - It
  - allows
  - companies
  - to
  - gain
  - insights
  - from
  - massive
  - datasets
  - .

Sentence 3: This leads to better decision-making and innovation.
Tokens:
  - This
  - leads
  - to
  - better
  - decision
  - -
  - making
  - and
  - innovation
  - .


In [None]:
!pip install nltk



In [None]:
import nltk

In [None]:
# Download the 'punkt_tab' tokenizer data (unzipped under tokenizers/ per the log);
# presumably this is what nltk.sent_tokenize / nltk.word_tokenize rely on below.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Split the same paragraph with NLTK. NOTE: this rebinds `sentences`, replacing
# the spaCy-derived list built earlier in the notebook.
sentences = nltk.sent_tokenize(paragraph)

In [None]:
# Same per-sentence report as the spaCy loop, this time using NLTK's tokenizers.
for number, sentence_text in enumerate(sentences, start=1):
    print(f"\nSentence {number}: {sentence_text.strip()}")
    print("Tokens:")
    # word_tokenize returns words and punctuation marks as separate tokens
    for piece in nltk.word_tokenize(sentence_text):
        print(f"  - {piece}")


Sentence 1: Big data is transforming industries across the globe.
Tokens:
  - Big
  - data
  - is
  - transforming
  - industries
  - across
  - the
  - globe
  - .

Sentence 2: It allows companies to gain insights from massive datasets.
Tokens:
  - It
  - allows
  - companies
  - to
  - gain
  - insights
  - from
  - massive
  - datasets
  - .

Sentence 3: This leads to better decision-making and innovation.
Tokens:
  - This
  - leads
  - to
  - better
  - decision-making
  - and
  - innovation
  - .


In [None]:
from nltk.tokenize import word_tokenize

In [None]:
def character_tokenize_with_word_tokenize(text, use_word_preprocess=True):
    """
    Split a string into a list of single-character tokens.

    When preprocessing is enabled, the text is first passed through NLTK's
    word_tokenize and the resulting tokens are re-joined with single spaces.
    The characters returned are then those of the re-joined string, which can
    differ from the raw input (for example, a space appears between a word and
    the punctuation mark that followed it).

    Args:
        text (str): The input string to tokenize into characters.
        use_word_preprocess (bool): If True (the default), normalize the text
                                    with word_tokenize before splitting it.

    Returns:
        list: One single-character string per character, spaces included.
    """
    # Without preprocessing the raw input is split as-is
    if not use_word_preprocess:
        return list(text)

    # Tokenize into words/punctuation, then glue the pieces back with spaces
    rejoined = ' '.join(word_tokenize(text))
    return list(rejoined)

# Demo: character-tokenize a short sentence, with NLTK word preprocessing enabled
text = "Big data is amazing!"
char_tokens = character_tokenize_with_word_tokenize(text, use_word_preprocess=True)

# Show the input, then one numbered line per character token
print("Input Text:", text)
print("Character Tokens:")
for position, character in enumerate(char_tokens, start=1):
    print(f"  - Token {position}: '{character}'")

Input Text: Big data is amazing!
Character Tokens:
  - Token 1: 'B'
  - Token 2: 'i'
  - Token 3: 'g'
  - Token 4: ' '
  - Token 5: 'd'
  - Token 6: 'a'
  - Token 7: 't'
  - Token 8: 'a'
  - Token 9: ' '
  - Token 10: 'i'
  - Token 11: 's'
  - Token 12: ' '
  - Token 13: 'a'
  - Token 14: 'm'
  - Token 15: 'a'
  - Token 16: 'z'
  - Token 17: 'i'
  - Token 18: 'n'
  - Token 19: 'g'
  - Token 20: ' '
  - Token 21: '!'


In [None]:
!pip install tokenizers



In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import os

In [None]:
def build_bpe_tokenizer(corpus_file, vocab_size=5000, output_path="bpe_tokenizer"):
    """
    Build, train, and save a Byte Pair Encoding (BPE) tokenizer using the Hugging Face tokenizers library.

    Args:
        corpus_file (str): Path to the text file used for training the tokenizer.
        vocab_size (int): Target vocabulary size handed to BpeTrainer. The special
                          tokens listed below are part of that vocabulary.
        output_path (str): Directory in which `tokenizer.json` is written. It is
                           created (including parents) if it does not exist yet.

    Returns:
        Tokenizer: Trained BPE tokenizer.

    Note:
        No decoder is configured on the tokenizer, so `decode()` joins sub-word
        tokens with spaces rather than reassembling the original words.
    """
    # BPE model; pieces it cannot represent are mapped to "[UNK]"
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    # Pre-tokenize with the Whitespace pre-tokenizer before BPE is applied
    tokenizer.pre_tokenizer = Whitespace()

    # Trainer with the desired vocabulary size and the reserved special tokens
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        show_progress=True,
        special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    )

    # Train the tokenizer on the provided corpus
    tokenizer.train(files=[corpus_file], trainer=trainer)

    # exist_ok=True replaces the separate exists() check, so there is no window
    # between checking for the directory and creating it
    os.makedirs(output_path, exist_ok=True)
    tokenizer.save(os.path.join(output_path, "tokenizer.json"))

    return tokenizer

In [None]:
def main():
    """
    Train a small demo BPE tokenizer and print an encode/decode round trip.

    The sample corpus is written to `sample_corpus.txt` in the current working
    directory, used for training, and removed again afterwards, even if
    training or encoding raises. The trained tokenizer is saved by
    build_bpe_tokenizer under the `bpe_tokenizer` directory.
    """
    # Step 1: Prepare a sample corpus (replace with your own dataset)
    sample_text = """
    Big data refers to extremely large data sets that may be analyzed computationally to reveal patterns,
    trends, and associations, especially relating to human behavior and interactions. It includes data streams,
    static archives, structured and unstructured data, and comes from many sources like social media, sensors,
    and transaction logs. Handling big data at scale requires specialized tools and architectures.
    """

    corpus_file = "sample_corpus.txt"
    try:
        # Save sample text to a temporary file for training
        with open(corpus_file, "w", encoding="utf-8") as f:
            f.write(sample_text)

        # Step 2: Build and train the BPE tokenizer
        vocab_size = 100  # Small vocab size for demo; increase for real applications
        tokenizer = build_bpe_tokenizer(corpus_file, vocab_size=vocab_size, output_path="bpe_tokenizer")

        # Step 3: Demonstrate tokenization
        test_text = "Big data is amazing for analyzing trends!"
        encoded = tokenizer.encode(test_text)

        print("\nOriginal Text:", test_text)
        print("Token IDs:", encoded.ids)
        print("Tokens:", encoded.tokens)

        # Step 4: Demonstrate decoding
        decoded_text = tokenizer.decode(encoded.ids)
        print("Decoded Text:", decoded_text)
    finally:
        # Always clean up the temporary file, including when a step above fails;
        # the exists() check covers the case where open() itself failed
        if os.path.exists(corpus_file):
            os.remove(corpus_file)

In [None]:
# Entry-point guard: the condition holds when this cell runs in the notebook
# (main() produced the output below), and it keeps main() from running if the
# code is imported as a module instead.
if __name__ == "__main__":
    main()


Original Text: Big data is amazing for analyzing trends!
Token IDs: [78, 44, 18, 27, 81, 10, 33, 72, 15, 23, 26, 35, 37, 32, 33, 72, 68, 22, 89, 0]
Tokens: ['Big', 'data', 'i', 's', 'am', 'a', 'z', 'ing', 'f', 'o', 'r', 'an', 'al', 'y', 'z', 'ing', 'tre', 'n', 'ds', '[UNK]']
Decoded Text: Big data i s am a z ing f o r an al y z ing tre n ds
