# Tokenization Techniques in NLP
Week 2 NLP Pipeline
PBA/ Genap 2025/ Irmasari Hafidz
irma@its.ac.id

## Install Dependencies


In [None]:
!pip install nltk
!pip install spacy sacremoses sentencepiece
! python3 -m spacy download en_core_web_sm

from spacy.lang.en import English
nlp = English()


## Import Required Libraries

In [None]:
import nltk
import spacy
import re
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from sacremoses import MosesTokenizer
import sentencepiece as spm

### White Space Tokenization
*   One of the simplest tokenization techniques: it uses the whitespace within the string as the delimiter between words.
*   Wherever whitespace occurs, the text is split at that point.
*   Can be done with Python's **built-in** `.split()` or with **spaCy**.

```
# sentence.split()
```




In [None]:
# Indonesian example sentence used to demonstrate plain whitespace splitting.
sentence = "Dikutip dari Reuters, dia menyampaikan permintaan maaf sebelum mengundurkan diri karena telah berlibur dengan keluarganya selama 4 minggu tak lama setelah banjir terjadi dan menewaskan lebih dari 100 orang."
# With sep=None (the default) the string is cut on runs of whitespace, so
# punctuation such as the comma in "Reuters," stays attached to its word.
sentence.split(sep=None)

In [None]:
import nltk
from nltk.tokenize import WhitespaceTokenizer

# English example sentence to be split.
text1 = "Computers offer powerful capabilities for searching and reasoning about structured records and relational data"
# Instantiate NLTK's WhitespaceTokenizer class.
wtk = WhitespaceTokenizer()
# tokenize() returns the tokens, which are kept in `tokens` and printed.
tokens = wtk.tokenize(text1)
print(tokens)

#### **Tokenization** using Spacy and NLTK

Word level Tokenization

In [None]:
import spacy

# Indonesian sample sentence; `text` is read again by the NLTK cell that follows.
text = "Menteri Urusan Keluarga di Jerman Anne Spiegel mengundurkan diri setelah kontroversinya berlibur pascabanjir dahsyat melanda Jerman pada 2021"
# NOTE(review): the sample is Indonesian but the English pipeline is loaded,
# so English tokenization rules apply - confirm this is intended.
nlp = spacy.load('en_core_web_sm')

doc = nlp(text)
# Print each token with token.idx (its character offset in `text`) and collect
# the token strings in the same pass.
tokens = []
for token in doc:
    print(token, token.idx)
    tokens.append(token.text)

print("Spacy Word Tokens:", tokens)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Download the 'punkt_tab' resource through NLTK's downloader before tokenizing
# (presumably the data word_tokenize relies on - confirm for the installed NLTK version).
nltk.download('punkt_tab')
# `text` is not assigned in this cell; it is whatever an earlier cell left behind.
tokens = word_tokenize(text)
print("NLTK Word Tokens:", tokens)

In [None]:
#import sent_tokenize from nltk library
from nltk import sent_tokenize
text = "Good morning everyone. Welcome to the AI Workshop. Dr. Carter and Michael T. Anderson are waiting for you. They'll join you shortly."
# Split the paragraph into sentences first, then print the word tokens of each
# sentence on its own line.
for sentence_str in sent_tokenize(text):
    words = word_tokenize(sentence_str)
    print(words)

Character Tokenization

In [None]:
text = "Jerman"
# Iterating over a string yields one character at a time.
characters = [char for char in text]
print("Character Tokens:", characters)

Subword Tokenization

*   Breaks words down into smaller units called subwords.
*   Used to handle unknown or rare words by splitting them into smaller, known pieces.



In [None]:
from transformers import BertTokenizer

# Two English example sentences to split into subwords.
text = "Public opposition to the urbanization of the countryside"
text2 = "Tokenization is essential in NLP"
# Load the tokenizer that ships with the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize both samples, keeping the results under their own names.
subwords, subwords2 = [tokenizer.tokenize(sample) for sample in (text, text2)]

for pieces in (subwords, subwords2):
    print("Bert Subword Tokens:", pieces)

In [None]:
import nltk
import spacy
import re
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from sacremoses import MosesTokenizer
import sentencepiece as spm

# Indonesian sample text shared by the tokenizer comparison below.
text = "Pengunduran diri tersebut dilakukan atas keputusannya sendiri pada 11 April 2022. Spiegel diketahui berlibur setelah bencana banjir terjadi di negara bagian tempat dia menjabat sebagai pejabat senior."

# White Space Tokenization
def whitespace_tokenization(text: str) -> list:
    """Split ``text`` on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        The whitespace-separated pieces of ``text``; punctuation stays
        attached to the neighbouring word.
    """
    pieces = text.split()
    return pieces

# Regular Expression Tokenizer
def regex_tokenization(text):
    r"""Tokenize ``text`` with NLTK's ``RegexpTokenizer`` and the pattern ``\w+``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``RegexpTokenizer(r'\w+').tokenize`` yields for ``text``,
        i.e. the matches of the word-character pattern.
    """
    return RegexpTokenizer(r'\w+').tokenize(text)

# Penn Treebank Tokenization
def penn_treebank_tokenization(text):
    """Tokenize ``text`` with a fresh NLTK ``TreebankWordTokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        The result of ``TreebankWordTokenizer().tokenize(text)``.
    """
    return TreebankWordTokenizer().tokenize(text)

# SpaCy Tokenization
# The pipeline is loaded once at cell level and reused by every call below.
nlp = spacy.load("en_core_web_sm")
def spacy_tokenization(text):
    """Run ``text`` through the module-level ``nlp`` pipeline.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the ``.text`` of every token in the resulting document.
    """
    return [tok.text for tok in nlp(text)]

# Moses Tokenization
# One shared tokenizer instance, created with sacremoses' default settings.
moses_tokenizer = MosesTokenizer()
def moses_tokenization(text):
    """Tokenize ``text`` with the module-level ``moses_tokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        The result of ``moses_tokenizer.tokenize(text)``.
    """
    moses_tokens = moses_tokenizer.tokenize(text)
    return moses_tokens


In [None]:

# Display results: run every tokenizer on the same `text`, one labelled line
# per technique, in the order the helpers were defined.
for label, tokenize_fn in (
    ("White Space Tokenization:", whitespace_tokenization),
    ("Regular Expression Tokenization:", regex_tokenization),
    ("Penn Treebank Tokenization:", penn_treebank_tokenization),
    ("SpaCy Tokenization:", spacy_tokenization),
    ("Moses Tokenization:", moses_tokenization),
):
    print(label, tokenize_fn(text))

## IndoBERT

In [None]:
import torch

# Choose the torch device: CUDA when a GPU is visible, otherwise the CPU.
# (`device` is not read again by the cells visible in this notebook.)
if not torch.cuda.is_available():
    print("No GPU available, using the CPU instead")
    device = torch.device("cpu")
else:
    device = torch.device('cuda')
    # Report how many GPUs are visible and the name of the first one.
    print('there are %d GPU(s) available.' % torch.cuda.device_count())
    print('we will use the GPU: ', torch.cuda.get_device_name(0))

In [None]:
import io

import pandas as pd
from google.colab import files

# Open Colab's upload widget; `uploaded` maps each file name to its raw bytes.
uploaded = files.upload()

# Start from an empty frame so `df_okejek` is always defined, even when the
# upload dialog is cancelled and the loop below never runs.
df_okejek = pd.DataFrame()
if not uploaded:
    print("No file was uploaded; df_okejek is empty.")

for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))

    # Assuming the uploaded file is a CSV.
    # When several files are uploaded, each one overwrites df_okejek, so the
    # last file processed is the one that remains.
    try:
        df_okejek = pd.read_csv(io.BytesIO(uploaded[fn]))
        print("Successfully loaded CSV into df_okejek")
    except pd.errors.ParserError:
        print(f"Error: Could not parse {fn} as a CSV. Please upload a valid CSV file.")
        df_okejek = pd.DataFrame()  # Fall back to an empty DataFrame if parsing fails
    except Exception as e:
        # Best-effort: report the problem and keep the notebook running with an empty frame.
        print(f"An unexpected error occurred: {e}")
        df_okejek = pd.DataFrame()

In [None]:
import transformers
from transformers import AutoTokenizer
# Model identifier handed to from_pretrained for the IndoBERT tokenizer.
INDOBERT_CHECKPOINT = 'indobenchmark/indobert-base-p1'
# Rebinds `tokenizer`, replacing the BERT tokenizer an earlier cell stored under the same name.
tokenizer = AutoTokenizer.from_pretrained(INDOBERT_CHECKPOINT)

In [None]:
# Pull the `content` column out as an array of strings. Bracket access matches
# the later cell that adds the tokens column and cannot be shadowed by a
# DataFrame attribute of the same name.
sentences = df_okejek['content'].astype(str).values

In [None]:
# Tokenize every sentence with the loaded tokenizer. tokenize() returns token
# strings (not numeric ids), hence the "Tokens" label below.
tokens_list = [tokenizer.tokenize(sent) for sent in sentences]

# Show one example. Row 350 is used when the dataset is large enough;
# otherwise fall back to the last row instead of raising IndexError.
if len(sentences) > 0:
    sample_idx = min(350, len(sentences) - 1)
    print("Original: ", sentences[sample_idx])
    print("Tokens: ", tokens_list[sample_idx])
else:
    print("No sentences to tokenize.")

In [None]:
# Add a `tokens` column holding the tokenizer output for each row's `content`
# (cast to str first, as in the original expression).
content_as_text = df_okejek['content'].astype(str)
df_okejek['tokens'] = content_as_text.apply(tokenizer.tokenize)

# Write the frame, including the new column, to disk without the index.
df_okejek.to_csv("tokenized.csv", index=False)