Research Domain: EdTech / Learning Tools
The EdTech domain focuses on leveraging natural language processing to enhance digital learning tools by analyzing educational content, personalizing student experiences, and improving instructional feedback through automated text processing techniques.


In [None]:
!pip install --quiet spacy sentencepiece
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
import re, sentencepiece as spm
import spacy
nlp = spacy.load("en_core_web_sm")
from nltk.tokenize import word_tokenize, regexp_tokenize, TreebankWordTokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# 1. Whitespace Tokenization
# Baseline approach: split on runs of whitespace only. Punctuation is not
# separated, so the last token keeps its trailing period.
text = "LearnWise helps students learn better through personalized content."
tokens = str.split(text)
print("Whitespace Tokens:", tokens)


Whitespace Tokens: ['LearnWise', 'helps', 'students', 'learn', 'better', 'through', 'personalized', 'content.']


In [None]:
# 2. Regex Tokenization
# The pattern's alternatives are tried left to right: a run of word
# characters, a "$" followed by digits/dots, or any other run of
# non-whitespace characters (which is what isolates the punctuation).
text = "LearnWise adapts quickly: fast, smart, and scalable!"
token_pattern = r'\w+|\$[\d\.]+|\S+'
tokens = regexp_tokenize(text, pattern=token_pattern)
print("Regex Tokens:", tokens)


Regex Tokens: ['LearnWise', 'adapts', 'quickly', ':', 'fast', ',', 'smart', ',', 'and', 'scalable', '!']


In [None]:
# 3. NLTK Word Tokenizer
# word_tokenize splits the sentence-final period into its own token,
# unlike the plain whitespace split.
text = "LearnWise offers adaptive learning for every student."
tokens = word_tokenize(text, language="english")
print("Word Tokens:", tokens)


Word Tokens: ['LearnWise', 'offers', 'adaptive', 'learning', 'for', 'every', 'student', '.']


In [None]:
# 4. spaCy Tokenizer
# Run the loaded spaCy pipeline over the sentence, then keep only the
# surface text of each token in the resulting document.
text = "LearnWise provides rich insights from learner activity."
doc = nlp(text)
tokens = [tok.text for tok in doc]
print("spaCy Tokens:", tokens)


spaCy Tokens: ['LearnWise', 'provides', 'rich', 'insights', 'from', 'learner', 'activity', '.']


In [None]:
# 5. Treebank Tokenizer
# The sample sentence contains a plural possessive ("Students'") and a
# singular possessive ("LearnWise's") to show how apostrophes are split.
text = "Students' feedback improves LearnWise's tools."
treebank = TreebankWordTokenizer()
tokens = treebank.tokenize(text)
print("Treebank Tokens:", tokens)


Treebank Tokens: ['Students', "'", 'feedback', 'improves', 'LearnWise', "'s", 'tools', '.']


In [None]:
# 6. WordPiece-style Tokenization (simplified)
# NOTE: this is a fixed-position split, not a learned WordPiece vocabulary.
# Words longer than 6 characters are cut after the 4th character and the
# remainder is marked with the "##" continuation prefix.
def _split_word(word):
    """Return ``[word]``, or ``[head, "##" + tail]`` when *word* is long."""
    if len(word) <= 6:
        return [word]
    return [word[:4], "##" + word[4:]]


text = "LearnWise personalization rocks"
subwords = [piece for word in text.split() for piece in _split_word(word)]
print("WordPiece Tokens:", subwords)


WordPiece Tokens: ['Lear', '##nWise', 'pers', '##onalization', 'rocks']


In [None]:
# 7. Character Tokenization
# Unpack the string into a list: every character becomes its own token.
text = "Learn"
tokens = [*text]
print("Character Tokens:", tokens)


Character Tokens: ['L', 'e', 'a', 'r', 'n']


In [None]:
# 8. SentencePiece Tokenization
# Write a one-sentence training corpus to disk. The encoding is pinned to
# UTF-8 so the bytes on disk do not depend on the platform's default locale
# encoding.
with open("learnwise_data.txt", "w", encoding="utf-8") as f:
    f.write("LearnWise enables adaptive, scalable learning solutions.")


# Train a subword model on that corpus; the model file sp.model is loaded
# right below.
# NOTE(review): vocab_size=28 is presumably kept this small because the
# corpus is a single sentence — confirm before changing it.
spm.SentencePieceTrainer.Train('--input=learnwise_data.txt --model_prefix=sp --vocab_size=28')
sp = spm.SentencePieceProcessor()
sp.load("sp.model")

# Encode a new sentence into subword pieces (strings rather than ids).
text = "LearnWise personalizes learning."
tokens = sp.encode(text, out_type=str)
print("SentencePiece Tokens:", tokens)

SentencePiece Tokens: ['▁', 'L', 'earn', 'W', 'i', 's', 'e', '▁', 'p', 'e', 'r', 's', 'o', 'n', 'a', 'l', 'i', 'z', 'e', 's', '▁', 'l', 'earn', 'i', 'n', 'g', '.']


In [None]:
# 9. Custom Dictionary Tokenization
# Multi-word dictionary terms are glued together with "_" before the
# whitespace split, so each one comes out as a single token.
terms = ["LearnWise", "adaptive learning", "feedback"]
text = "LearnWise supports adaptive learning and real-time feedback."

# Handle the longest terms first.
longest_first = sorted(terms, key=len, reverse=True)
for phrase in longest_first:
    joined = "_".join(phrase.split(" "))
    text = text.replace(phrase, joined)

tokens = text.split()
print("Custom Dict Tokens:", tokens)


Custom Dict Tokens: ['LearnWise', 'supports', 'adaptive_learning', 'and', 'real-time', 'feedback.']


In [None]:
# 10. Hybrid Tokenization
# Three steps: protect dictionary phrases by joining their words with "_",
# strip punctuation, then run NLTK's word tokenizer on what is left.
terms = ["LearnWise", "lesson planner", "AI"]
text = "LearnWise has an AI-powered lesson planner."

# Longest terms first (as in the custom-dictionary example) so a short term
# can never rewrite part of a longer phrase before that phrase is matched.
for t in sorted(terms, key=len, reverse=True):
    text = text.replace(t, t.replace(" ", "_"))

# Drop everything that is not a word character or whitespace. \w already
# covers "_", so the joined phrases survive this step.
# NOTE: the hyphen is removed as well, which fuses "AI-powered" into the
# single token "AIpowered".
text = re.sub(r'[^\w\s]', '', text)
tokens = word_tokenize(text)
print("Hybrid Tokens:", tokens)


Hybrid Tokens: ['LearnWise', 'has', 'an', 'AIpowered', 'lesson_planner']
