In [1]:
# 1. basic string operations
sentence = "Iowa State University, located in Ames, is a renowned public research university."
print("[example sentence]: " + sentence)
print()

# 1.1 convert to uppercase/lowercase
uppercase = sentence.upper()
lowercase = sentence.lower()
print("Uppercase:", uppercase)
print("Lowercase:", lowercase)
print()

[example sentence]: Iowa State University, located in Ames, is a renowned public research university.

Uppercase: IOWA STATE UNIVERSITY, LOCATED IN AMES, IS A RENOWNED PUBLIC RESEARCH UNIVERSITY.
Lowercase: iowa state university, located in ames, is a renowned public research university.



In [2]:
# 1.2 split into words & join words
words = sentence.split()
print("Words in the sentence:", words)
print()

joined_sentence = " ".join(words)
print("Joined Sentence:", joined_sentence)
print()

Words in the sentence: ['Iowa', 'State', 'University,', 'located', 'in', 'Ames,', 'is', 'a', 'renowned', 'public', 'research', 'university.']

Joined Sentence: Iowa State University, located in Ames, is a renowned public research university.



In [3]:
# 1.3 find substrings & replace substrings
index = sentence.find("Ames")
# returns the index of the first occurrence of the substring.
# if the substring is not found, it returns -1
print(f"'Ames' found at index: {index}")
print()

modified_sentence = sentence.replace("Ames", "Ames, Iowa")
print("Modified Sentence:", modified_sentence)
print()

'Ames' found at index: 34

Modified Sentence: Iowa State University, located in Ames, Iowa, is a renowned public research university.



In [4]:
# 1.4 access characters by Index
first_char = sentence[0]
last_char = sentence[-1]
print(f"First Character: {first_char}")
print(f"Last Character: {last_char}")
print()

substring = sentence[0:21]  # "Iowa State University", the blank space also counts
print("Substring (0:21):", substring)
print()

First Character: I
Last Character: .

Substring (0:21): Iowa State University



In [6]:
# 2. NLTK. "The Natural Language Toolkit"
import nltk
nltk.download('punkt_tab')  # ensure tokenizer resources are available

sentence = "Iowa State University, located in Ames, is a renowned public research university."

# 2.1 Tokenization
# Word
from nltk.tokenize import word_tokenize
tokens = word_tokenize(sentence)
print("Word Tokens:", tokens)
print()

# Sentence
from nltk.tokenize import sent_tokenize
long_introduction = "Iowa State University (ISU), in Ames, Iowa, is a top public research institution founded in 1858. \
Renowned for science, engineering, and agriculture, it hosts the U.S. Department of Energy’s Ames Laboratory. \
With over 36,000 students, ISU fosters innovation and global impact."
sentences = sent_tokenize(long_introduction)
print("Sentence Tokens:", sentences)
print()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Word Tokens: ['Iowa', 'State', 'University', ',', 'located', 'in', 'Ames', ',', 'is', 'a', 'renowned', 'public', 'research', 'university', '.']

Sentence Tokens: ['Iowa State University (ISU), in Ames, Iowa, is a top public research institution founded in 1858.', 'Renowned for science, engineering, and agriculture, it hosts the U.S. Department of Energy’s Ames Laboratory.', 'With over 36,000 students, ISU fosters innovation and global impact.']



In [7]:
# 2.2 Stop Words Removal & Frequency Distribution
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print("NLTK stopwords:", stop_words)
print()
filtered_words = [word for word in word_tokenize(sentence) if word.lower() not in stop_words]
print("Filtered Words (No Stop Words):", filtered_words)
print()

from nltk.probability import FreqDist
filtered_words = [word.lower() for word in filtered_words]
freq_dist = FreqDist(filtered_words)
print("Frequency Distribution:")
print(freq_dist.most_common(5))
print()

NLTK stopwords: {'both', 'hasn', 'yourselves', 'what', 'same', "he's", 'to', "you'll", 'up', 'on', "it'll", 'weren', 'it', "they'd", 'can', 'for', "aren't", 'over', 'down', 'is', 'nor', 'that', 'them', 'him', 'any', "i'm", 'if', 'other', 'his', 'once', 'who', 'not', 'there', "weren't", 't', 'through', 'were', "we'll", 'the', 'so', "won't", "he'll", 'too', "mustn't", 'this', 'wouldn', "that'll", "you've", "shan't", 'd', 'hadn', 'and', "they'll", 'shan', 'hers', 'more', 'itself', 'of', 'when', 'they', 'needn', 'y', 'yourself', 'as', 'has', 'while', 'her', "didn't", "don't", 'few', 'above', 'myself', "she's", 'ourselves', 'didn', 'by', 'from', 'again', 'now', "hadn't", 'further', 's', 'most', 'did', 'because', 'whom', 'between', "you'd", "couldn't", 'theirs', 'before', 'does', 'our', 'isn', 'own', 'ain', 'just', "we've", "i'd", 'those', 'aren', 'my', "he'd", 'ma', 're', 'i', 'was', 'll', 'off', 'ours', "she'll", "needn't", 'with', "you're", "doesn't", 'a', 'under', 'you', 'shouldn', 'o', 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# 3. spaCy
sentence = "Iowa State University, located in Ames, is a renowned public research university."

import spacy
nlp = spacy.load("en_core_web_sm")  # load the small English model
# spaCy also comes with pre-trained models for multiple languages,
# allowing us to get started quickly with real-world text analysis
doc = nlp(sentence)

# 3.1 Tokenization
print("Tokens:")
for token in doc:
    print(token.text)
print()

Tokens:
Iowa
State
University
,
located
in
Ames
,
is
a
renowned
public
research
university
.



In [9]:
# 3.2 Part-of-Speech (POS) Tagging
print("POS Tags:")
for token in doc:
    print(f"{token.text} -> {token.pos_} ({token.tag_})")
# token.pos_: The simplified part-of-speech tag (e.g., NOUN, VERB, etc.).
# token.tag_: The fine-grained POS tag, which provides more specific grammatical details (e.g., VBN for past participle verb).
print()

POS Tags:
Iowa -> PROPN (NNP)
State -> PROPN (NNP)
University -> PROPN (NNP)
, -> PUNCT (,)
located -> VERB (VBN)
in -> ADP (IN)
Ames -> PROPN (NNP)
, -> PUNCT (,)
is -> AUX (VBZ)
a -> DET (DT)
renowned -> ADJ (JJ)
public -> ADJ (JJ)
research -> NOUN (NN)
university -> NOUN (NN)
. -> PUNCT (.)



In [10]:
# 3.3 Named Entity Recognition (NER)
print("Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_} ({spacy.explain(ent.label_)})")
print()

Named Entities:
Iowa State University -> ORG (Companies, agencies, institutions, etc.)
Ames -> GPE (Countries, cities, states)



In [11]:
# 3.4 Dependency Parsing
# Focuses on relationships between words in a sentence.
# Represents sentences as a directed graph where words are nodes, and dependencies (like subject-verb, object-verb) are edges.
print("Dependency Parsing:")
for token in doc:
    print(f"{token.text} -> {token.dep_} (Head: {token.head.text})")
print()

# this is just a simple parser from spaCy
# there are many more advanced parsers, for example, you can try the demo: https://corenlp.run/
# parsing bridges the gap between raw text and its syntactic or semantic meaning, making it essential for advanced language understanding

Dependency Parsing:
Iowa -> compound (Head: University)
State -> compound (Head: University)
University -> nsubj (Head: is)
, -> punct (Head: University)
located -> acl (Head: University)
in -> prep (Head: located)
Ames -> pobj (Head: in)
, -> punct (Head: University)
is -> ROOT (Head: is)
a -> det (Head: university)
renowned -> amod (Head: university)
public -> amod (Head: research)
research -> compound (Head: university)
university -> attr (Head: is)
. -> punct (Head: is)



In [12]:
# 3.5 Sentence Segmentation
long_introduction = "Iowa State University (ISU), in Ames, Iowa, is a top public research institution founded in 1858. \
Renowned for science, engineering, and agriculture, it hosts the U.S. Department of Energy’s Ames Laboratory. \
With over 36,000 students, ISU fosters innovation and global impact."
doc_long = nlp(long_introduction)

print("Sentences:")
for sent in doc_long.sents:
    print(sent.text)
print()

Sentences:
Iowa State University (ISU), in Ames, Iowa, is a top public research institution founded in 1858.
Renowned for science, engineering, and agriculture, it hosts the U.S. Department of Energy’s Ames Laboratory.
With over 36,000 students, ISU fosters innovation and global impact.



In [13]:
# 3.6 Lemmatization
# reduce a word to its base or root form (known as the "lemma")
print("Lemmatized Tokens:")
for token in doc:
    print(f"{token.text} -> {token.lemma_}")
print()

Lemmatized Tokens:
Iowa -> Iowa
State -> State
University -> University
, -> ,
located -> locate
in -> in
Ames -> Ames
, -> ,
is -> be
a -> a
renowned -> renowned
public -> public
research -> research
university -> university
. -> .



In [14]:
# 3.7 Similarity Between Words
word1 = nlp("research")
word2 = nlp("university")
similarity = word1.similarity(word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity}")
print()

Similarity between 'research' and 'university': 0.7308306694030762



  similarity = word1.similarity(word2)


In [15]:
# 4. Regular Expression (Regex)
import re

# 4.1 Check if a Pattern Exists
pattern = r"Ames"
match = re.search(pattern, sentence)  # stops after finding the first match in the string
if match:
    print(f"Pattern '{pattern}' found at position: {match.start()}")
else:
    print(f"Pattern '{pattern}' not found")
print()

Pattern 'Ames' found at position: 34



In [16]:
# 4.2 Find all Case-Insensitive Matching
pattern = r"university"
matches = re.findall(pattern, sentence, re.IGNORECASE)
print(f"Case-insensitive matches for '{pattern}':", matches)
print()

Case-insensitive matches for 'university': ['University', 'university']



In [17]:
# 4.3 Split String Using a Pattern
pattern = r",|\."  # The pipe symbol "|" means "or" in regex; "\." matches the dot "."
parts = re.split(pattern, sentence)
print("Split Sentence:", parts)
print()

Split Sentence: ['Iowa State University', ' located in Ames', ' is a renowned public research university', '']



In [19]:
# 4.4 Validate Patterns (e.g., Email-Like Text)
test_string = "Contact me at qli@iastate.edu"

pattern = r"\b[A-Za-z0-9._-]+@[A-Za-z0-9._-]+\.[A-Za-z]{2,}\b"
# \b
# Matches a word boundary, ensuring the email address is a standalone word and not part of a larger string.
# This is used at both the start and end of the pattern.

# [A-Za-z0-9._-]+
# Matches the local part of the email address (before the @).
# Allows any combination of uppercase and lowercase letters (A-Za-z), digits (0-9), dots (.), underscores (_), and dashes (-).
# The "+" ensures there is at least one character.

# [A-Za-z]{2,}
# Matches the top-level domain (e.g., com, org, net).
# Accepts at least two characters ({2,}) and ensures they are only uppercase (A-Z) or lowercase (a-z) letters.

if re.search(pattern, test_string):
    print("Valid email found!")
else:
    print("No valid email found.")
print()

Valid email found!



In [20]:
# 5. BERT (Bidirectional Encoder Representations from Transformers)
from transformers import BertTokenizer, BertModel
import torch

# 5.1 Prepare Input for BERT

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

sentence = "Iowa State University, located in Ames, is a renowned public research university."

# Tokenize the input sentence
tokens = tokenizer.tokenize(sentence)
print("Tokens:", tokens)

# Convert tokens to input IDs
input_ids = tokenizer.encode(sentence, add_special_tokens=True)
print("Input IDs:", input_ids)
print("Decoded Sentence:", tokenizer.decode(input_ids))
# with special tokens [CLS] and [SEP] used for classification tasks and indicating sentence boundaries


# Convert to PyTorch tensors
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=True)

print("Input Tensor Keys:", inputs.keys())  # 'input_ids' and 'attention_mask'
print("Input IDs Tensor:", inputs["input_ids"])
print("Attention Mask Tensor:", inputs["attention_mask"])
# the mask indicates which tokens should be attended to (1) and which should be ignored (0)
# In this example, all tokens have a mask value of 1, meaning all tokens should be attended to
print()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Tokens: ['iowa', 'state', 'university', ',', 'located', 'in', 'ames', ',', 'is', 'a', 'renowned', 'public', 'research', 'university', '.']
Input IDs: [101, 5947, 2110, 2118, 1010, 2284, 1999, 19900, 1010, 2003, 1037, 8228, 2270, 2470, 2118, 1012, 102]
Decoded Sentence: [CLS] iowa state university, located in ames, is a renowned public research university. [SEP]
Input Tensor Keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Input IDs Tensor: tensor([[  101,  5947,  2110,  2118,  1010,  2284,  1999, 19900,  1010,  2003,
          1037,  8228,  2270,  2470,  2118,  1012,   102]])
Attention Mask Tensor: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])



In [21]:
# 5.2 Get BERT Output

# Pass the input through BERT
outputs = model(**inputs)

# Outputs contain 'last_hidden_state' and 'pooler_output'
last_hidden_state = outputs.last_hidden_state
pooled_output = outputs.pooler_output

print("Last Hidden State Shape:", last_hidden_state.shape)  # (batch_size, seq_len, hidden_size)
print("Pooled Output Shape:", pooled_output.shape)  # (batch_size, hidden_size)
print()

# The pooler_output can be used as a fixed-size embedding for the sentence:
sentence_embedding = pooled_output.squeeze(0)  # Remove batch dimension
print("Sentence Embedding (768-dim):", sentence_embedding)
print()

Last Hidden State Shape: torch.Size([1, 17, 768])
Pooled Output Shape: torch.Size([1, 768])

Sentence Embedding (768-dim): tensor([-0.9481, -0.6508, -0.9143,  0.8588,  0.8159, -0.3357,  0.8573,  0.5973,
        -0.7377, -1.0000, -0.7547,  0.9797,  0.9933,  0.2668,  0.8878, -0.8048,
        -0.6174, -0.6574,  0.5649,  0.1456,  0.7986,  1.0000, -0.3083,  0.6085,
         0.6283,  0.9938, -0.8767,  0.9488,  0.9641,  0.8719, -0.7002,  0.5704,
        -0.9975, -0.2268, -0.8265, -0.9946,  0.6888, -0.7981, -0.0198, -0.3524,
        -0.9090,  0.6835,  1.0000,  0.0334,  0.7192, -0.4340, -1.0000,  0.5348,
        -0.9142,  0.9602,  0.8069,  0.9344,  0.3704,  0.7103,  0.6457, -0.5981,
         0.2211,  0.4503, -0.5062, -0.7467, -0.6920,  0.5280, -0.9071, -0.8873,
         0.9438,  0.7676, -0.3514, -0.5435, -0.2921,  0.2980,  0.8565,  0.3189,
        -0.3033, -0.7898,  0.6055,  0.3297, -0.4019,  1.0000, -0.5569, -0.9937,
         0.5377,  0.5880,  0.3716, -0.4007,  0.3338, -1.0000,  0.7053, -0.404

In [22]:
# 5.3 Token-Level Embeddings

# Extract embeddings for each token
token_embeddings = last_hidden_state.squeeze(0)  # Remove batch dimension
print("Token Embeddings Shape:", token_embeddings.shape)  # (seq_len, hidden_size)

# Example: Embedding for the first token
print("First Token Embedding:", token_embeddings[0])
print()

Token Embeddings Shape: torch.Size([17, 768])
First Token Embedding: tensor([-5.9449e-01, -1.4600e-01, -4.9150e-01,  8.9375e-02, -3.3897e-01,
         2.3716e-02,  4.0410e-01,  9.6559e-01, -2.6545e-01,  1.6714e-01,
        -5.2128e-02, -4.4273e-01, -2.4112e-01,  9.3894e-01,  3.1113e-01,
         3.0133e-02, -4.3646e-01,  7.7356e-01,  3.3667e-01,  2.0074e-01,
        -1.9550e-01, -8.9275e-01,  4.4156e-01,  3.8249e-01,  4.1533e-01,
        -5.5003e-02, -4.6654e-02, -3.2453e-01, -6.3444e-02, -5.9779e-02,
        -2.3659e-01, -4.2598e-02, -1.0019e-01, -6.8473e-02,  5.5173e-01,
        -3.8214e-02, -2.4742e-01, -3.3074e-01,  3.0208e-01,  2.0757e-01,
        -2.7058e-01, -4.0728e-01,  5.2700e-01,  9.4850e-02, -1.8823e-01,
        -8.9372e-01, -2.6808e+00,  2.4461e-01, -3.0044e-01,  9.7514e-02,
        -3.3243e-01,  4.3123e-01,  2.1552e-01,  8.0473e-01,  4.0600e-01,
         5.0902e-01,  1.0342e-01,  5.5658e-01, -1.6712e-01,  1.5147e-01,
        -3.2781e-01,  3.5046e-01, -4.1866e-01,  3.5627e

In [23]:
# 5.4 Compute Sentence Similarity

# Encode two sentences and compute their similarity
sentence2 = "Ames is home to Iowa State University, a prominent research institution."
inputs2 = tokenizer(sentence2, return_tensors="pt", add_special_tokens=True)

# Get embeddings for both sentences
outputs1 = model(**inputs)
outputs2 = model(**inputs2)

embedding1 = outputs1.pooler_output
embedding2 = outputs2.pooler_output

# Compute cosine similarity
cosine_similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2)
print("Cosine Similarity between sentences:", cosine_similarity.item())
print()

Cosine Similarity between sentences: 0.9922517538070679

