In [None]:
# 1. Basic string operations
# `sentence` is the running example that the following cells keep working on.
sentence = "Iowa State University, located in Ames, is a renowned public research university."
print("[example sentence]: " + sentence, end="\n\n")

# 1.1 Case conversion
# upper()/lower() return new strings; `sentence` itself is left untouched.
uppercase, lowercase = sentence.upper(), sentence.lower()
print("Uppercase:", uppercase)
print("Lowercase:", lowercase, end="\n\n")

In [None]:
# 1.2 Split into words & join words
# split() without arguments cuts on runs of whitespace only, so punctuation
# stays glued to the word next to it.
words = sentence.split()
print("Words in the sentence:", words, end="\n\n")

# join() is the reverse operation: the string it is called on becomes the
# separator placed between consecutive items.
separator = " "
joined_sentence = separator.join(words)
print("Joined Sentence:", joined_sentence, end="\n\n")

In [None]:
# 1.3 Find substrings & replace substrings
# str.find() reports the index of the first occurrence of the substring,
# or -1 when the substring does not occur at all.
target = "Ames"
index = sentence.find(target)
print(f"'Ames' found at index: {index}", end="\n\n")

# str.replace() returns a new string with every occurrence of the target
# substituted; `sentence` itself is not modified.
modified_sentence = sentence.replace(target, "Ames, Iowa")
print("Modified Sentence:", modified_sentence, end="\n\n")

In [None]:
# 1.4 Access characters by index
# Index 0 is the first character; negative indices count from the end,
# so -1 is the last character.
first_char, last_char = sentence[0], sentence[-1]
print(f"First Character: {first_char}")
print(f"Last Character: {last_char}", end="\n\n")

# Slicing is end-exclusive: [:21] keeps the characters at indices 0..20,
# i.e. "Iowa State University" (the blank spaces count as characters too).
substring = sentence[:21]
print("Substring (0:21):", substring, end="\n\n")

In [None]:
# 2. NLTK. "The Natural Language Toolkit"
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the 'punkt_tab' tokenizer resource before any tokenizer is called.
nltk.download('punkt_tab')

sentence = "Iowa State University, located in Ames, is a renowned public research university."

# 2.1 Tokenization
# Word level: split the example sentence into word tokens.
tokens = word_tokenize(sentence)
print("Word Tokens:", tokens, end="\n\n")

# Sentence level: split a short paragraph into its sentences.
# The paragraph is assembled from adjacent string literals, each ending in a
# single space so the sentences stay separated.
long_introduction = (
    "Iowa State University (ISU), in Ames, Iowa, is a top public research institution founded in 1858. "
    "Renowned for science, engineering, and agriculture, it hosts the U.S. Department of Energy’s Ames Laboratory. "
    "With over 36,000 students, ISU fosters innovation and global impact."
)
sentences = sent_tokenize(long_introduction)
print("Sentence Tokens:", sentences, end="\n\n")

In [None]:
# 2.2 Stop Words Removal & Frequency Distribution
from nltk.corpus import stopwords
from nltk.probability import FreqDist

nltk.download('stopwords')

# A set gives fast membership tests while filtering.
stop_words = set(stopwords.words('english'))
print("NLTK stopwords:", stop_words, end="\n\n")

# Keep every token whose lowercase form is not a stop word; the original
# casing of the surviving tokens is preserved for this first printout.
# NOTE(review): punctuation tokens (e.g. ",") are presumably not in the
# stop-word list, so they survive the filter and are counted below - confirm
# that this is intended.
filtered_words = []
for token in word_tokenize(sentence):
    if token.lower() in stop_words:
        continue
    filtered_words.append(token)
print("Filtered Words (No Stop Words):", filtered_words, end="\n\n")

# Lowercase before counting so that differently-cased spellings of the same
# word fall into one bucket.
filtered_words = list(map(str.lower, filtered_words))
freq_dist = FreqDist(filtered_words)
print("Frequency Distribution:")
print(freq_dist.most_common(5), end="\n\n")

In [None]:
# 3. Regular Expression (Regex)
import re

# 3.1 Check if a Pattern Exists
pattern = r"Ames"
# re.search() scans the string and stops at the first match; it returns None
# when the pattern does not occur anywhere.
match = re.search(pattern, sentence)
if match is None:
    print(f"Pattern '{pattern}' not found")
else:
    print(f"Pattern '{pattern}' found at position: {match.start()}")
print()

In [None]:
# 3.2 Find all Case-Insensitive Matching
pattern = r"university"
# re.I is the short alias of re.IGNORECASE; findall() returns each match
# with the casing it has in the text.
matches = re.findall(pattern, sentence, flags=re.I)
print(f"Case-insensitive matches for '{pattern}':", matches, end="\n\n")

In [None]:
# 3.3 Split String Using a Pattern
# "|" means "or" in regex and "\." is a literal dot, so the string is cut at
# every comma and every period.
pattern = r",|\."
splitter = re.compile(pattern)
# The delimiters are dropped; whitespace around them is kept in the pieces,
# and a delimiter at the very end of the string yields a trailing "" piece.
parts = splitter.split(sentence)
print("Split Sentence:", parts, end="\n\n")

In [None]:
# 3.4 Validate Patterns (e.g., Email-Like Text)
test_string = "Contact me at qli@iastate.edu"

# How the pattern reads, left to right:
#   \b                 word boundary, so the address starts as a standalone
#                      word rather than in the middle of a longer token
#   [A-Za-z0-9._-]+    local part (before the @): one or more letters,
#                      digits, dots, underscores or dashes
#   @                  the literal at-sign
#   [A-Za-z0-9._-]+    domain name, built from the same character set
#   \.                 the literal dot in front of the top-level domain
#   [A-Za-z]{2,}       top-level domain (e.g. com, org, net): letters only,
#                      at least two of them
#   \b                 closing word boundary
pattern = r"\b[A-Za-z0-9._-]+@[A-Za-z0-9._-]+\.[A-Za-z]{2,}\b"

# search() looks for the pattern anywhere inside the text, so this reports
# whether the text contains an email-like token.
email_found = re.search(pattern, test_string) is not None
print("Valid email found!" if email_found else "No valid email found.")
print()

In [None]:
# 4. scikit-learn. "Machine Learning in Python"
# 4.1 classification
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Panel titles; the order must match `classifiers` because the two lists are
# zipped together when plotting.
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "Naive Bayes",
]

# One estimator per title. Wherever a seed is passed it is fixed to 42 so the
# figure is reproducible from run to run.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    GaussianNB(),
]

# Third toy dataset: a two-feature classification problem with seeded uniform
# noise added in place to the feature matrix.
X, y = make_classification(
    n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1
)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

# Each entry is an (X, y) pair.
datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1),
    linearly_separable,
]


def _frame_panel(ax, x_limits, y_limits):
    """Clamp a panel to the given data window and remove its tick marks."""
    ax.set_xlim(*x_limits)
    ax.set_ylim(*y_limits)
    ax.set_xticks(())
    ax.set_yticks(())


figure = plt.figure(figsize=(27, 9))

# Grid layout: one row per dataset; one column per classifier plus a leading
# column that shows the input data.
n_rows = len(datasets)
n_cols = len(classifiers) + 1

# Colormaps shared by all panels: `cm` shades the decision surface,
# `cm_bright` colours the individual points of the two classes.
cm = plt.cm.RdBu
cm_bright = ListedColormap(["#FF0000", "#0000FF"])

i = 1  # 1-based subplot index, advanced after every panel
for ds_cnt, ds in enumerate(datasets):
    X, y = ds
    is_first_row = ds_cnt == 0  # titles are only drawn on the top row

    # Hold out 40% of the samples for scoring.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=42
    )

    # Plot window: the data range padded by 0.5 on every side.
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    # Leading panel: the training points only (the test points are drawn in
    # the classifier panels instead).
    ax = plt.subplot(n_rows, n_cols, i)
    if is_first_row:
        ax.set_title("Input data")
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
    _frame_panel(ax, (x_min, x_max), (y_min, y_max))
    i += 1

    # Remaining panels of the row: one per classifier.
    for name, estimator in zip(names, classifiers):
        ax = plt.subplot(n_rows, n_cols, i)

        # Standardize the features, then fit and score on the held-out split.
        clf = make_pipeline(StandardScaler(), estimator)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        DecisionBoundaryDisplay.from_estimator(
            clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )

        # Overlay the held-out test points on the decision surface.
        ax.scatter(
            X_test[:, 0],
            X_test[:, 1],
            c=y_test,
            cmap=cm_bright,
            edgecolors="k",
            alpha=0.6,
        )

        _frame_panel(ax, (x_min, x_max), (y_min, y_max))
        if is_first_row:
            ax.set_title(name)

        # Test score in the lower-right corner, leading zero stripped
        # (e.g. ".93").
        ax.text(
            x_max - 0.3,
            y_min + 0.3,
            ("%.2f" % score).lstrip("0"),
            size=15,
            horizontalalignment="right",
        )
        i += 1

plt.tight_layout()
plt.show()

In [None]:
# 4.2 clustering
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import time
import warnings
from itertools import cycle, islice

import matplotlib.pyplot as plt
import numpy as np

from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler


n_samples = 1500
seed = 170  # one seed for every generator so the toy datasets are reproducible

noisy_circles = datasets.make_circles(
    n_samples=n_samples, factor=0.5, noise=0.05, random_state=seed
)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05, random_state=seed)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=seed)
rng = np.random.RandomState(seed)
no_structure = (rng.rand(n_samples, 2), None)  # uniform noise, no labels

# Anisotropically distributed data: blobs pushed through a linear map
X, y = datasets.make_blobs(n_samples=n_samples, random_state=seed)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = X.dot(transformation)
aniso = (X_aniso, y)

# Blobs with varied variances
varied = datasets.make_blobs(
    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=seed
)

# Figure canvas for the grid of panels
plt.figure(figsize=(9 * 1.3 + 2, 14.5))
plt.subplots_adjust(
    left=0.02, right=0.98, bottom=0.001, top=0.96, wspace=0.05, hspace=0.01
)

plot_num = 1  # 1-based subplot index, advanced after every panel

# Defaults, overridden per dataset. Only "n_clusters" is read further down;
# "n_neighbors" is carried along but not used in this cell.
default_base = {"n_neighbors": 10, "n_clusters": 3}

# NOTE: from this line on, `datasets` names this list and no longer the
# sklearn `datasets` module used above.
datasets = [
    (noisy_circles, {"n_clusters": 2}),
    (noisy_moons, {"n_clusters": 2}),
    (varied, {"n_neighbors": 2}),
    (aniso, {"n_neighbors": 2}),
    (blobs, {}),
    (no_structure, {}),
]

# (linkage criterion, panel title) in left-to-right column order
linkage_titles = (
    ("single", "Single Linkage"),
    ("average", "Average Linkage"),
    ("complete", "Complete Linkage"),
    ("ward", "Ward Linkage"),
)

# Colour cycle for the cluster labels
palette = [
    "#377eb8",
    "#ff7f00",
    "#4daf4a",
    "#f781bf",
    "#a65628",
    "#984ea3",
    "#999999",
    "#e41a1c",
    "#dede00",
]

for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # Dataset-specific values win over the defaults
    params = {**default_base, **algo_params}

    X, y = dataset

    # Normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # One agglomerative model per linkage criterion, all with the same
    # number of clusters
    clustering_algorithms = tuple(
        (
            title,
            cluster.AgglomerativeClustering(
                n_clusters=params["n_clusters"], linkage=linkage
            ),
        )
        for linkage, title in linkage_titles
    )

    for name, algorithm in clustering_algorithms:
        t0 = time.time()

        # Silence the connectivity-matrix UserWarning matched by the message
        # pattern below, should it be raised while fitting
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the "
                + "connectivity matrix is [0-9]{1,2}"
                + " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning,
            )
            algorithm.fit(X)

        t1 = time.time()

        # Use the fitted labels when the estimator exposes them; otherwise
        # fall back to predict()
        y_pred = (
            algorithm.labels_.astype(int)
            if hasattr(algorithm, "labels_")
            else algorithm.predict(X)
        )

        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)  # column titles on the top row only

        # One colour per label 0..max(y_pred), cycling through the palette
        n_colors = int(max(y_pred) + 1)
        colors = np.array(list(islice(cycle(palette), n_colors)))
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())

        # Fit time in the lower-right corner, leading zero stripped
        plt.text(
            0.99,
            0.01,
            ("%.2fs" % (t1 - t0)).lstrip("0"),
            transform=plt.gca().transAxes,
            size=15,
            horizontalalignment="right",
        )
        plot_num += 1

plt.show()

In [None]:
# 5. BERT (Bidirectional Encoder Representations from Transformers)
from transformers import BertTokenizer, BertModel
import torch

# 5.1 Prepare Input for BERT

# Tokenizer and model are loaded from the same pre-trained checkpoint so the
# vocabulary matches the weights.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

sentence = "Iowa State University, located in Ames, is a renowned public research university."

# Step 1: split the sentence into tokens
tokens = tokenizer.tokenize(sentence)
print("Tokens:", tokens)

# Step 2: map the sentence to vocabulary IDs. With add_special_tokens=True the
# [CLS] and [SEP] markers (used for classification tasks and for marking
# sentence boundaries) are included, which the decoded text makes visible.
input_ids = tokenizer.encode(sentence, add_special_tokens=True)
print("Input IDs:", input_ids)
decoded_sentence = tokenizer.decode(input_ids)
print("Decoded Sentence:", decoded_sentence)


# Step 3: build the PyTorch tensors the model consumes
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=True)

# The keys include 'input_ids' and 'attention_mask'
# (BERT tokenizers typically also return 'token_type_ids' - check the printout).
print("Input Tensor Keys:", inputs.keys())

# The attention mask marks which tokens should be attended to (1) and which
# should be ignored (0). In this example all tokens have a mask value of 1,
# meaning all tokens should be attended to.
for label, key in (
    ("Input IDs Tensor:", "input_ids"),
    ("Attention Mask Tensor:", "attention_mask"),
):
    print(label, inputs[key])
print()

In [None]:
# 5.2 Get BERT Output

# Pass the input through BERT.
# This is inference only, so the forward pass runs under torch.no_grad():
# autograd then records no graph, which saves memory and keeps the printed
# tensors free of grad_fn annotations.
# NOTE(review): from_pretrained() is documented to return the model in
# evaluation mode, so no explicit model.eval() is added here - confirm for the
# installed transformers version.
with torch.no_grad():
    outputs = model(**inputs)

# Outputs contain 'last_hidden_state' and 'pooler_output'
last_hidden_state = outputs.last_hidden_state
pooled_output = outputs.pooler_output

print("Last Hidden State Shape:", last_hidden_state.shape)  # (batch_size, seq_len, hidden_size)
print("Pooled Output Shape:", pooled_output.shape)  # (batch_size, hidden_size)
print()

# The pooler_output can be used as a fixed-size embedding for the sentence:
sentence_embedding = pooled_output.squeeze(0)  # Remove batch dimension
print("Sentence Embedding (768-dim):", sentence_embedding)
print()

In [None]:
# 5.3 Token-Level Embeddings

# Drop the leading batch dimension so that each row is the embedding of one
# token: (batch_size, seq_len, hidden_size) -> (seq_len, hidden_size)
token_embeddings = torch.squeeze(last_hidden_state, dim=0)
print("Token Embeddings Shape:", token_embeddings.shape)

# Example: the embedding of the first token in the sequence
first_token_embedding = token_embeddings[0]
print("First Token Embedding:", first_token_embedding)
print()

In [None]:
# 5.4 Compute Sentence Similarity

# Encode a second sentence so it can be compared with the first one
sentence2 = "Ames is home to Iowa State University, a prominent research institution."
inputs2 = tokenizer(sentence2, return_tensors="pt", add_special_tokens=True)

# Get embeddings for both sentences.
# Inference only: torch.no_grad() keeps autograd from recording a graph for
# the two forward passes.
with torch.no_grad():
    outputs1 = model(**inputs)
    outputs2 = model(**inputs2)

# NOTE(review): the Hugging Face documentation has cautioned that
# pooler_output is often not a good summary of a sentence's meaning; averaging
# last_hidden_state over the tokens is a common alternative - consider it if
# these similarity scores look uninformative.
embedding1 = outputs1.pooler_output
embedding2 = outputs2.pooler_output

# Compute cosine similarity along the hidden dimension (dim=1 is the default,
# spelled out here); each embedding is (batch_size, hidden_size) with a batch
# of one, so the result holds a single value.
cosine_similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=1)
print("Cosine Similarity between sentences:", cosine_similarity.item())
print()