<a href="https://colab.research.google.com/github/WSLINMSAI/MSAI-531-B01/blob/main/Copy_of_Project_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ✅ STEP 1: Install dependencies
!pip install nltk ipywidgets jieba langdetect regex --quiet

In [2]:
# ✅ STEP 2: Import libraries and download NLTK resources
import nltk
# Download the 'brown', 'punkt' and 'words' NLTK packages, in that order.
# The trailing expression keeps the status of the last download as the
# cell's displayed value.
_nltk_packages = ('brown', 'punkt', 'words')
_download_status = [nltk.download(_package) for _package in _nltk_packages]
_download_status[-1]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
# ✅ STEP 3: Import supporting modules
import jieba
from langdetect import detect
import re
import math
from nltk.corpus import brown, words as nltk_words
from nltk import FreqDist, bigrams
from collections import defaultdict
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
import base64

In [4]:
# ✅ STEP 4: Load corpora and create word lists
# Brown corpus tokens that are purely alphabetic, lower-cased, in corpus order.
brown_words = list(map(str.lower, filter(str.isalpha, brown.words())))
# Lower-cased entries of the NLTK word list, kept as a set for membership tests.
english_vocab = set(map(str.lower, nltk_words.words()))

In [5]:
# ✅ STEP 5: Build unigram and bigram frequency models
# --- Unigram model: log of each token's relative frequency in brown_words ---
unigram_freq = FreqDist(brown_words)
total_unigrams = sum(unigram_freq.values())
unigram_probs = dict(
    (token, math.log(count / total_unigrams))
    for token, count in unigram_freq.items()
)

# --- Bigram model: log(count(first, second) / count(first)) ---
bigram_freq = FreqDist(bigrams(brown_words))
bigram_probs = dict(
    (pair, math.log(pair_count / unigram_freq[pair[0]]))
    for pair, pair_count in bigram_freq.items()
    if unigram_freq[pair[0]] > 0  # guard against dividing by a zero count
)

In [6]:
# ✅ STEP 6: Probability scoring functions
def unigram_log_prob(word):
    """Return the log-probability of ``word`` under the unigram model.

    Lookups are case-insensitive: the word is lower-cased before it is
    searched in ``unigram_probs`` and ``english_vocab``, so capitalised input
    such as "Healthcare" is scored like "healthcare".

    Args:
        word: Candidate word (any letter case).

    Returns:
        float: The stored log-probability if the word is in ``unigram_probs``;
        ``log(1e-6)`` if it is only in ``english_vocab``; otherwise a penalty
        of ``-10.0`` per character, so longer unknown strings cost more.
    """
    key = word.lower()
    if key in unigram_probs:
        return unigram_probs[key]
    if key in english_vocab:
        return math.log(1e-6)
    # Float literal on purpose: callers tell scores apart from string labels
    # by their numeric type, and an int penalty would not look like a score.
    return -10.0 * len(word)


def bigram_log_prob(prev, word):
    """Return the log-probability of ``word`` following ``prev``.

    The (prev, word) pair is looked up case-insensitively in ``bigram_probs``.
    When the pair is absent the function backs off to
    ``unigram_log_prob(word)``.

    Args:
        prev: The preceding word, or a sentinel such as '<s>'.
        word: The candidate word.

    Returns:
        float: Bigram log-probability, or the unigram back-off value.
    """
    pair = (prev.lower(), word.lower())
    if pair in bigram_probs:
        return bigram_probs[pair]
    return unigram_log_prob(word)

In [7]:
# ✅ STEP 7: English bigram segmentation using Viterbi
def segment_bigram(text, max_word_len=25):
    """Split an unspaced string into words by minimising total bigram cost.

    Dynamic programme over end positions: for every position ``i`` the
    cheapest way to segment ``text[:i]`` is found by trying every final word
    ``text[j:i]`` of at most ``max_word_len`` characters. The cost of a word
    is ``-bigram_log_prob(prev, word)``, where ``prev`` is the last word of
    the best segmentation of ``text[:j]`` ('<s>' at the start of the string).
    Only that single best previous word is remembered per position.

    Args:
        text: The string to segment.
        max_word_len: Longest candidate word, in characters. Defaults to 25.

    Returns:
        list[tuple[str, number]]: ``(word, cost)`` pairs in reading order,
        where ``cost`` is the negated log-probability used for that word.
        An empty ``text`` gives an empty list.

    Raises:
        ValueError: If ``max_word_len`` is smaller than 1.
    """
    if max_word_len < 1:
        raise ValueError("max_word_len must be at least 1")

    n = len(text)
    best_score = [float('inf')] * (n + 1)   # cheapest total cost of text[:i]
    best_score[0] = 0
    backtrack = [0] * (n + 1)               # start index of the last word of text[:i]
    words_at = [''] * (n + 1)               # last word of the best split of text[:i]
    word_cost = [0] * (n + 1)               # cost charged for that last word

    for i in range(1, n + 1):
        for j in range(max(0, i - max_word_len), i):
            word = text[j:i]
            prev = words_at[j] if j > 0 else '<s>'
            cost = -bigram_log_prob(prev, word)
            score = best_score[j] + cost
            if score < best_score[i]:
                best_score[i] = score
                backtrack[i] = j
                words_at[i] = word
                # Remember the cost so it need not be recomputed on the way back.
                word_cost[i] = cost

    # Walk the back-pointers from the end of the string to the start.
    segments = []
    i = n
    while i > 0:
        j = backtrack[i]
        segments.append((text[j:i], word_cost[i]))
        i = j

    segments.reverse()
    return segments

In [8]:
# ✅ STEP 8: Mixed-language processor using regex

def process_mixed_language(text):
    """Tokenise mixed Chinese/English text and segment each run.

    The input is first cut into maximal runs of CJK characters
    (U+4E00-U+9FFF) or ASCII letters; every other character is dropped at
    this stage. CJK runs are handed to ``jieba.cut`` and each piece is
    labelled ``'zh'``. Alphabetic runs are handed to ``segment_bigram`` and
    keep the score it returns.

    Args:
        text: Raw input string.

    Returns:
        list[tuple]: ``(word, 'zh')`` for Chinese pieces, ``(word, score)``
        for English pieces, in input order.
    """
    labelled = []

    for chunk in re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z]+', text):
        if re.match(r'^[\u4e00-\u9fff]+$', chunk):
            for piece in jieba.cut(chunk):
                labelled.append((piece, 'zh'))
            continue

        if not chunk.isalpha():
            # Fallback label; the pattern above only yields CJK or letter
            # runs, so this is not expected to trigger.
            labelled.append((chunk, 'unknown'))
            continue

        for piece, cost in segment_bigram(chunk):
            labelled.append((piece, cost))

    return labelled

In [9]:
# ✅ STEP 9: Define confidence-based color coding

def get_color(score):
    """Map a segmentation score to a display colour.

    Lower scores mean higher confidence: below 5 is 'green', below 10 is
    'orange', and anything else is 'red'.

    Args:
        score: Numeric cost of a word.

    Returns:
        str: One of 'green', 'orange' or 'red'.
    """
    # Bands are checked in ascending order; the first upper bound that the
    # score falls under decides the colour.
    for upper_bound, colour in ((5, 'green'), (10, 'orange')):
        if score < upper_bound:
            return colour
    return 'red'

In [10]:
# ✅ STEP 10: Interactive widget to display segmentation
# Pane that the submit handler draws its results into.
output_area = widgets.Output()

# Full-width text box, pre-filled with a mixed English/Chinese sample sentence.
text_input = widgets.Text(
    layout=widgets.Layout(width='100%'),
    description='Input:',
    placeholder='Enter English, Chinese, or mixed text',
    value='machinelearning正在重新定义教育healthcare和transportation领域的解决方案和效率',
)

def on_submit(change):
    """Segment the current contents of ``text_input`` and render the result.

    Everything is drawn inside ``output_area``, which is cleared first. The
    output is a table with one row per segmented word, a one-line summary of
    the English words, and a note on the models used. Blank input produces an
    error message instead.

    Args:
        change: Argument supplied by the widget callback; unused, because the
            text is read from ``text_input.value`` directly.
    """
    def _is_score(value):
        # A score is any real number. Integer scores are possible (an integer
        # penalty such as -10 * len(word) negates to an int), so testing for
        # float alone would misfile those words as non-English. bool is an
        # int subclass and is never a score.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    with output_area:
        clear_output()
        user_input = text_input.value.strip()
        if not user_input:
            print("❌ Please enter non-empty text.")
            return

        result = process_mixed_language(user_input)
        scores = [s for _, s in result if _is_score(s)]
        avg_score = sum(scores) / len(scores) if scores else 0
        # Reuse get_color so "high confidence" has a single definition.
        high_conf = sum(1 for s in scores if get_color(s) == 'green')

        display(HTML("<h3>✅ Mixed-language Segmentation</h3>"))

        html = """
        <style>
        table { border-collapse: collapse; margin-top: 10px; }
        th, td { border: 1px solid #ccc; padding: 6px 12px; text-align: left; }
        th { background-color: #f2f2f2; }
        .green { color: green; font-weight: bold; }
        .orange { color: orange; font-weight: bold; }
        .red { color: red; font-weight: bold; }
        .blue { color: blue; font-weight: bold; }
        </style>
        <table>
        <tr><th>Word</th><th>Score</th><th>Confidence</th><th>Language</th></tr>
        """

        confidence_labels = {
            'green': "High confidence",
            'orange': "Moderate confidence",
        }

        # Collect rows in a list and join once instead of growing a string.
        rows = []
        for word, info in result:
            if isinstance(info, str) and info == 'zh':
                rows.append(f"<tr><td class='blue'>{word}</td><td>–</td><td>–</td><td>Chinese</td></tr>")
            elif _is_score(info):
                color = get_color(info)
                explanation = confidence_labels.get(color, "Low confidence")
                rows.append(f"<tr><td class='{color}'>{word}</td><td>{info:.3f}</td><td>{explanation}</td><td>English</td></tr>")
            else:
                rows.append(f"<tr><td>{word}</td><td>–</td><td>Unknown</td><td>Unknown</td></tr>")

        html += "".join(rows) + "</table>"
        display(HTML(html))

        summary = f"<p><b>Summary:</b> {len(scores)} English words, {high_conf} high-confidence. Avg. score = {avg_score:.3f}</p>"
        display(HTML(summary))
        print("\nNote: English words use a bigram model trained on the Brown corpus. Chinese words use jieba.")


In [11]:
# ✅ STEP 11: Launch the interface
# Register on_submit as the text box's submit handler, then show the box
# followed by the pane its results are drawn into.
text_input.on_submit(on_submit)
for _widget in (text_input, output_area):
    display(_widget)

Text(value='machinelearning正在重新定义教育healthcare和transportation领域的解决方案和效率', description='Input:', layout=Layout(w…

Output()

In [13]:
# ✅ STEP: Export GitHub-friendly copy (keep outputs, remove widget state everywhere)
from google.colab import files
import nbformat as nbf
from nbformat.validator import validate, ValidationError
import io
import sys

# Tell the user how to obtain the file, then open Colab's upload prompt.
for _instruction in (
    "1) In Colab: File > Download > Download .ipynb",
    "2) Upload that file when prompted below",
):
    print(_instruction)

uploaded = files.upload()
if not uploaded:
    raise SystemExit("No file uploaded.")

# Only the first uploaded file is used; it is parsed as a version-4 notebook.
in_name = next(iter(uploaded))
_notebook_bytes = io.BytesIO(uploaded[in_name])
nb = nbf.read(_notebook_bytes, as_version=4)

def strip_widgets_everywhere(nbnode):
    """Remove widget state and Colab-specific metadata from a notebook, in place.

    Three things are removed:

    * the notebook-level ``metadata["widgets"]`` entry;
    * per-cell ``metadata`` keys ``widgets``, ``outputId``, ``executionInfo``,
      ``colab`` and ``id``;
    * any cell output whose MIME bundle contains a widget view. Both
      ``display_data`` and ``execute_result`` outputs are checked, because a
      widget shown as a cell's last expression is stored as ``execute_result``.

    All other outputs and metadata are kept. A cell without a ``metadata``
    entry gets an empty one.

    Args:
        nbnode: Notebook object exposing ``.metadata`` (mapping) and
            ``.cells`` (iterable of cell mappings).

    Returns:
        None. ``nbnode`` is modified in place.
    """
    widget_view_mime = "application/vnd.jupyter.widget-view+json"
    # Output types that carry a ``data`` MIME bundle.
    bundle_output_types = ("display_data", "execute_result")
    # Per-cell metadata keys to drop: widget state plus Colab bookkeeping.
    unwanted_cell_keys = ("widgets", "outputId", "executionInfo", "colab", "id")

    nbnode.metadata.pop("widgets", None)

    for cell in nbnode.cells:
        md = cell.get("metadata", {})
        for key in unwanted_cell_keys:
            md.pop(key, None)
        cell["metadata"] = md

        if "outputs" in cell:
            cell["outputs"] = [
                out for out in cell["outputs"]
                if not (
                    out.get("output_type") in bundle_output_types
                    and widget_view_mime in out.get("data", {})
                )
            ]

strip_widgets_everywhere(nb)

# Set kernelspec and language_info explicitly (intended to help GitHub render
# the notebook); the version is that of the interpreter running this cell.
nb.metadata["kernelspec"] = dict(
    display_name="Python 3",
    language="python",
    name="python3",
)
nb.metadata["language_info"] = dict(
    name="python",
    version=sys.version.split()[0],
)

# Validation is advisory: a failure is reported, but the file is still written.
try:
    validate(nb)
except ValidationError as validation_error:
    print("Validation warning:", validation_error)
else:
    print("Validation OK")

out_name = "github_ready.ipynb"
with open(out_name, "w", encoding="utf-8") as out_file:
    nbf.write(nb, out_file)

print("✅ Cleaned notebook saved as", out_name)
# Hand the cleaned file back to the browser as a download.
files.download(out_name)


1) In Colab: File > Download > Download .ipynb
2) Upload that file when prompted below


Saving Copy_of_Project_4.ipynb to Copy_of_Project_4 (3).ipynb
Validation OK
✅ Cleaned notebook saved as github_ready.ipynb


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>