In [None]:
import json
import spacy
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from google.colab import files

# Upload dataset through the Colab file picker.
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the picker: fail with an actionable message
    # instead of an opaque IndexError when picking the first file name.
    raise RuntimeError(
        "No file was uploaded; re-run this cell and select the dataset JSON file."
    )

# The first uploaded file name is used as the dataset path.
data_path = next(iter(uploaded))




Saving dailydialog_train.json to dailydialog_train (2).json


In [None]:
# Parse the uploaded JSON file into an in-memory object.
with open(data_path, mode="r", encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)



In [None]:
# Load a spaCy English pipeline: prefer the transformer model, fall back to the
# small model, and download the small model only when neither is installed.
try:
    nlp = spacy.load("en_core_web_trf")  # Try transformer model first
except OSError:
    try:
        # Reuse an already-installed small model instead of re-downloading it.
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # spacy.cli.download installs into the running interpreter's environment,
        # unlike a `!python` shell call, which may resolve to another Python.
        spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 101.5 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Add a rule-based sentencizer ahead of the parser unless one is already present.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer", before="parser")

# Treat these extra characters as sentence-final punctuation.
sentencizer = nlp.get_pipe("sentencizer")
sentencizer.punct_chars.update([":", "—", "…", "\\", "/"])

# Disable components whose annotations are not used downstream.
# NOTE: "attribute_ruler" is deliberately NOT disabled. In the English trained
# pipelines it maps token.tag to token.pos, and the clause segmentation below
# relies on token.pos_ ("SCONJ", "VERB", "AUX"). With it disabled, pos_ is left
# unset and those checks never match.
disabled_pipes = ["ner", "lemmatizer"]
for pipe in disabled_pipes:
    if pipe in nlp.pipe_names:
        nlp.disable_pipe(pipe)

print("Pipeline setup updated successfully!")


Pipeline setup updated successfully!


In [None]:
def _merge_short_fragments(clauses, min_words=3):
    """Append every clause shorter than ``min_words`` words to the clause before it."""
    merged = []
    for clause in clauses:
        # The first clause has nothing to merge into, so it is always kept as-is.
        if merged and len(clause.split()) < min_words:
            merged[-1] += " " + clause
        else:
            merged.append(clause)
    return merged


def enhanced_clause_segmentation(utterance, pipeline=None):
    """Split an utterance into clause-like text fragments.

    Within each sentence a new clause starts at:
      * a token whose dependency label is "cc" or "mark", or whose POS is
        "SCONJ"; the conjunction opens the next clause, except "and", "or"
        and "but", which are dropped from the output;
      * a comma (not sentence-initial) that is immediately followed by a
        VERB or AUX token; the comma itself is dropped.

    Fragments shorter than three whitespace-separated words are then appended
    to the preceding fragment, including across sentence boundaries.

    Tokens are joined with single spaces, so spacing in the output follows the
    tokenization rather than the original text.

    Args:
        utterance: Text to segment.
        pipeline: Optional callable that turns ``utterance`` into a document
            exposing ``.sents``, where each sentence is an indexable sequence
            of tokens with ``text``, ``dep_`` and ``pos_``. Defaults to the
            notebook-level ``nlp`` pipeline.

    Returns:
        A list of clause strings; empty when the document has no sentences.
    """
    if pipeline is None:
        pipeline = nlp

    doc = pipeline(utterance)
    clauses = []

    for sent in doc.sents:
        current_clause = []
        last_index = len(sent) - 1

        for i, token in enumerate(sent):
            # Split at coordinating/subordinating conjunctions.
            if token.dep_ in ("cc", "mark") or token.pos_ == "SCONJ":
                if current_clause:
                    clauses.append(" ".join(current_clause).strip())
                    current_clause = []

                # Subordinators stay as the head of the next clause;
                # plain coordinators are discarded.
                if token.text.lower() not in {"and", "or", "but"}:
                    current_clause.append(token.text)
                continue

            # Split at a comma that is directly followed by a verb/auxiliary.
            if (
                token.text == ","
                and 0 < i < last_index
                and sent[i + 1].pos_ in ("VERB", "AUX")
            ):
                if current_clause:
                    clauses.append(" ".join(current_clause).strip())
                    current_clause = []
                continue

            current_clause.append(token.text)

        # Flush whatever remains at the end of the sentence.
        if current_clause:
            clauses.append(" ".join(current_clause).strip())

    # Post-process to merge short fragments into the preceding clause.
    return _merge_short_fragments(clauses, min_words=3)

In [None]:
# Process conversations: segment every utterance of every conversation.
PROGRESS_EVERY = 100  # print one progress line per this many conversations

results = {}
total_conversations = len(dataset)
for conv_index, (conv_id, conv_data) in enumerate(dataset.items(), start=1):
    results[conv_id] = []
    # NOTE(review): conv_data[0] is iterated as the list of turns, each with
    # "turn" and "utterance" keys — confirm against the dataset schema.
    for turn in conv_data[0]:
        utterance = turn["utterance"]
        clauses = enhanced_clause_segmentation(utterance)
        results[conv_id].append({
            "turn": turn["turn"],
            "original": utterance,
            "clauses": clauses
        })

    # Periodic progress instead of one output line per conversation id,
    # which floods the cell output on large datasets.
    if conv_index % PROGRESS_EVERY == 0:
        print(f"Processed {conv_index}/{total_conversations} conversations")

print(f"Processed {len(results)} conversations in total")


tr_4466
tr_7536
tr_754
tr_4110
tr_3432
tr_943
tr_3894
tr_1561
tr_4533
tr_5297
tr_19
tr_4248
tr_2317
tr_1452
tr_3694
tr_1711
tr_264
tr_1146
tr_2469
tr_692
tr_3807
tr_5174
tr_3821
tr_4915
tr_654
tr_513
tr_3580
tr_4974
tr_5766
tr_4181
tr_2813
tr_1566
tr_1975
tr_398
tr_4127
tr_58
tr_1902
tr_6618
tr_13
tr_2232
tr_4756
tr_4502
tr_2908
tr_6500
tr_2122
tr_1464
tr_2652
tr_2657
tr_914
tr_2135
tr_534
tr_2179
tr_2816
tr_4464
tr_402
tr_924
tr_1684
tr_1086
tr_6324
tr_154
tr_4747
tr_3734
tr_5307
tr_802
tr_3620
tr_786
tr_1470
tr_3449
tr_3386
tr_4434
tr_6265
tr_1474
tr_2767
tr_4103
tr_3192
tr_1614
tr_2511
tr_4080
tr_3025
tr_1531
tr_3477
tr_774
tr_2217
tr_5190
tr_109
tr_1026
tr_2105
tr_3357
tr_1569
tr_4182
tr_3399
tr_1522
tr_3588
tr_4339
tr_7105
tr_2535
tr_5706
tr_3342
tr_779
tr_3753
tr_3417
tr_4201
tr_3896
tr_2968
tr_1360
tr_3962
tr_5811
tr_8
tr_3185
tr_72
tr_2851
tr_1447
tr_3700
tr_4680
tr_4307
tr_2822
tr_66
tr_3511
tr_850
tr_4155
tr_2484
tr_7614
tr_3354
tr_7317
tr_945
tr_4679
tr_2684
tr_1607
tr_2991


In [None]:
# Serialize the segmentation results to disk, then offer the file for download.
output_path = "/content/clauses_enhanced.json"
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(results, out_file, ensure_ascii=False, indent=2)

files.download(output_path)
print("Enhanced clause segmentation completed!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Enhanced clause segmentation completed!
