# Text Segmentation

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from segmentation_tool import segmentation_tool
# Notebook-wide plot styling: apply seaborn's "whitegrid" style.
sns.set(style="whitegrid")
# Default matplotlib figure size as (width, height), in inches.
plt.rcParams['figure.figsize'] = (12, 8)

## Sample documents for segmentation

In [6]:
# Sample documents used by the segmentation demos. Each document is a plain
# dict with an "id" and a "content" string; multi-paragraph contents separate
# their paragraphs with exactly one blank line ("\n\n").

# One short paragraph made of three simple sentences.
short_text = dict(
    id="doc-1",
    content="This is a short text. It has only a few sentences. Each sentence is quite simple.",
)

# Three explicitly labelled paragraphs.
paragraphed_text = dict(
    id="doc-2",
    content="\n\n".join([
        "Paragraph 1: This is the first paragraph. It contains multiple sentences. The sentences are about the same topic.",
        "Paragraph 2: This is the second paragraph. It's separate from the first one. A blank line divides them.",
        "Paragraph 3: This is the third paragraph. Each paragraph can be considered a semantic unit. They should be segmented properly.",
    ]),
)

# Five longer paragraphs that all stay on a single topic (machine learning).
long_text = dict(
    id="doc-3",
    content="\n\n".join([
        "Machine learning is a field of inquiry devoted to understanding and building methods that 'learn', that improve performance based on experience. Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.",
        "A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers, but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning.",
        "Some implementations of machine learning use data and neural networks in a way that mimics the working of a biological brain. In its application across business problems, machine learning is also referred to as predictive analytics.",
        "The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. The synonym self-teaching computers was also used in this time period.",
        "By 1985, Terry Sejnowski and Charles Rosenberg had trained NETtalk, a neural network that learned to pronounce words in the same way as a baby, by being shown text along with correct phonetic transcriptions. Later work would use genetic algorithms or reinforcement learning, where the machine is rewarded for good responses and punished for bad ones.",
    ]),
)

# Document with clear thematic shifts: every paragraph covers an unrelated topic.
thematic_shifts = dict(
    id="doc-4",
    content="\n\n".join([
        "The solar system is the gravitationally bound system of the Sun and the objects that orbit it. The largest objects that orbit the Sun directly are the eight planets, with the remainder being smaller objects, such as the five dwarf planets and small Solar System bodies.",
        "Climate change is a long-term change in the average weather patterns that have come to define Earth's local, regional and global climates. Human activities, particularly fossil fuel burning, increase heat-trapping greenhouse gas levels in Earth's atmosphere, raising Earth's average surface temperature. This is global warming, which is one aspect of climate change.",
        "Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to the natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of 'intelligent agents': any system that perceives its environment and takes actions that maximize its chance of achieving its goals.",
        "The Renaissance was a period in European history marking the transition from the Middle Ages to modernity and covering the 15th and 16th centuries. It began in Italy and spread to the rest of Europe by the 16th century. The Renaissance was characterized by an emphasis on humanism, a revival of classical learning, and reforms across society.",
    ]),
)

# Order matters for the demos: shortest document first, thematic one last.
documents = [short_text, paragraphed_text, long_text, thematic_shifts]

## Sentence-based Segmentation

In [None]:
# Run the "sentence" strategy over every sample document and list the
# segments the tool returns under the "segments" key.
for doc in documents:
    # The tool receives a shallow copy rather than the dict stored in
    # `documents`.
    result = segmentation_tool(
        doc.copy(), strategy="sentence", content_key="content", output_key="segments"
    )
    segments = result["segments"]

    print(f"Document {doc['id']} - Sentence Segmentation:")
    print(f"Number of segments: {len(segments)}")
    for index, segment in enumerate(segments):
        print(f"  Segment {index}: {segment}")
    print()  # blank line between documents

## Paragraph-based Segmentation

In [None]:
# Run the "paragraph" strategy over every sample document and list the
# segments the tool returns under the "segments" key.
for doc in documents:
    # The tool receives a shallow copy rather than the dict stored in
    # `documents`.
    result = segmentation_tool(
        doc.copy(), strategy="paragraph", content_key="content", output_key="segments"
    )
    segments = result["segments"]

    print(f"Document {doc['id']} - Paragraph Segmentation:")
    print(f"Number of segments: {len(segments)}")
    for index, segment in enumerate(segments):
        print(f"  Segment {index}: {segment}")
    print()  # blank line between documents

## Fixed-Length Segmentation

In [None]:
# Fixed-length chunking demo on the long single-topic document.
# NOTE(review): the unit of chunk_size/overlap (characters, words, tokens) is
# defined by segmentation_tool and is not visible here - confirm before tuning.
chunk_size = 50
overlap = 10

# `doc` is already a shallow copy of long_text; the tool is given a further
# copy of it, so neither `doc` nor `long_text` is passed in directly.
doc = long_text.copy()
result = segmentation_tool(
    doc.copy(),
    strategy="fixed_length",
    content_key="content",
    output_key="segments",
    chunk_size=chunk_size,
    overlap=overlap,
)
segments = result["segments"]

print(f"Fixed-length segmentation with chunk_size={chunk_size}, overlap={overlap}:")
print(f"Number of segments: {len(segments)}")
for index, segment in enumerate(segments):
    print(f"  Segment {index}: {segment}")
print()

## Semantic Segmentation

In [None]:

# Semantic segmentation of the document whose paragraphs each cover a
# different topic, followed by plain paragraph segmentation of the same
# document so the two outputs can be compared side by side.
result = segmentation_tool(
    thematic_shifts.copy(),
    strategy="semantic",
    content_key="content",
    output_key="segments",
    threshold=0.5,  # controls sensitivity to topic changes
)

# Plain string: the header has no placeholders, so no f-string is needed.
print("Semantic Segmentation Results:")
print(f"Number of segments: {len(result['segments'])}")

for i, segment in enumerate(result['segments']):
    print(f"  Segment {i}: {segment}")

# Compare with paragraph segmentation
para_result = segmentation_tool(
    thematic_shifts.copy(),
    strategy="paragraph",
    content_key="content",
    output_key="segments",
)

# The leading newline separates this listing from the semantic one above.
print("\nParagraph Segmentation for comparison:")
print(f"Number of segments: {len(para_result['segments'])}")

for i, segment in enumerate(para_result['segments']):
    print(f"  Segment {i}: {segment}")