In [53]:
import pandas as pd

df = pd.read_csv('ProjectGutenberg-ShortStories-Dataset/stories.csv')

# Inspect dataframe structure
print(df.head())
print(df.columns)

        bookno                                            content
0    51082.txt  *** START OF THIS PROJECT GUTENBERG EBOOK COMI...
1    32243.txt  *** START OF THIS PROJECT GUTENBERG EBOOK CONF...
2    306-0.txt  *** START OF THIS PROJECT GUTENBERG EBOOK EARL...
3    31038.txt  *** START OF THIS PROJECT GUTENBERG EBOOK THE ...
4  28636-8.txt  *** START OF THIS PROJECT GUTENBERG EBOOK THE ...
Index(['bookno', 'content'], dtype='object')


In [54]:
df.iloc[0]['content']

'*** START OF THIS PROJECT GUTENBERG EBOOK COMING ATTRACTION ***\n\n\n\n\n\n\n\n\n\nProduced by Greg Weeks, Mary Meehan and the Online\n\nDistributed Proofreading Team at http://www.pgdp.net\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                           Coming Attraction\n\n\n\n                            BY FRITZ LEIBER\n\n\n\n                       Illustrated by Paul Calle\n\n\n\n           [Transcriber\'s Note: This etext was produced from\n\n                 Galaxy Science Fiction November 1950.\n\n         Extensive research did not uncover any evidence that\n\n         the U.S. copyright on this publication was renewed.]\n\n\n\n\n\n\n\n\n\n           Women will always go on trying to attract men ...\n\n             even when the future seems to have no future!\n\n\n\n\n\nThe coupe with the fishhooks welded to the fender shouldered up over\n\nthe curb like the nose of a nightmare. The girl in its path stood\n\nfrozen, her face probably stiff with fright under her mask. For onc

In [None]:
import re

def clean_gutenberg_text_improved(text):
    # Remove Gutenberg header explicitly
    header_pattern = r"\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK.*?\*\*\*"
    header_match = re.search(header_pattern, text, re.IGNORECASE | re.DOTALL)

    # Remove Gutenberg footer explicitly
    footer_pattern = r"End of (the|this) Project Gutenberg.*"
    footer_match = re.search(footer_pattern, text, re.IGNORECASE | re.DOTALL)

    if header_match:
        text = text[header_match.end():]

    if footer_match:
        text = text[:footer_match.start()]

    # Remove transcriber notes, production notes, and URLs explicitly
    text = re.sub(r'Produced by.*?(?=\s[A-Z])', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'\[Transcriber.*?\]', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'http\S+', '', text)

    # Remove standalone illustrator or author lines (optional, if you don't want them)
    text = re.sub(r'(Illustrated by.*?\.)', '', text, flags=re.IGNORECASE)

    # Normalize escaped newlines and whitespace
    #text = text.replace('\\n', ' ')
    #text = re.sub(r'\s+', ' ', text).strip()

    return text

import pandas as pd

# df = pd.read_csv('stories.csv')

# Apply the improved cleaning
df['clean_content'] = df['content'].apply(clean_gutenberg_text_improved)

# Quick check on the first story
print(df['clean_content'][0][:1500])











 Greg Weeks, Mary Meehan and the Online

Distributed Proofreading Team at 



















                           Coming Attraction



                            BY FRITZ LEIBER



                       Illustrated by Paul Calle



           









           Women will always go on trying to attract men ...

             even when the future seems to have no future!





The coupe with the fishhooks welded to the fender shouldered up over

the curb like the nose of a nightmare. The girl in its path stood

frozen, her face probably stiff with fright under her mask. For once my

reflexes weren't shy. I took a fast step toward her, grabbed her elbow,

yanked her back. Her black skirt swirled out.



The big coupe shot by, its turbine humming. I glimpsed three faces.

Something ripped. I felt the hot exhaust on my ankles as the big

coupe swerved back into the street. A thick cloud like a black flower

blossomed from its jouncing rear end, while from the fishhooks f

In [56]:
import pandas as pd

# Load original CSV file
# df = pd.read_csv('stories.csv')

# Extract the top 5 rows
top5_df = df.head(5)

# Save these top 5 rows to a new CSV file
top5_df.to_csv('top5_stories.csv', index=False)

print("Top 5 rows successfully saved to top5_stories.csv")

Top 5 rows successfully saved to top5_stories.csv


In [None]:
import pandas as pd
import re

# Robust function to extract story content
def extract_story(text):
    # Step 1: Remove Gutenberg header/footer explicitly
    text = re.sub(r'\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK.*?\*\*\*', '', text, flags=re.I|re.DOTALL)
    text = re.sub(r'End of (the|this) Project Gutenberg.*', '', text, flags=re.I|re.DOTALL)

    # Step 2: Remove production/transcriber notes and URLs
    text = re.sub(r'Produced by.*?(?=[A-Z])', '', text, flags=re.I|re.DOTALL)
    text = re.sub(r'\[.*?Transcriber.*?\]', '', text, flags=re.I|re.DOTALL)
    text = re.sub(r'http\S+', '', text)

    # Step 3: Normalize whitespace
    text = re.sub(r'\n{4,}', '\n\n\n', text)
    #text = text.replace('\\n', ' ')
    #text = re.sub(r'\s+', ' ', text).strip()

    # Step 4: Identify the story's beginning
    sentences = re.split(r'(?<=[.!?]) +', text)
    
    narrative_start = 0
    for i, sentence in enumerate(sentences):
        words = sentence.split()
        # Heuristic: start from first sentence with >=8 words, ends with '.', '!', '?'
        if len(words) >= 8 and sentence[-1] in '.!?':
            narrative_start = i
            break
    
    story_text = ' '.join(sentences[narrative_start:]).strip()

    return story_text

# Load your CSV
df = pd.read_csv('top5_stories.csv')

# Apply extraction to 'content' column
df['story_only'] = df['content'].apply(extract_story)

# Optional: Save to new CSV
df[['story_only']].to_csv('top5_stories_extracted.csv', index=False)

# Quick verification
print(df['story_only'].iloc[0][:1000])

Greg Weeks, Mary Meehan and the Online

Distributed Proofreading Team at 


                           Coming Attraction


                            BY FRITZ LEIBER


                       Illustrated by Paul Calle


           


           Women will always go on trying to attract men ...

             even when the future seems to have no future!


The coupe with the fishhooks welded to the fender shouldered up over

the curb like the nose of a nightmare. The girl in its path stood

frozen, her face probably stiff with fright under her mask. For once my

reflexes weren't shy. I took a fast step toward her, grabbed her elbow,

yanked her back. Her black skirt swirled out.


The big coupe shot by, its turbine humming. I glimpsed three faces.

Something ripped. I felt the hot exhaust on my ankles as the big

coupe swerved back into the street. A thick cloud like a black flower

blossomed from its jouncing rear end, while from the fishhooks flew a

black shimmering rag.


"Did they g

In [58]:
story_one = df['story_only'].iloc[0][:2000]
print(story_one)

Greg Weeks, Mary Meehan and the Online

Distributed Proofreading Team at 


                           Coming Attraction


                            BY FRITZ LEIBER


                       Illustrated by Paul Calle


           


           Women will always go on trying to attract men ...

             even when the future seems to have no future!


The coupe with the fishhooks welded to the fender shouldered up over

the curb like the nose of a nightmare. The girl in its path stood

frozen, her face probably stiff with fright under her mask. For once my

reflexes weren't shy. I took a fast step toward her, grabbed her elbow,

yanked her back. Her black skirt swirled out.


The big coupe shot by, its turbine humming. I glimpsed three faces.

Something ripped. I felt the hot exhaust on my ankles as the big

coupe swerved back into the street. A thick cloud like a black flower

blossomed from its jouncing rear end, while from the fishhooks flew a

black shimmering rag.


"Did they g

In [61]:
import nltk
nltk.download('punkt_tab')

# Function to segment text into sentences and paragraphs
def segment_text(text):
    # Segment paragraphs (assuming paragraphs are separated by double newlines)
    paragraphs = text.split('\n\n\n')
    
    # Segment sentences for each paragraph
    segmented_paragraphs = []
    for paragraph in paragraphs:
        sentences = nltk.sent_tokenize(paragraph)
        segmented_paragraphs.append(sentences)
    
    return segmented_paragraphs

# Sample text
text = """
This is the first paragraph. It has multiple sentences. 
Here is another sentence.

This is the second paragraph. It also has more sentences.
The final sentence of the second paragraph.
"""

# Segment the text into sentences and paragraphs
segmented_text = segment_text(story_one)

# Display the result
for paragraph in segmented_text:
    print("Paragraph:")
    for sentence in paragraph:
        print(f"  {sentence}")


Paragraph:
  Greg Weeks, Mary Meehan and the Online

Distributed Proofreading Team at
Paragraph:
                             Coming Attraction
Paragraph:
                              BY FRITZ LEIBER
Paragraph:
                         Illustrated by Paul Calle
Paragraph:
Paragraph:
             Women will always go on trying to attract men ...

             even when the future seems to have no future!
Paragraph:
  The coupe with the fishhooks welded to the fender shouldered up over

the curb like the nose of a nightmare.
  The girl in its path stood

frozen, her face probably stiff with fright under her mask.
  For once my

reflexes weren't shy.
  I took a fast step toward her, grabbed her elbow,

yanked her back.
  Her black skirt swirled out.
Paragraph:
  The big coupe shot by, its turbine humming.
  I glimpsed three faces.
  Something ripped.
  I felt the hot exhaust on my ankles as the big

coupe swerved back into the street.
  A thick cloud like a black flower

blossomed from i

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\wccha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [52]:
from sentence_transformers import SentenceTransformer
import nltk

# Download the NLTK tokenizer model
nltk.download('punkt')

# Function to segment text into sentences and paragraphs
def segment_text(text):
    # Segment paragraphs (assuming paragraphs are separated by double newlines)
    paragraphs = text.split('\n\n')
    
    return paragraphs

# Function to embed paragraphs using Sentence-BERT
def embed_paragraphs(paragraphs, model_name='all-MiniLM-L6-v2'):
    # Load the pre-trained Sentence-BERT model
    model = SentenceTransformer(model_name)
    
    # Embed each paragraph (or small group of paragraphs)
    paragraph_embeddings = []
    for i in range(0, len(paragraphs), 3):  # Group paragraphs in chunks of 1-3
        grouped_paragraphs = ' '.join(paragraphs[i:i+3])  # Combine up to 3 paragraphs
        embedding = model.encode(grouped_paragraphs)  # Embed the combined paragraph(s)
        paragraph_embeddings.append(embedding)
    
    return paragraph_embeddings

# Sample text
text = """
This is the first paragraph. It has multiple sentences. 
Here is another sentence.

This is the second paragraph. It also has more sentences.
The final sentence of the second paragraph.

Here comes the third paragraph. It talks about something else entirely.
And this is an additional sentence in the third paragraph.

This is the fourth paragraph. It continues with new content.
"""

# Segment the text into paragraphs
paragraphs = segment_text(text)

# Embed the paragraphs using Sentence-BERT
embeddings = embed_paragraphs(paragraphs)

# Display the resulting embeddings (optional, here we print the shape of embeddings)
for idx, embedding in enumerate(embeddings):
    print(f"Embedding for paragraph group {idx+1}:")
    print(embedding[:10])  # Display the first 10 elements of the embedding for brevity


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wccha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Embedding for paragraph group 1:
[-0.00302981  0.02635819  0.02379966 -0.00545861  0.07930513  0.07003536
  0.02205101 -0.05280449  0.10622332 -0.01892141]
Embedding for paragraph group 2:
[ 0.00204728  0.01365002  0.07008743 -0.02713877  0.05963783  0.07407629
 -0.01847285 -0.13995185  0.06961206  0.02326934]
