https://towardsdatascience.com/how-to-chunk-text-data-a-comparative-analysis-3858c4a0997a

In [1]:
from PyPDF2 import PdfReader

# Extracting Text from PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        pdf = PdfReader(file)
        text = " ".join(page.extract_text() for page in pdf.pages)
    return text

# Extract text from the PDF and split it into sentences
text = extract_text_from_pdf('DW.pdf')

In [2]:
sample = text[1015:3037]
print(sample)

ILIZZO QUOTIDIANO.................................................................................. 21
11. CONSIGLI E SUGGERIMENTI UTILI...............................................................24
12. MANUTENZIONE E PULIZIA........................................................................... 26
13. RISOLUZIONE DEI PROBLEMI....................................................................... 30
14. INFORMAZIONI TECNICHE.............................................................................36
15. CONSIDERAZIONI SULL'AMBIENTE.............................................................. 37
My AEG Kitchen app www.aeg.com 2 PER RISULTATI PERFETTI
Grazie per aver scelto di acquistare questo prodotto AEG. Lo abbiamo creato per
fornirvi prestazioni impeccabili per molti anni, grazie a tecnologie innovative che
vi semplificheranno la vita - funzioni che non troverete sulle normali
apparecchiature. Vi invitiamo di dedicare qualche minuto alla lettura per sapere
come trarre il 

### NLTK Sentence Tokenizer ###

In [3]:
import nltk
nltk.download('punkt')

# Splitting Text into Sentences
def split_text_into_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

sentences = split_text_into_sentences(text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stefanopetrina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
print('The text has', len(text), 'characters of length.')
print('Sentences extracted:', len(sentences))
print('Mean of', len(text)/len(sentences), 'characters per sentence.')

The text has 70369 characters of length.
Sentences extracted: 817
Mean of 86.13096695226439 characters per sentence.


### Spacy Sentence Splitter ###

In [7]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
sentences = list(doc.sents)

In [8]:
print('Sentences extracted:', len(sentences))
print('Mean of', len(text)/len(sentences), 'characters per sentence.')

Sentences extracted: 933
Mean of 75.42229367631298 characters per sentence.


### Langchain Character Text Splitter ###

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Initialize the text splitter with custom parameters
custom_text_splitter = RecursiveCharacterTextSplitter(
    # Set custom chunk size
    chunk_size = 100,
    chunk_overlap  = 20,
    # Use length of the text as the size measure
    length_function = len

)

# Create the chunks
sentences = custom_text_splitter.create_documents([sample])

In [10]:
# Print the first two chunks
print(f'### Chunk 1: \n\n{sentences[25].page_content}\n\n=====\n')
print(f'### Chunk 2: \n\n{sentences[26].page_content}\n\n=====')

### Chunk 1: 

di serie.
Le informazioni sono riportate sulla targhetta identificativa.

=====

### Chunk 2: 

Avvertenza/Attenzione - Importanti Informazioni per la sicurezza

=====


In [11]:
print('Sentences extracted:', len(sentences))
print('Mean of', len(text)/len(sentences), 'characters per sentence.')

Sentences extracted: 30
Mean of 2345.633333333333 characters per sentence.


In [22]:
# Initialize the text splitter with custom parameters
custom_text_splitter = RecursiveCharacterTextSplitter(
    # Set custom chunk size
    chunk_size = 300,
    chunk_overlap  = 30,
    # Use length of the text as the size measure
    length_function = len,
    # Use only "\n\n" as the separator
    separators = ['\n']
)

# Create the chunks
custom_texts = custom_text_splitter.create_documents([sample])

In [23]:
print('Sentences extracted:', len(custom_texts))
print('Mean of', len(text)/len(custom_texts), 'characters per sentence.')

Sentences extracted: 8
Mean of 8796.125 characters per sentence.


In [32]:
# sentences1 = ["This is an example sentence.", "Another sentence goes here.", "..."]
# print(type(sentences1))

### KMeans Clustering ###

In [43]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Load the Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define a list of sentences (your text data)
sentences = ["This is an example sentence.", "Another sentence goes here.", "..."]

# Generate embeddings for the sentences
embeddings = model.encode(sentences)

# Choose an appropriate number of clusters (here we choose 5 as an example)
num_clusters = 3

# Perform K-means clustering
kmeans = KMeans(n_clusters=num_clusters)
clusters = kmeans.fit_predict(embeddings)

  super()._check_params_vs_input(X, default_n_init=10)


In [44]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')

# Define a list of stop words
stop_words = set(stopwords.words('italian'))

# Define a function to clean sentences
def clean_sentence(sentence):
    # Tokenize the sentence
    tokens = word_tokenize(sentence)
    # Convert to lower case
    tokens = [w.lower() for w in tokens]
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Remove non-alphabetic tokens
    words = [word for word in stripped if word.isalpha()]
    # Filter out stop words
    words = [w for w in words if not w in stop_words]
    return words

# Compute and print Word Clouds for each cluster
for i in range(num_clusters):
    cluster_sentences = [sentences[j] for j in range(len(sentences)) if clusters[j] == i]
    cleaned_sentences = [' '.join(clean_sentence(s)) for s in cluster_sentences]
    text = ' '.join(cleaned_sentences)

    wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Cluster {i}")
    plt.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stefanopetrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: We need at least 1 word to plot a word cloud, got 0.

### Clustering Adjacent Sentences ###

In [12]:
import numpy as np
import spacy

# Load the Spacy model
nlp = spacy.load('en_core_web_sm')

def process(text):
    doc = nlp(text)
    sents = list(doc.sents)
    vecs = np.stack([sent.vector / sent.vector_norm for sent in sents])

    return sents, vecs

def cluster_text(sents, vecs, threshold):
    clusters = [[0]]
    for i in range(1, len(sents)):
        if np.dot(vecs[i], vecs[i-1]) < threshold:
            clusters.append([])
        clusters[-1].append(i)
    
    return clusters

def clean_text(text):
    # Add your text cleaning process here
    return text

# Initialize the clusters lengths list and final texts list
clusters_lens = []
final_texts = []

# Process the chunk
threshold = 0.3
sents, vecs = process(text)

# Cluster the sentences
clusters = cluster_text(sents, vecs, threshold)

for cluster in clusters:
    cluster_txt = clean_text(' '.join([sents[i].text for i in cluster]))
    cluster_len = len(cluster_txt)
    
    # Check if the cluster is too short
    if cluster_len < 60:
        continue
    
    # Check if the cluster is too long
    elif cluster_len > 3000:
        threshold = 0.6
        sents_div, vecs_div = process(cluster_txt)
        reclusters = cluster_text(sents_div, vecs_div, threshold)
        
        for subcluster in reclusters:
            div_txt = clean_text(' '.join([sents_div[i].text for i in subcluster]))
            div_len = len(div_txt)
            
            if div_len < 60 or div_len > 3000:
                continue
            
            clusters_lens.append(div_len)
            final_texts.append(div_txt)
            
    else:
        clusters_lens.append(cluster_len)
        final_texts.append(cluster_txt)

In [15]:
print(len(final_texts))
print(len(final_texts[0]))
print(clusters_lens[0])


128
1006
1006


In [41]:
final_texts_lengths = [len(chunk) for chunk in final_texts]
print(final_texts_lengths)

[1006, 2672, 2451, 64, 1371, 874, 335, 529, 502, 231, 1672, 2077, 1420, 530, 1499, 2176, 278, 398, 2431, 233, 189, 75, 91, 62, 241, 612, 730, 2301, 2505, 1336, 182, 130, 799, 162, 913, 122, 866, 88, 93, 75, 567, 83, 264, 234, 154, 66, 79, 156, 809, 333, 369, 222, 65, 641, 450, 539, 1880, 324, 716, 396, 1175, 885, 478, 217, 195, 81, 128, 917, 365, 167, 785, 157, 161, 120, 207, 331, 456, 64, 373, 106, 113, 122, 504, 155, 281, 181, 155, 817, 156, 247, 221, 75, 645, 166, 740, 264, 240, 960, 162, 74, 941, 62, 224, 62, 805, 92, 735, 1190, 128, 608, 771, 692, 312, 536, 490, 71, 132, 265, 1067, 126, 82, 127, 351, 206, 617, 185, 1790, 544]
