In [28]:
import math
from collections import Counter

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Function to preprocess the text
def preprocess_text(text):
    """Turn raw text into one list of lemmatized tokens per sentence.

    The text is split into sentences; each sentence is lower-cased,
    word-tokenized, stripped of English stop words, and lemmatized.

    Args:
        text: The raw text to process.

    Returns:
        A list with one entry per detected sentence, each entry being the
        list of remaining lemmatized tokens in their original order.
    """
    raw_sentences = sent_tokenize(text)
    english_stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    processed = []
    for raw_sentence in raw_sentences:
        tokens = word_tokenize(raw_sentence.lower())
        # Filter first, then lemmatize, so stop words are matched on their surface form.
        kept = [token for token in tokens if token not in english_stops]
        processed.append([wnl.lemmatize(token) for token in kept])
    return processed

def get_cos_similar(v1: list, v2: list):
    """Cosine similarity of two vectors, mapped linearly from [-1, 1] onto [0, 1].

    Args:
        v1: First vector.
        v2: Second vector, same length as ``v1``.

    Returns:
        ``0.5 + 0.5 * cos(v1, v2)``, or ``0`` when the product of the two
        norms is zero (i.e. at least one vector has zero length).
    """
    dot_product = float(np.dot(v1, v2))
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0
    return 0.5 + 0.5 * (dot_product / norm_product)

# Function to calculate sentence similarity using cosine similarity
def sentence_similarity(sentence1, sentence2, sim_model='sbert', model=None):
    r"""Score how similar two sentences are.

    Two modes are supported:

    * ``sim_model == 'sbert'`` with a truthy ``model``: both sentences are
      passed to ``model.encode`` and the two resulting embeddings are compared
      with ``get_cos_similar``.
    * Anything else: the sentences are treated as sequences of hashable tokens
      and compared by the cosine of their term-count vectors,

      $$ \text{similarity}(s_1, s_2) = \frac{\sum_{w \in s_1 \cap s_2} c_1(w) \times c_2(w)}{\sqrt{\sum_{w \in s_1} c_1(w)^2} \times \sqrt{\sum_{w \in s_2} c_2(w)^2}} $$

      where $s_i$ is the set of distinct tokens of sentence $i$ and $c_i(w)$
      is the number of times $w$ occurs in it.

    Args:
        sentence1: First sentence (whatever ``model.encode`` accepts in SBERT
            mode; an iterable of hashable tokens otherwise).
        sentence2: Second sentence, same kind as ``sentence1``.
        sim_model: ``'sbert'`` to use the embedding model, any other value for
            the term-count cosine.
        model: Object exposing ``encode(list) -> sequence of vectors``; when
            falsy the term-count cosine is used regardless of ``sim_model``.

    Returns:
        The similarity score; ``0`` when either sentence has no tokens in
        term-count mode.
    """
    if sim_model == 'sbert' and model:
        sentence_embeddings = model.encode([sentence1, sentence2])
        return float(get_cos_similar(sentence_embeddings[0], sentence_embeddings[1]))

    counts1 = Counter(sentence1)
    counts2 = Counter(sentence2)

    # Only tokens present in both sentences contribute to the dot product.
    shared = counts1.keys() & counts2.keys()
    numerator = sum(counts1[word] * counts2[word] for word in shared)

    # Each distinct token contributes its squared count exactly once; summing
    # over the raw token list would count a repeated token once per occurrence
    # and inflate the norm.
    norm1 = math.sqrt(sum(count * count for count in counts1.values()))
    norm2 = math.sqrt(sum(count * count for count in counts2.values()))
    denominator = norm1 * norm2

    if denominator == 0:
        return 0
    return numerator / denominator

# Function to calculate sentence scores using TextRank algorithm
def textrank(sentences, d=0.85, max_iter=100, sim_model='sbert', model=None):
    """Score sentences by iterating a damped similarity-graph ranking.

    A graph is built with one node per sentence and an undirected edge,
    weighted by ``sentence_similarity``, between every pair whose similarity
    is positive. Scores start at 1.0 and are updated ``max_iter`` times, then
    divided by the largest score.

    Args:
        sentences: Sequence of sentence strings to rank.
        d: Damping factor applied to score received from neighbours; every
            node additionally receives ``1 - d`` per iteration.
        max_iter: Number of update iterations (no convergence test is done).
        sim_model: Forwarded to ``sentence_similarity``. With ``'sbert'`` and
            a truthy ``model`` the raw sentences are compared; otherwise each
            sentence is first tokenized with ``preprocess_text``.
        model: Forwarded to ``sentence_similarity``.

    Returns:
        Dict mapping each sentence string to its score. Empty input yields an
        empty dict.

    Notes:
        * The result is keyed by sentence text, so repeated sentences share a
          single entry (the last occurrence wins).
        * Each node passes ``d * weight * score`` along every edge without
          dividing by the node's total edge weight.
    """
    if len(sentences) == 0:
        return {}

    if sim_model == 'sbert' and model:
        words = list(sentences)
    else:
        # Preprocess each sentence on its own and flatten the result so that
        # words[i] always corresponds to sentences[i], even if the sentence
        # tokenizer would split or merge the joined text differently.
        words = [
            [token for tokens in preprocess_text(sentence) for token in tokens]
            for sentence in sentences
        ]

    node_count = len(words)

    # Every node gets an adjacency entry up front so that sentences without
    # any similar neighbour can still be visited during the update loop.
    graph = {node: {} for node in range(node_count)}
    for i in range(node_count):
        for j in range(i + 1, node_count):
            similarity = sentence_similarity(words[i], words[j], sim_model=sim_model, model=model)
            if similarity > 0:
                graph[i][j] = similarity
                graph[j][i] = similarity

    # Iterate the ranking update a fixed number of times.
    scores = [1.0] * node_count
    for _ in range(max_iter):
        new_scores = [0.0] * node_count
        for j in range(node_count):
            for k, weight in graph[j].items():
                new_scores[k] += d * weight * scores[j]
            new_scores[j] += 1 - d
        scores = new_scores

    # Scale so the best sentence scores 1.0.
    max_score = max(scores)
    if max_score > 0:
        scores = [score / max_score for score in scores]

    return {sentence: score for sentence, score in zip(sentences, scores)}

In [35]:
def split_sentences(text):
    """Drop a trailing unfinished sentence from ``text``.

    The text is split with ``nltk.sent_tokenize``. When it contains two or
    more sentences and the last one does not end in sentence-final
    punctuation (``.``, ``!`` or ``?``, optionally followed by closing quotes
    or brackets), that last fragment is removed and the remaining sentences
    are joined with single spaces. Otherwise the text is kept whole.

    Args:
        text: The text to trim.

    Returns:
        * ``None`` if ``text`` is blank or yields no sentences.
        * The single sentence if only one is found (returned even when it is
          unfinished).
        * ``text`` unchanged if the last sentence is complete.
        * The sentences before the last one, space-joined, otherwise.
    """
    if text.strip() == "":
        return None

    sentences = nltk.sent_tokenize(text)
    if len(sentences) == 0:
        return None
    if len(sentences) == 1:
        return sentences[0]

    # Ignore closing quotes/brackets when looking for the terminating mark,
    # so that e.g. 'He said "stop."' counts as finished.
    last_sentence = sentences[-1].strip().rstrip("\"')]}\u201d\u2019")
    if last_sentence.endswith(('.', '!', '?')):
        return text
    return " ".join(sentences[:-1])

In [30]:
# Example usage: rank ten sample sentences using the embedding-based similarity.
import random
import numpy as np
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-embedding model by name; it is passed to textrank below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# The first five sentences paraphrase the same scene (a cat lying on a floor
# covering); the remaining five describe unrelated events.
sentences = [
"The cat is on the mat.",
"The feline is resting on the carpet.",
"The kitty is lounging on the rug.",
"The puss is reclining on the doormat.",
"The tomcat is lying on the floor covering.",
"The man went to the store to buy some bread.",
"The woman drove to the beach to go for a swim.",
"The dog barked at the mailman when he delivered the mail.",
"The bird sang a sweet melody in the morning.",
"The tree lost its leaves in the autumn breeze."
]
# Shuffling is disabled, so the sentences are scored in the order listed above.
#random.shuffle(sentences)
scores = textrank(sentences, sim_model='sbert', model=model)

# Print every sentence followed by its score.
for sentence, score in scores.items():
    print(f"{sentence}: {score}")

The cat is on the mat.: 1.0
The feline is resting on the carpet.: 0.9927521924046663
The kitty is lounging on the rug.: 0.9864328292499707
The puss is reclining on the doormat.: 0.9249253655469024
The tomcat is lying on the floor covering.: 0.9285733841261431
The man went to the store to buy some bread.: 0.8565111567116346
The woman drove to the beach to go for a swim.: 0.8106649435816999
The dog barked at the mailman when he delivered the mail.: 0.930270909468893
The bird sang a sweet melody in the morning.: 0.8869145616308369
The tree lost its leaves in the autumn breeze.: 0.8791913421101928


In [3]:
# Tokenize the sample sentences; relies on `sentences` from the example-usage cell.
words = preprocess_text(' '.join(sentences))

In [36]:
# Demo: the input ends mid-sentence ("... it is"), so the trailing fragment is dropped.
split_sentences('I agree. I agree. I think the real disadvantages of the mobile phone are that, although it can also distract the mind and help others, it is')

'I agree. I agree.'

In [17]:
# NOTE(review): `a` is not defined in any cell shown here; this cell depends on
# kernel state created elsewhere and will fail after Restart & Run All.
a[0]

0.118813254

This is a sentence.


This is the second sentence. This is the third sentence. This is the first sentence.


'The ス shaft golden # # movie stan.'