In [1]:
import numpy as np
import networkx as nx

In [2]:
def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

In [3]:
def build_similarity_matrix(sentences, threshold=0.1):
    """Build the similarity matrix of sentences."""
    n = len(sentences)
    similarity_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            similarity = cosine_similarity(sentences[i], sentences[j])
            if similarity > threshold:
                similarity_matrix[i][j] = similarity
    return similarity_matrix

In [4]:
def lexrank(sentences, threshold=0.1, damping_factor=0.85, max_iter=100):
    """Calculate LexRank scores for sentences."""
    # Build similarity matrix
    similarity_matrix = build_similarity_matrix(sentences, threshold=threshold)

    # Normalize rows of similarity matrix
    row_sums = similarity_matrix.sum(axis=1, keepdims=True)
    similarity_matrix = np.divide(similarity_matrix, row_sums)

    # Initialize scores
    n = len(sentences)
    scores = np.ones(n) / n

    # Iterate until convergence or max iterations reached
    for i in range(max_iter):
        new_scores = np.zeros(n)
        for j in range(n):
            # Calculate score for sentence j
            for k in range(n):
                if similarity_matrix[k][j] > 0:
                    new_scores[j] += similarity_matrix[k][j] * scores[k]
            # Apply damping factor
            new_scores[j] = (1 - damping_factor) + damping_factor * new_scores[j]
        # Check for convergence
        if np.allclose(new_scores, scores):
            break
        scores = new_scores

    # Return sentence scores
    return scores

In [5]:
sentences = [
    np.array([0.1, 0.2, 0.3]),
    np.array([0.2, 0.3, 0.4]),
    np.array([0.3, 0.4, 0.5]),
    np.array([0.4, 0.5, 0.6])
]
scores = lexrank(sentences)
print(scores)

[0.99372239 1.00418618 1.00261895 0.9992307 ]


In [6]:
import gensim.downloader as api

In [7]:
model = api.load('word2vec-google-news-300')



In [8]:
text = """
Astronomers have used the James Webb Space Telescope to peer back in time to the early days of the universe — and they spotted something unexpected.
The space observatory revealed six massive galaxies that existed between 500 million and 700 million years after the big bang that created the universe. The discovery is completely upending existing theories about the origins of galaxies, according to a new study published Wednesday in the journal Nature.
“These objects are way more massive​ than anyone expected,” said study coauthor Joel Leja, assistant professor of astronomy and astrophysics at Penn State University, in a statement. “We expected only to find tiny, young, baby galaxies at this point in time, but we’ve discovered galaxies as mature as our own in what was previously understood to be the dawn of the universe.”
The telescope observes the universe in infrared light, which is invisible to the human eye, and is capable of detecting the faint light from ancient stars and galaxies. By peering into the distant universe, the observatory can essentially see back in time up to about 13.5 billion years ago. (Scientists have determined the universe is about 13.7 billion years old.)
"""

In [9]:
# Split text into sentences
sentences = text.split(".")

# Generate embeddings for each sentence
sentence_embeddings = []
for sentence in sentences:
    words = sentence.split()
    embeddings = [model[word] for word in words if word in model.vocab]
    if len(embeddings) > 0:
        sentence_embeddings.append(np.mean(embeddings, axis=0))

In [12]:
sentence_embeddings

[array([ 0.04854911,  0.06217811,  0.04470898,  0.00428409, -0.05721029,
        -0.0589338 ,  0.00829933, -0.07845198,  0.10017468,  0.04996745,
         0.01584734, -0.05689058, -0.00793566, -0.00384812, -0.09521484,
         0.02199228,  0.02779279,  0.08077858,  0.10377938, -0.03385598,
        -0.0547776 ,  0.0312064 ,  0.01397414, -0.06247929,  0.02141462,
        -0.0384754 , -0.03550139,  0.03773281,  0.05035764, -0.0303926 ,
        -0.06173125, -0.03448196, -0.06996954, -0.08807518,  0.00815546,
        -0.06808181, -0.0149478 , -0.08262707,  0.04833004,  0.06607419,
         0.06942894,  0.01046753,  0.05590275,  0.01919992,  0.03637059,
        -0.01935759, -0.04982213,  0.01182338,  0.06706019,  0.03096226,
        -0.02910796,  0.02063279, -0.02948579, -0.00578962, -0.07834298,
         0.0223385 ,  0.07271903, -0.06565057,  0.0241721 , -0.11053067,
        -0.06929089,  0.05693527, -0.07825869, -0.10689581, -0.08275205,
        -0.02676246, -0.05849202,  0.11204311, -0.0

In [10]:
# Calculate LexRank scores for sentences
scores = lexrank(sentence_embeddings)

# Sort sentences by score and get top n sentences as summary
n = 2
top_sentences = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:n]
summary = [sentences[i].strip() for i in top_sentences]

In [11]:
# Print summary
print("Summary:")
print("\n".join(summary))

Summary:
The space observatory revealed six massive galaxies that existed between 500 million and 700 million years after the big bang that created the universe
“We expected only to find tiny, young, baby galaxies at this point in time, but we’ve discovered galaxies as mature as our own in what was previously understood to be the dawn of the universe
