# CS-651-A TEXT BASED ANALYSIS
# Sai Vandana - 0939231
# Assignment - 6 5/4/2024

#### Import necessary libraries

In [2]:
from nltk.corpus import stopwords #you can remove stop words for speed
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

#### The code reads text from a file, splits it into sentences, cleans each sentence by removing non-alphabetical characters, splits them into words, and prints both the original and cleaned versions of each sentence.

In [3]:
import re  # Import regular expressions to handle non-alphabetical removal

# Read the whole English text. The context manager closes the file handle even if
# reading raises, which the bare open() call did not guarantee.
with open("EnglishText.txt", "r", encoding='utf-8') as file:
    filedata = file.read()

# Split into sentences on a period followed by whitespace, or on a period that ends the text.
article = re.split(r'\.\s+|\.$', filedata.strip())

sentences = []
for sentence in article:
    # Skip empty fragments, such as the piece left over after the final period.
    if sentence.strip():
        print("Original:", sentence)
        # Replace everything that is not an ASCII letter with a space, then tokenise on whitespace.
        cleaned_sentence = re.sub("[^a-zA-Z]", " ", sentence).split()
        sentences.append(cleaned_sentence)
        print("Processed:", cleaned_sentence)
    

Original: In the heart of an ancient forest, a mysterious and untouched land hides secrets of the ages
Processed: ['In', 'the', 'heart', 'of', 'an', 'ancient', 'forest', 'a', 'mysterious', 'and', 'untouched', 'land', 'hides', 'secrets', 'of', 'the', 'ages']
Original: This forest, known only to the old wise owl and the creatures that inhabit its dark recesses, holds the key to a magical world where time stands still and nature speaks
Processed: ['This', 'forest', 'known', 'only', 'to', 'the', 'old', 'wise', 'owl', 'and', 'the', 'creatures', 'that', 'inhabit', 'its', 'dark', 'recesses', 'holds', 'the', 'key', 'to', 'a', 'magical', 'world', 'where', 'time', 'stands', 'still', 'and', 'nature', 'speaks']


#### The code reads text from a file, splits it into Spanish sentences using appropriate punctuation, cleans each sentence by removing non-alphabetical characters (preserving Spanish accents and ñ), splits them into words, and prints both the original and cleaned versions of each sentence.

In [4]:
import re  # Import regular expressions to handle non-alphabetical removal

# Read the whole Spanish text. The context manager closes the file handle even if
# reading raises, which the bare open() call did not guarantee.
with open("SpanishText.txt", "r", encoding='utf-8') as file:
    filedata = file.read()

# Split on whitespace that follows sentence punctuation; the lookbehind keeps the
# punctuation mark attached to the sentence it terminates.
article = re.split(r'(?<=[.!?¿¡])\s+', filedata.strip())

sentences = []
for sentence in article:
    # Skip fragments that contain nothing but whitespace.
    if sentence.strip():
        print("Original:", sentence)
        # Keep ASCII letters plus the Spanish accented vowels, ñ and ü. Without ü/Ü a word
        # such as "pingüino" would be cut into "ping" and "ino".
        cleaned_sentence = re.sub("[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ]", " ", sentence).split()
        sentences.append(cleaned_sentence)
        print("Processed:", cleaned_sentence)
  

Original: En el corazón de un bosque antiguo, una tierra misteriosa e intacta esconde secretos de los tiempos.
Processed: ['En', 'el', 'corazón', 'de', 'un', 'bosque', 'antiguo', 'una', 'tierra', 'misteriosa', 'e', 'intacta', 'esconde', 'secretos', 'de', 'los', 'tiempos']
Original: Este bosque, conocido solo por el viejo búho sabio y las criaturas que habitan sus oscuros recovecos, guarda la llave de un mundo mágico donde el tiempo se detiene y la naturaleza habla.
Processed: ['Este', 'bosque', 'conocido', 'solo', 'por', 'el', 'viejo', 'búho', 'sabio', 'y', 'las', 'criaturas', 'que', 'habitan', 'sus', 'oscuros', 'recovecos', 'guarda', 'la', 'llave', 'de', 'un', 'mundo', 'mágico', 'donde', 'el', 'tiempo', 'se', 'detiene', 'y', 'la', 'naturaleza', 'habla']


#### The code reads a Telugu text file, splits it into sentences, cleans each by removing non-Telugu characters (using Unicode ranges), splits them into words, and prints both the original and processed versions of each sentence.

In [5]:
import re

# Read the Telugu text line by line. The context manager closes the file handle even if
# reading raises, which the bare open() call did not guarantee.
with open("TeluguText.txt", "r", encoding='utf-8') as file:
    filedata = file.readlines()

# Only the first line (paragraph) of the file is summarised. An empty file yields an empty
# paragraph instead of an IndexError on filedata[0].
first_paragraph = filedata[0].strip() if filedata else ""
# Split into sentences on a period followed by whitespace. A period at the very end of the
# paragraph is not consumed by this pattern, so it stays on the last "Original" sentence.
article = re.split(r'\.\s+', first_paragraph)

sentences = []
for sentence in article:
    if sentence.strip():  # Ignore fragments that contain nothing but whitespace
        print("Original:", sentence)
        # Replace every run of characters outside the Telugu Unicode block (U+0C00-U+0C7F)
        # with a single space, then tokenise on whitespace.
        cleaned_sentence = re.sub(r"[^\u0C00-\u0C7F]+", " ", sentence).split()
        sentences.append(cleaned_sentence)
        print("Processed:", cleaned_sentence)
    else:
        print("Skipping empty or whitespace-only sentence.")


Original: తెలుగు భాష అందమైనది
Processed: ['తెలుగు', 'భాష', 'అందమైనది']
Original: ఇది భారతదేశంలో వాడబడుతున్న ఒక ప్రముఖ భాష.
Processed: ['ఇది', 'భారతదేశంలో', 'వాడబడుతున్న', 'ఒక', 'ప్రముఖ', 'భాష']


In [6]:
# Show the tokenised sentences that the similarity and ranking steps below operate on.
# NOTE: every loading cell above rebinds `sentences`, so only the most recently run one
# (the Telugu text in this run) is kept.
print("Sentences are ", sentences)

Sentences are  [['తెలుగు', 'భాష', 'అందమైనది'], ['ఇది', 'భారతదేశంలో', 'వాడబడుతున్న', 'ఒక', 'ప్రముఖ', 'భాష']]


#### The function sentence_similarity calculates the cosine similarity between two sentences by first creating word-frequency vectors for both sentences and then computing the cosine similarity (1 − cosine distance) between those vectors.

In [7]:
from scipy.spatial.distance import cosine

def sentence_similarity(sent1, sent2):
    """Return the cosine similarity between two tokenised sentences.

    Each sentence is turned into a word-count vector over the combined vocabulary
    of both sentences, and the similarity is 1 minus the cosine distance between
    the two vectors.

    Args:
        sent1: Sequence of word tokens for the first sentence.
        sent2: Sequence of word tokens for the second sentence.

    Returns:
        The cosine similarity of the two count vectors, or 0 when either sentence
        has no tokens (the cosine of an all-zero vector is undefined).
    """
    # Guard first: an empty sentence would give an all-zero vector and a division by zero.
    if len(sent1) == 0 or len(sent2) == 0:
        return 0

    # Map each distinct word to a fixed vector position. dict.fromkeys keeps first-seen
    # order, so the layout is deterministic (a set's order is not), and the dict lookup
    # replaces the linear list.index() scan that was done for every token.
    word_position = {
        word: position
        for position, word in enumerate(dict.fromkeys([*sent1, *sent2]))
    }

    vector1 = [0] * len(word_position)
    vector2 = [0] * len(word_position)

    # Count word occurrences for each sentence.
    for w in sent1:
        vector1[word_position[w]] += 1
    for w in sent2:
        vector2[word_position[w]] += 1

    # scipy's cosine() is a distance, so similarity is its complement.
    return 1 - cosine(vector1, vector2)


#### The code calculates and displays a similarity matrix for a list of sentences by applying the sentence_similarity function to each pair of sentences, excluding comparisons with themselves.

In [8]:
# Square matrix with one row and one column per sentence, initialised to zero.
sentence_count = len(sentences)
similarity_matrix = np.zeros((sentence_count, sentence_count))

# Fill every off-diagonal cell with the pairwise similarity; the diagonal stays 0
# because a sentence is never compared with itself.
for idx1, first_sentence in enumerate(sentences):
    for idx2, second_sentence in enumerate(sentences):
        if idx1 != idx2:
            similarity_matrix[idx1][idx2] = sentence_similarity(first_sentence, second_sentence)

print("Smilarity matrix \n", similarity_matrix)


Smilarity matrix 
 [[0.         0.23570226]
 [0.23570226 0.        ]]


#### The code creates a graph from a similarity matrix and computes the PageRank scores for each sentence, then prints these scores. 

In [9]:
# Step 3 - Rank sentences in similarity matrix
# Treat the similarity matrix as the adjacency matrix of a graph with one node per sentence.
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
# PageRank returns a dict that maps each node (the sentence index) to its score.
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.49999999999999994, 1: 0.49999999999999994}


#### The code sorts sentences based on their PageRank scores from highest to lowest and prints the ranked list, combining each sentence's score with its original text for clarity.

In [10]:
# Step 4 - Pair every sentence with its PageRank score and order them best-first.
# Sorting on the score alone (rather than on the whole tuple) means equal scores are never
# resolved by comparing the token lists. sorted() is stable even with reverse=True, so
# sentences with the same score keep their original document order.
ranked_sentence = sorted(
    ((scores[i], s) for i, s in enumerate(sentences)),
    key=lambda scored: scored[0],
    reverse=True,
)
print("Indexes of top ranked_sentence order are \n\n", ranked_sentence)


Indexes of top ranked_sentence order are 

 [(0.49999999999999994, ['తెలుగు', 'భాష', 'అందమైనది']), (0.49999999999999994, ['ఇది', 'భారతదేశంలో', 'వాడబడుతున్న', 'ఒక', 'ప్రముఖ', 'భాష'])]


#### The code prompts the user to specify how many top-ranked sentences to include in a summary, then constructs the summary by concatenating the specified number of highest-ranked sentences.

In [11]:
# Step 5 - Ask how many sentences the summary should contain.
n = int(input("How many sentences do you want in the summary? "))

# Clamp the request to what is actually available: asking for more sentences than were
# ranked used to raise IndexError, and a negative count now simply gives an empty summary.
top_n = max(0, min(n, len(ranked_sentence)))

# Re-join the tokens of each selected sentence into plain text, best-ranked first.
summarize_text = [" ".join(words) for score, words in ranked_sentence[:top_n]]

How many sentences do you want in the summary? 2


#### The code outputs the final summarized text by concatenating the selected top-ranked sentences into a cohesive paragraph.

In [12]:
# Step 6 - Output the summarized text: the selected sentences joined with ". "
# (no period is appended after the last sentence).
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 తెలుగు భాష అందమైనది. ఇది భారతదేశంలో వాడబడుతున్న ఒక ప్రముఖ భాష
