In [1]:
import numpy as np
import pandas as pd
import spacy
import torch
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Evaluation matrices, rogue score etc

from sklearn.cluster import KMeans
from collections import defaultdict
from string import punctuation
from gensim import corpora, models
import networkx as nx
from collections import defaultdict
# Abstractive Summarization with Seq2Seq
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [2]:
# Load the English language model
nlp = spacy.load("en_core_web_lg")

df = pd.read_csv("./Text_Summarization/BigBangTheory.csv", encoding='latin-1')

# Assuming your text corpus is in a column named "text"
text_corpus = df["document"].tolist()
#text_corpus = df["document"].str.encode('utf-8')

# Custom stop words
custom_stop_words = {"$", "£", "€"}  # Define additional stop words as needed

# Custom tokenizer to handle compound words
tokenizer = Tokenizer(nlp.vocab)


In [3]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [4]:

def preprocess_and_annotate(corpus):
    annotated_corpus = []
    for text in corpus:
        # Tokenize text with custom tokenizer to handle compound words
        doc = tokenizer(text)
        
        # Remove stop words, lemmatize, and extract POS tags
        tokens = []
        pos_tags = []
        for token in doc:
            if token.text.lower() not in STOP_WORDS.union(custom_stop_words) and token.text not in punctuation:
                tokens.append((token.text, token.lemma_))
                pos_tags.append((token.text, token.pos_))
        
        # Extract Named Entities
        entities = [(ent.text, ent.label_) for ent in nlp(text).ents]
        
        # Word embeddings
        word_embeddings = np.array([token.vector for token in doc if not token.is_stop])
        
        annotated_corpus.append({
            'text': text,
            'tokens': tokens,
            'pos_tags': pos_tags,
            'entities': entities,
            'word_embeddings' : word_embeddings
        })
    return annotated_corpus

preprocessed_corpus = []

# Preprocess and annotate text corpus
annotated_corpus = preprocess_and_annotate(text_corpus)
text_filtered = []
for annotation in annotated_corpus:
    y=[]
    y.append(annotation['tokens'])
    for x in annotation['tokens']:
        #print(x[0])
        preprocessed_corpus.append(x[0])
        
    text_filtered.append(preprocessed_corpus)
        


In [5]:

# Topic Modeling with LDA
dictionary = corpora.Dictionary(text_filtered)
#corpus = [dictionary.doc2bow(text.split()) for text in text_filtered]
corpus = [dictionary.doc2bow(text) for text in text_filtered]
lda_model = models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=5)


In [6]:
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}")

Topic: 0 
Words: 0.062*" " + 0.026*"Big" + 0.023*"Bang" + 0.017*"universe" + 0.013*"cosmic" + 0.011*"years" + 0.011*"models" + 0.011*"" + 0.010*"billion" + 0.008*"expansion"
Topic: 1 
Words: 0.043*" " + 0.020*"Big" + 0.017*"Bang" + 0.015*"universe" + 0.010*"years" + 0.009*"models" + 0.009*"" + 0.008*"cosmic" + 0.008*"known" + 0.007*"light"
Topic: 2 
Words: 0.075*" " + 0.025*"universe" + 0.023*"Bang" + 0.020*"Big" + 0.013*"cosmic" + 0.013*"billion" + 0.010*"" + 0.010*"years" + 0.010*"models" + 0.009*"known"
Topic: 3 
Words: 0.091*" " + 0.028*"Big" + 0.019*"universe" + 0.018*"Bang" + 0.012*"cosmic" + 0.011*"" + 0.010*"models" + 0.010*"years" + 0.009*"expansion" + 0.009*"cosmological"
Topic: 4 
Words: 0.050*" " + 0.021*"Big" + 0.018*"Bang" + 0.017*"universe" + 0.011*"cosmic" + 0.011*"models" + 0.009*"years" + 0.009*"billion" + 0.008*"" + 0.007*"expansion"


T5-Small: 60 million parameters 242 mb

T5-Base: 220 million parameters 892 mb

T5-Large: 770 million parameters 2.95 gb

T5-3B: 3 billion parameters 11.4 gb

T5-11B: 11 billion parameters 45.2 gb

In [14]:
model_type ="t5-large" 

# Set the model_max_length parameter to a value suitable for your use case
model_max_length = 512

model = T5ForConditionalGeneration.from_pretrained(model_type)
tokenizer = T5Tokenizer.from_pretrained(model_type, model_max_length=model_max_length)

In [18]:
print(text_filtered)
print(len(text_filtered[0]))

[['Big', 'Bang', 'Theory', 'leading', 'explanation', 'universe', 'began.', 'Simply', 'put,', 'says', 'universe', 'know', 'started', 'infinitely', 'hot', 'dense', 'single', 'point', 'inflated', 'stretched', '\x97', 'unimaginable', 'speeds,', 'measurable', 'rate', '\x97', '13.7', 'billion', 'years', 'still-expanding', 'cosmos', 'know', 'today.Crucially,', 'models', 'compatible', '\xa0', 'Hubble\x96Lemaître', 'law\x97the', 'observation', 'farther', 'away', '\xa0', 'galaxy', '\xa0', 'is,', 'faster', 'moving', 'away', 'Earth.', 'Extrapolating', '\xa0', 'cosmic', 'expansion', '\xa0', 'backwards', 'time', 'known', '\xa0', 'laws', 'physics,', 'models', 'describe', 'increasingly', 'concentrated', 'cosmos', 'preceded', '\xa0', 'singularity', '\xa0', '\xa0', 'space', 'time', '\xa0', 'lose', 'meaning', '(typically', 'named', '"the', 'Big', 'Bang', 'singularity").[5]', '\xa0', '1964', 'CMB', 'discovered,', 'convinced', 'cosmologists', 'competing', '\xa0', 'steady-state', 'model', '\xa0', 'cosmic', 

In [20]:
# Convert each sublist to a string
formatted_inputs = ["summarize: " + " ".join(sublist) for sublist in text_filtered]

In [21]:
summaries = []
for formatted_input in formatted_inputs:
    # Tokenize the input text
    inputs = tokenizer(formatted_input, return_tensors="pt", padding=True, truncation=True)

    # Generate summary using the model
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        num_beams=2,
        early_stopping=True
    )

    # Decode the generated tokens to text
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    summaries.append(summary)

In [22]:
print(summaries)

["astronomers believe universe began 13.7 billion years ago, expanding at unimaginable speed. Big Bang theory predicts uniform background radiation caused high temperatures. existing technology doesn't allow astronomers literally peer universe's birth.", "astronomers believe universe began 13.7 billion years ago, expanding at unimaginable speed. Big Bang theory predicts uniform background radiation caused high temperatures. existing technology doesn't allow astronomers literally peer universe's birth.", "astronomers believe universe began 13.7 billion years ago, expanding at unimaginable speed. Big Bang theory predicts uniform background radiation caused high temperatures. existing technology doesn't allow astronomers literally peer universe's birth.", "astronomers believe universe began 13.7 billion years ago, expanding at unimaginable speed. Big Bang theory predicts uniform background radiation caused high temperatures. existing technology doesn't allow astronomers literally peer uni

In [17]:
inputs = tokenizer(text_filtered, return_tensors="pt", padding=True, truncation=True)

# Generate summary
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=150,
    num_beams=2,
    early_stopping=True
)


# Decode the output
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Summary:", summary)


ValueError: too many values to unpack (expected 2)

In [None]:
def generate_abstract_summary(text, max_length=100):
    input_ids = tokenizer.encode(text, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary



cosine similarity(v 
1
​
 ,v 
2
​
 )= 
∥v 
1
​
 ∥∥v 
2
​
 ∥
/(v 
1
​
 ⋅v 
2)
​
 
​
where:

In [None]:
# Graph-based Summarization with TextRank
def build_sentence_graph(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    
    sentence_vectors = [nlp(sent).vector for sent in sentences]
    similarity_matrix = np.array([[np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) for v1 in sentence_vectors] for v2 in sentence_vectors])
    
    graph = nx.from_numpy_array(similarity_matrix)
    return graph, sentences



In [None]:
text = """
The Transformers library by Hugging Face is a popular library for natural language processing (NLP) tasks. 
It provides pre-trained models for various tasks such as text generation, translation, summarization, and question answering. 
The library is built on top of the PyTorch and TensorFlow deep learning frameworks and is known for its user-friendly API and extensive documentation. 
The models in the Transformers library are state-of-the-art and have been fine-tuned on large datasets, making them highly accurate and efficient for a wide range of NLP applications.
"""

graph, sentences = build_sentence_graph(corpus)

# Now you can use the graph with a ranking algorithm, e.g., TextRank
scores = nx.pagerank(graph)

# Sort the sentences by their score
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

# Print the top-ranked sentences as the summary
summary = " ".join([s for score, s in ranked_sentences[:3]])
print("Summary:", summary)


In [None]:
def extract_textrank_summary(graph, sentences, top_n=3):
    scores = nx.pagerank(graph)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    summary_sentences = [sentences[idx] for idx, _ in sorted_scores[:top_n]]
    return ". ".join(summary_sentences) + "."



In [None]:

def summarize_section(tokens, depth, max_depth):
    if depth >= max_depth:
        return " ".join([token.text for token in tokens])
    
    summary = []
    for token in tokens:
        child_tokens = [child for child in token.children if child in tokens]
        if child_tokens:
            summary.append(summarize_section(child_tokens, depth + 1, max_depth))
        else:
            summary.append(token.text)
    return ". ".join(summary) + "."

In [None]:
# Hierarchical Summarization
def generate_hierarchical_summary(text, max_depth=2):
    doc = nlp(text)
    #return summarize_section(doc.sents, 0, max_depth)
    return summarize_section(list(doc), 0, max_depth)

In [None]:
graph, sentences = build_sentence_graph(corpus)
summary_corpus = extract_textrank_summary(graph, sentences)
summary_corpus_1 = generate_hierarchical_summary(corpus)

In [None]:
print("Graph Edges with Weights:")
for u, v, weight in graph.edges(data=True):
    print(f"({u}, {v}, {weight['weight']})")

In [None]:
print(summary_corpus)

In [None]:
print(summary_corpus_1)

In [None]:
text = "Enterprise documents like financial reports, legal contracts, project plans, etc. contain valuable insights and key details related to business operations and decisions. However, these documents tend to be much longer compared to generic corpora. Manually analyzing such long enterprise documents is incredibly time-consuming and labor-intensive. Text summarization methods can help by automatically identifying and extracting the most important information from enterprise documents and generating concise overviews. This provides analysts and decision-makers quick access to the key details without needing to read the full documents. However, standard text summarization techniques often perform poorly on enterprise documents containing unique industry-specific vocabulary and entities not found in their training data. For instance, in credit rating agencies, there are large internal repositories of documents, wikis, and knowledge transfer content developed over many years that are critical for onboarding new analysts. But reading and absorbing all the material is infeasible. Text summarization tailored to such content could help accelerate understanding. Likewise, many large enterprises have company-specific terminologies and named entities which pose challenges for off-the-shelf natural language processing tools. There is a need for customized techniques"

In [None]:
from docx import Document

# Specify the path to your .docx file
docx_file_path = './Text_Summarization/Sample.docx'

# Open the .docx file
doc = Document(docx_file_path)

# Initialize an empty string to store the contents
docx_contents = ""

# Iterate through paragraphs in the document and concatenate them to the string
for paragraph in doc.paragraphs:
    docx_contents += paragraph.text + "\n"

# Print or use the string containing the .docx file contents
print(docx_contents)

In [None]:
#text = "The first stars created bigger atoms and groups of atoms. That led to more stars being born. At the same time, galaxies were crashing and grouping together. As new stars were being born and dying, then things like asteroids, comets, planets, and black holes formed!"
summary = generate_abstract_summary(docx_contents)
print(summary)

Need to learn

1. input_ids = tokenizer.encode(text, return_tensors="pt")
2. output_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
3. summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
4. similarity_matrix = np.array([[np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) for v1 in sentence_vectors] for v2 in sentence_vectors])
5. graph = nx.from_numpy_array(similarity_matrix)

In [None]:

import gensim
from gensim import corpora
from gensim.models import LdaMulticore
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK data files (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample documents
documents = [
    "Human machine interface for lab ABC computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user-perceived response time to error measurement",
    "The generation of random, binary, unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV: Widths of trees and well-quasi-ordering",
    "Graph minors: A survey",
]

# Preprocess documents
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(doc):
    tokens = word_tokenize(doc.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return tokens

processed_docs = [preprocess(doc) for doc in documents]

# Create a dictionary and corpus
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Train LDA model using LdaMulticore
lda_model = LdaMulticore(corpus, num_topics=3, id2word=dictionary, passes=10, workers=2)

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")

# Get the topic distribution for the first document
doc_topics = lda_model.get_document_topics(corpus[0])
print(f"Topic distribution for the first document: {doc_topics}")

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pre-trained T5 model and tokenizer
model_name = 't5-small'  # You can also use 't5-base', 't5-large', etc.
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Sample input text (a long paragraph)
input_text = """
The Transformers library by Hugging Face is a popular library for natural language processing (NLP) tasks. 
It provides pre-trained models for various tasks such as text generation, translation, summarization, and question answering. 
The library is built on top of the PyTorch and TensorFlow deep learning frameworks and is known for its user-friendly API and extensive documentation. 
The models in the Transformers library are state-of-the-art and have been fine-tuned on large datasets, making them highly accurate and efficient for a wide range of NLP applications.
"""

# Preprocess the input text
input_ids = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate the summary
summary_ids = model.generate(input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)