<a href="https://colab.research.google.com/github/Seanabuklau/LLM_Document_Summariser/blob/main/COR_LAW_2215.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Install prerequisite libraries**

In [None]:
# Install requisite libraries
# Each line shells out to pip from the notebook; `-q` keeps installer output quiet.
!pip install spacy -q
!pip install nltk -q
!pip install langchain -q
# NOTE(review): shapely is the only version-pinned package and the only install
# run without -q — presumably deliberate; confirm why 1.8.5 is required.
!pip install "shapely==1.8.5"
!pip install google-cloud-aiplatform --upgrade -q
!pip install tiktoken -q
!pip install transformers -q
!pip install PyPDF2 -q
!pip install scikit-learn -q



**2. Pre-Processing**

*Step 1: Parse Judgement Case*

In [None]:
import PyPDF2

# Path to the judgement PDF to summarise. It is left empty here and must be
# filled in before running, otherwise open() below fails.
pdf_file_path = ""
extracted_text = ""

# Extract the text page by page. Pages are joined with a newline so the last
# word of one page is not fused with the first word of the next; plain
# concatenation would corrupt tokens and sentence boundaries at page breaks.
with open(pdf_file_path, "rb") as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(reader.pages)
    page_texts = []
    for page in reader.pages:
        # `or ""` keeps the join safe if a page yields no text.
        page_texts.append(page.extract_text() or "")

extracted_text = "\n".join(page_texts)

*Step 2: Sentence Tokenization*

In [None]:
import spacy

# Load the spaCy language model
# NOTE(review): this notebook's install cell only pip-installs `spacy`; the
# en_core_web_sm model is assumed to already be present in the runtime — confirm.
nlp = spacy.load("en_core_web_sm")

# Process the text with spaCy
# Note: the name `doc` is rebound to a list of LangChain Documents in the
# post-processing cell, so this spaCy object is only valid up to that point.
doc = nlp(extracted_text)

# Extract sentences from the processed text
# `sentences` is a plain list of strings, one per span yielded by `doc.sents`.
sentences = [sent.text for sent in doc.sents]

*Step 3: Sentence Cleansing*

In [None]:
import re

def clean_sentences(sentences):
    """Strip non-alphanumeric characters and normalise whitespace per sentence.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list of cleaned sentences in input order. Sentences that are empty
        after cleaning are dropped.
    """
    cleaned_sentences = []
    for sentence in sentences:
        # Remove everything except ASCII letters, digits and whitespace.
        clean_sentence = re.sub(r"[^a-zA-Z0-9\s]", "", sentence)
        # Remove extra whitespace: collapse internal runs (including newlines)
        # to a single space and trim both ends. A bare strip() would leave
        # the internal runs untouched.
        clean_sentence = " ".join(clean_sentence.split())
        if clean_sentence:
            cleaned_sentences.append(clean_sentence)
    return cleaned_sentences

# Clean every tokenized sentence; the result feeds the stop-word removal / lemmatization step.
cleaned_sentences = clean_sentences(sentences)

*Step 4: Stop Word Removal and Lemmatization*

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora used below: the stop-word list and WordNet (for the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(cleaned_sentences):
    """Lower-case, drop English stop words, lemmatize, and join into one string.

    Args:
        cleaned_sentences: Iterable of already-cleaned sentence strings.

    Returns:
        A single space-separated string of the lemmatized, non-stop-word
        tokens from all sentences, in their original order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    english_stop_words = set(stopwords.words("english"))

    # Tokenize every sentence first (lower-cased, whitespace-split).
    tokenized = []
    for sentence in cleaned_sentences:
        tokenized.append(sentence.lower().split())

    # Flatten, skipping stop words and lemmatizing what remains.
    lemmas = []
    for tokens in tokenized:
        for token in tokens:
            if token in english_stop_words:
                continue
            lemmas.append(wordnet_lemmatizer.lemmatize(token))

    return " ".join(lemmas)

# Collapse all cleaned sentences into one lower-cased, stop-word-free, lemmatized string.
preprocessed_text = preprocess_text(cleaned_sentences)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**3. In-Processing**

In [None]:
import langchain
import vertexai
from langchain.llms import VertexAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain, LLMChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.document_loaders import WebBaseLoader

# Authentication
# Authenticates the notebook user through google.colab, then initialises the
# Vertex AI SDK for the given project and region.
# NOTE(review): "GCP_PROJECT_ID" looks like a placeholder — replace with a real project id.
from google.colab import auth as google_auth
google_auth.authenticate_user()
vertexai.init(project="GCP_PROJECT_ID", location="asia-southeast1")

# LLM Selection
# One shared LLM instance: it backs the map chain, the reduce chain and the
# refine chain in the post-processing cell.
llm = VertexAI(
    model_name="text-bison@001",
    max_output_tokens=1024,
    temperature=0.3,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

# Initialise Map Objects
# The map prompt is applied to each chunk; the {preprocessed_text} placeholder
# must match the document_variable_name given to MapReduceDocumentsChain below.
map_template = """The following is a court judgement case:
{preprocessed_text}
Based on this, please summarise the case as much as possible while ensuring it is as verbose as possible.
Include the case facts, issue(s), court's ruling and explanation, laws used and the case outcome
"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Initialise Reduce Objects
# The reduce prompt merges the per-chunk summaries; the {doc_summaries}
# placeholder must match the document_variable_name given to StuffDocumentsChain below.
reduce_template = """The following is a set of summaries:
{doc_summaries}
Take these and distill it into a final, consolidated summary.
Include the case facts, issue(s), court's ruling and explanation, laws used and the case outcome
"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Initialise StuffDocumentsChain:
# -> This takes a list of documents, combines them into a single string, and passes it to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Initialise Iterator Reducer
# -> Combines and iteratively reduces the mapped documents with the help of StuffDocumentsChain
# The same StuffDocumentsChain is used both for the final combine and for the
# intermediate collapse step.
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=8000,
)

# Initialise Map Reduce Chain
# -> Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="preprocessed_text",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

# Split the single preprocessed string into Document chunks for the map step.
# NOTE(review): chunk_size/chunk_overlap are in the splitter's length units
# (characters by default, per the LangChain docs), while token_max on the
# reduce chain is in tokens — confirm each chunk fits the model's input limit.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=20000, chunk_overlap=10)
chunks = text_splitter.create_documents([preprocessed_text])

# Initiate Map Reduce Chain
# `summary` is the consolidated map-reduce output; later cells read it.
summary = map_reduce_chain.run(chunks)

**4. Post-Processing**

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.schema.document import Document

# Wrap the map-reduce summary in a single LangChain Document for the refine chain.
doc = [Document(page_content=summary, metadata={"source": "local"})]

# Prompt for the refine step. The refine chain built by load_summarize_chain
# supplies the running summary as `existing_answer` and the next document as
# `text`, so the template must use exactly those placeholder names. A template
# with any other placeholder fails with a missing-input error as soon as more
# than one document is passed.
refine_template = (
    "Your job is to produce a final summary\n"
    "We have provided an existing summary up to a certain point:\n"
    "{existing_answer}"
    "\nYou have the opportunity to refine the existing summary"
    " (only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
)
refine_prompt = PromptTemplate.from_template(refine_template)

# Build the refine-style summarisation chain on the shared LLM.
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    refine_prompt=refine_prompt,
    input_key="input_documents",
    output_key="output_text",
)

# `result` is a dict; the refined summary is under the "output_text" key.
result = chain({"input_documents": doc}, return_only_outputs=True)

**5. Evaluation**

1. Jaccard Similarity Score

In [None]:
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer

# Parse Benchmark text
# The `with` block closes the file on exit, so no explicit close() is needed.
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale.
with open("[COMPARE] sample_law_case_3_updated.txt", "r", encoding="utf-8") as f:
    benchmark_text = f.read()


# Tokenize the text into words or tokens
# Build the analyzer once and reuse it so both texts are tokenized identically.
vectorizer = CountVectorizer()
analyzer = vectorizer.build_analyzer()

# NOTE(review): this scores `summary` (the map-reduce output), not the refined
# `result` produced by the post-processing cell — confirm that is intended.
tokens1 = set(analyzer(summary))
tokens2 = set(analyzer(benchmark_text))

# Calculate the Jaccard similarity score: |intersection| / |union|.
# If both token sets are empty the union is empty too; report 0.0 instead of
# raising ZeroDivisionError.
token_union = tokens1.union(tokens2)
if token_union:
    jaccard_similarity = len(tokens1.intersection(tokens2)) / len(token_union)
else:
    jaccard_similarity = 0.0

print(f"Jaccard Similarity: {jaccard_similarity:.2f}")


Jaccard Similarity: 0.12


2. Cosine Similarity Score

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents to TF-IDF vectors
# Row 0 is the generated summary, row 1 is the benchmark text.
# NOTE(review): this scores `summary` (the map-reduce output), not the refined
# `result` produced by the post-processing cell — confirm that is intended.
tfidf_matrix = tfidf_vectorizer.fit_transform([summary, benchmark_text])

# Calculate cosine similarity between the documents
cosine_similarity_score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

# The score is read out with [0][0], i.e. the single entry of the pairwise result.
print(f"Cosine Similarity between summary and benchmark_text: {cosine_similarity_score[0][0]:.2f}")


Cosine Similarity between summary and benchmark_text: 0.75
