Connecting Drive to Colab for files access


In [None]:
import os
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


Libraries Installation

In [None]:
!pip install pydub
!pip install PyPDF2==3.0.1
!pip install pdfplumber
!pip install transformers torch pdfplumber
!pip install textblob==0.19.0
!python -m textblob.download_corpora

Collecting PyPDF2==3.0.1
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m204.8/232.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.3

Prprocessinf of Text and
Text Extraction using BERT


1.   Extraction using BERT Question Answering (If extraction is less than 400 goto 2)
2.   Rule based Extraction (If combination of 1 and 2 is less than 400 goto 3)
3.   Extraction from PDF document related to key allegations and all



In [None]:
import os
import re
import torch
import pdfplumber
from transformers import BertTokenizer, BertForQuestionAnswering

# Load BERT model & tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = " ".join(page.extract_text() for page in pdf.pages if page.extract_text())
    return text

# Remove URLs & unnecessary text
def clean_text(text):
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'Page\s*\d+', '', text)  # Remove page numbers
    text = re.sub(r'\s+', ' ', text.strip())  # Remove excessive spaces
    return text

# Sliding Window Tokenization for BERT (Handles Long Documents)
def split_text_sliding_window(text, max_length=512, overlap=50):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = " ".join(words[i:i + max_length])
        chunks.append(chunk)
        i += max_length - overlap  # Sliding window (overlapping text)
    return chunks

# Extract judgment text using BERT
def extract_judgment_with_bert(context):
    chunks = split_text_sliding_window(context)
    answers = []

    for chunk in chunks:
        inputs = tokenizer.encode_plus("final judgment", chunk, add_special_tokens=True, return_tensors="pt", truncation=True, max_length=512)
        input_ids = inputs["input_ids"]
        outputs = model(**inputs)
        start_index = torch.argmax(outputs.start_logits)
        end_index = torch.argmax(outputs.end_logits)
        answer_tokens = input_ids[0][start_index:end_index + 1]
        answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
        answers.append(answer)

    # Merge extracted chunks & check word count
    extracted_text = " ".join(answers).strip()
    return extracted_text

# Rule-based fallback if BERT fails (prioritize summary or key arguments)
def rule_based_fallback(text, word_limit=900):
    # Keywords for identifying summary or key arguments
    summary_keywords = [
        "summary", "conclusion", "key argument", "final decision", "decision",
        "court's findings", "main points", "court ruled", "reasoning", "holding"
    ]

    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Extract sentences that contain any of the summary or key argument keywords
    extracted_sentences = [s for s in sentences if any(keyword in s.lower() for keyword in summary_keywords)]

    if not extracted_sentences:
        print("Fallback: No key sections found. Extracting first 900 words as fallback.")
        extracted_sentences = sentences[:word_limit]

    extracted_text = " ".join(extracted_sentences)
    return extracted_text

# Extract fallback content if both BERT & rule-based extraction fail
def fallback_to_text_length_based(text, min_words=500, max_words=900):
    words = text.split()
    if len(words) >= min_words and len(words) <= max_words:
        return text  # If the text is within the required word count
    else:
        print(f"Fallback: Extracting a portion of the document between {min_words} and {max_words} words.")
        return " ".join(words[:max_words]) if len(words) > max_words else text

# Ensure the final text is between 500 and 900 words
def ensure_word_count(text, min_words=500, max_words=900):
    words = text.split()
    if len(words) < min_words:
        print(f"Final Fallback: Text is less than {min_words} words, increasing content.")
        return " ".join(words[:max_words])  # Take up to max_words if text is too short
    elif len(words) > max_words:
        return " ".join(words[:max_words])  # Trim if it's too long
    return text

# Process PDFs & save extracted judgments
def process_pdfs_in_folder(input_folder, output_folder, file_list):
    os.makedirs(output_folder, exist_ok=True)

    for pdf_file in file_list:
        pdf_path = os.path.join(input_folder, pdf_file)
        print(f"Processing file: {pdf_file}")

        raw_text = extract_text_from_pdf(pdf_path)
        if not raw_text.strip():
            print(f"Skipping {pdf_file} as no extractable text was found.")
            continue

        cleaned_text = clean_text(raw_text)

        # First extraction: BERT model
        judgment_text_bert = extract_judgment_with_bert(cleaned_text)

        # Check if BERT extraction is less than 500 words
        words_bert = judgment_text_bert.split()
        if len(words_bert) < 500:
            print(f"First extraction (BERT) is less than 500 words. Applying rule-based fallback.")
            # Second extraction: Rule-based fallback
            judgment_text_rule_based = rule_based_fallback(cleaned_text)
            combined_text = judgment_text_bert + " " + judgment_text_rule_based
        else:
            combined_text = judgment_text_bert

        # If combined text is still less than 500 words, fallback to length-based extraction
        words_combined = combined_text.split()
        if len(words_combined) < 500:
            print(f"Combined text is still less than 500 words. Applying length-based extraction.")
            judgment_text_length_based = fallback_to_text_length_based(cleaned_text)
            combined_text = combined_text + " " + judgment_text_length_based

        # Ensure the final text is between 500 and 900 words
        final_text = ensure_word_count(combined_text)

        # Save the extracted text
        output_file = os.path.join(output_folder, f"{os.path.splitext(pdf_file)[0]}.txt")
        with open(output_file, "w") as file:
            file.write(final_text)

        print(f"Processed & saved: {output_file}")

# Get all PDF files in the folder (no limit)
def get_all_files(input_folder):
    pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.pdf')]
    return pdf_files

# Paths
input_folder = "/content/drive/MyDrive/Dataset/Case_Files/PDFs"
output_folder = "/content/extracted_text"

# Get all PDF files
all_files = get_all_files(input_folder)

# Process PDFs and save to extracted_text folder
process_pdfs_in_folder(input_folder, output_folder, all_files)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Processing file: Case_1.pdf
First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_1.txt
Processing file: Case_2.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_2.txt
Processing file: Case_3.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_3.txt
Processing file: Case_4.pdf
First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_4.txt
Processing file: Case_5.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_5.txt
Processing file: Case_6.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_6.txt
Processing file: Case_7.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_7.txt
Processing file: Case_8.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_8.txt
Processing file: Case_9.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_9.txt
Processing file: Case_10.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_10.txt
Processing file: Case_11.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Processed & saved: /content/extracted_text/Case_11.txt
Processing file: Case_12.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_12.txt
Processing file: Case_13.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_13.txt
Processing file: Case_14.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_14.txt
Processing file: Case_15.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Final Fallback: Text is less than 500 words, increasing content.
Processed & saved: /content/extracted_text/Case_15.txt
Processing file: Case_16.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_16.txt
Processing file: Case_17.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_17.txt
Processing file: Case_18.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_18.txt
Processing file: Case_19.pdf
First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Final Fallback: Text is less than 500 words, increasing content.
Processed & saved: /content/extracted_text/Case_19.txt
Processing file: Case_20.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_20.txt
Processing file: Case_21.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_21.txt
Processing file: Case_22.pdf
First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_22.txt
Processing file: Case_23.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_23.txt
Processing file: Case_24.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_24.txt
Processing file: Case_25.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Processed & saved: /content/extracted_text/Case_25.txt
Processing file: Case_26.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_26.txt
Processing file: Case_27.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_27.txt
Processing file: Case_28.pdf
First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_28.txt
Processing file: Case_29.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_29.txt
Processing file: Case_30.pdf
First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Final Fallback: Text is less than 500 words, increasing content.
Processed & saved: /content/extracted_text/Case_30.txt
Processing file: Case_31.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_31.txt
Processing file: Case_32.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_32.txt
Processing file: Case_33.pdf
First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_33.txt
Processing file: Case_34.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Processed & saved: /content/extracted_text/Case_34.txt
Processing file: Case_35.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_35.txt
Processing file: Case_36.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_36.txt
Processing file: Case_37.pdf
First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_37.txt
Processing file: Case_38.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_38.txt
Processing file: Case_39.pdf
First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_39.txt
Processing file: Case_40.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_40.txt
Processing file: Case_41.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_41.txt
Processing file: Case_42.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_42.txt
Processing file: Case_43.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Fallback: No key sections found. Extracting first 900 words as fallback.
Processed & saved: /content/extracted_text/Case_43.txt
Processing file: Case_44.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_44.txt
Processing file: Case_45.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_45.txt
Processing file: Case_46.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_46.txt
Processing file: Case_47.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_47.txt
Processing file: Case_48.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_48.txt
Processing file: Case_49.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


First extraction (BERT) is less than 500 words. Applying rule-based fallback.
Combined text is still less than 500 words. Applying length-based extraction.
Fallback: Extracting a portion of the document between 500 and 900 words.
Processed & saved: /content/extracted_text/Case_49.txt
Processing file: Case_50.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_50.txt
Processing file: Case_51.pdf


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Processed & saved: /content/extracted_text/Case_51.txt


Post processing of extracted text

1.   Grammar Correction using TextBlob model
2.   Text cleaing (Removing unwanted characters, IndianKanoon word, tariling full stops after heading, excessive spaces)




In [None]:
import os
import re
from textblob import TextBlob

# Correct grammar using TextBlob (for spelling and punctuation)
def correct_grammar(text):
    corrected_text = TextBlob(text).correct()
    return str(corrected_text)

# Clean text by removing unwanted characters (brackets, special chars, etc.)
def clean_extracted_text(text):
    # Remove unwanted characters like brackets, etc.
    text = re.sub(r'[^\w\s,.-]', '', text)  # Remove unwanted characters (e.g., brackets)
    text = re.sub(r'\[.*?\]', '', text)  # Remove anything within square brackets
    text = re.sub(r'\s+', ' ', text.strip())  # Remove excessive spaces
    text = re.sub(r'\s+\.\s*$', '', text)  # Remove trailing full stops after headings
    text = re.sub(r'IndianKanoon\s*-\s*', '', text)  # Remove "Indian Kanoon" references

    return text

# Ensure text meets the word count limits (min 300, max 800 words)
def enforce_word_count(text, min_words=300, max_words=800):
    word_list = text.split()

    # Truncate if too long
    if len(word_list) > max_words:
        text = ' '.join(word_list[:max_words])
    # Ensure text is at least 200 words
    elif len(word_list) < min_words:
        # We won't add padding text. If the text is less than min_words, just return it as it is.
        print(f"Warning: Text has less than {min_words} words. This may be problematic.")

    return text

# Process extracted text and save it
def process_and_save_text(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    all_files = os.listdir(input_folder)

    for file in all_files:
        # Skip if it's a directory
        if os.path.isdir(os.path.join(input_folder, file)):
            continue

        with open(os.path.join(input_folder, file), 'r') as f:
            text = f.read()

        # Clean and correct grammar
        cleaned_text = clean_extracted_text(text)
        corrected_text = correct_grammar(cleaned_text)

        # Ensure word count is within range (min 300, max 800)
        final_text = enforce_word_count(corrected_text, min_words=300, max_words=800)

        # Save processed text
        with open(os.path.join(output_folder, file), 'w') as f:
            f.write(final_text)
        print(f"Processed & saved: {file}")

# Paths for processed text
output_folder = "/content/postprocessed_text"

# Process the extracted text and save it in processed_text folder, without excluding any files
process_and_save_text("/content/extracted_text", output_folder)


Processed & saved: Case_49.txt
Processed & saved: Case_13.txt
Processed & saved: Case_3.txt
Processed & saved: Case_10.txt
Processed & saved: Case_4.txt
Processed & saved: Case_2.txt
Processed & saved: Case_41.txt
Processed & saved: Case_15.txt
Processed & saved: Case_38.txt
Processed & saved: Case_27.txt
Processed & saved: Case_16.txt
Processed & saved: Case_17.txt
Processed & saved: Case_19.txt
Processed & saved: Case_34.txt
Processed & saved: Case_46.txt
Processed & saved: Case_36.txt
Processed & saved: Case_30.txt
Processed & saved: Case_50.txt


This code is just to check the file count in a folder

In [None]:
import os

def count_files_in_directory(directory_path):
    try:
        # List all files and directories in the given path
        files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
        # Count the number of files
        return len(files)
    except FileNotFoundError:
        return f"The directory {directory_path} does not exist."
    except Exception as e:
        return str(e)

# Replace with the path to your directory
directory_path = '/content/drive/MyDrive/Dataset/Case_Files/Final_audio_files'
file_count = count_files_in_directory(directory_path)
print(f"Number of files in the directory: {file_count}")


Number of files in the directory: 51


Checks for files that are not in word count range

In [None]:
import os

# Function to calculate word count of a file
def get_word_count(file_path):
    with open(file_path, 'r') as f:
        text = f.read()
    words = text.split()
    return len(words)

# Function to check files based on word count
def check_files_for_word_count(input_folder, min_words=400, max_words=1000):
    files_out_of_range = []

    # Get all files in the folder
    all_files = os.listdir(input_folder)

    for file in all_files:
        file_path = os.path.join(input_folder, file)

        # Check if it's a text file
        if os.path.isfile(file_path) and file.lower().endswith('.txt'):
            word_count = get_word_count(file_path)

            # Check if the word count is out of the specified range
            if word_count < min_words or word_count > max_words:
                files_out_of_range.append((file, word_count))

    return files_out_of_range

# Paths
input_folder = "/content/drive/MyDrive/Dataset/Case_Files/PT"  # Set to the folder containing the text files

# Get files with word count less than 300 or greater than 800
files_out_of_range = check_files_for_word_count(input_folder)

# Print the result
if files_out_of_range:
    print("Files with word count out of range (less than 300 or more than 800 words):")
    for file, word_count in files_out_of_range:
        print(f"{file}: {word_count} words")
else:
    print("All files are within the word count range.")


All files are within the word count range.


In [None]:
!pip install transformers language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.9.0-py3-none-any.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading language_tool_python-2.9.0-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: language-tool-python
Successfully installed language-tool-python-2.9.0


GTTS Model installation

In [None]:
!pip install gTTS==2.5.4
!pip install pydub==0.25.1

Collecting gTTS==2.5.4
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.5.4
Collecting pydub==0.25.1
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


Google Text to sppech conversion of extracted text files

In [None]:
import os
import time  # Importing time module for delay
from gtts import gTTS
from pydub import AudioSegment

def generate_audio_from_text_file(processed_text_file_path, output_audio_file_path):
    """
    Generates audio from a text file and saves it as an audio file.
    """
    # Ensure the processed text file exists
    if not os.path.exists(processed_text_file_path):
        print(f"Error: File '{processed_text_file_path}' not found.")
        return

    # Read the processed text file
    with open(processed_text_file_path, 'r', encoding='utf-8') as file:
        processed_text_lines = file.readlines()

    # Combine all lines into one text block
    final_judgment_text = " ".join([line.strip() for line in processed_text_lines if line.strip()])

    if not final_judgment_text:
        print("Error: No valid text to convert into speech.")
        return

    # Convert text to speech
    tts = gTTS(text=final_judgment_text, lang="en", slow=False)

    # Save the audio as an MP3 file first
    temp_mp3 = "temp_audio.mp3"
    tts.save(temp_mp3)

    # Convert MP3 to FLAC using pydub
    sound = AudioSegment.from_mp3(temp_mp3)
    sound.export(output_audio_file_path, format="flac")
    os.remove(temp_mp3)  # Clean up temporary MP3 file

    print(f"Audio saved as '{output_audio_file_path}'")

def process_all_text_files(input_text_folder, output_audio_folder):
    """
    Processes all judgment text files in the input folder and saves the corresponding audio files in the output folder.
    """
    # Check if the output folder exists, if not, create it
    os.makedirs(output_audio_folder, exist_ok=True)

    # Process all text files in the input folder
    all_files = os.listdir(input_text_folder)

    for text_file in all_files:
        # Only process .txt files
        if not text_file.endswith('.txt'):
            continue

        text_file_path = os.path.join(input_text_folder, text_file)

        # Ensure the file exists before proceeding
        if not os.path.exists(text_file_path):
            print(f"Error: File '{text_file}' not found in the input folder.")
            continue

        # Remove the .txt extension from the text file name
        audio_file_name = os.path.splitext(text_file)[0]

        # Output audio file path using the same name as the text file
        output_audio_file_path = os.path.join(output_audio_folder, f"{audio_file_name}.flac")

        # Check if the audio file already exists, if yes, skip it
        if os.path.exists(output_audio_file_path):
            print(f"Audio for '{text_file}' already exists. Skipping.")
            continue

        # Generate audio from the text file and save it
        generate_audio_from_text_file(text_file_path, output_audio_file_path)

        # Introduce a 5-second delay after saving each audio file
        time.sleep(5)

# Path to the input folder containing processed judgment text files
input_text_folder = "/content/drive/MyDrive/Dataset/Case_Files/postprocessed_text"  # Update with your folder path

# Path to the output folder for saving audio files
output_audio_folder = "/content/drive/MyDrive/Dataset/Case_Files/FAF"  # Update with your folder path

# Process all judgment text files and generate audio files
process_all_text_files(input_text_folder, output_audio_folder)


Audio saved as '/content/drive/MyDrive/Dataset/Case_Files/FAF/Case_7.flac'
Audio saved as '/content/drive/MyDrive/Dataset/Case_Files/FAF/Case_39.flac'
Audio saved as '/content/drive/MyDrive/Dataset/Case_Files/FAF/Case_6.flac'
Audio saved as '/content/drive/MyDrive/Dataset/Case_Files/FAF/Case_28.flac'
Audio saved as '/content/drive/MyDrive/Dataset/Case_Files/FAF/Case_4.flac'
Audio for 'Case_17.txt' already exists. Skipping.
Audio saved as '/content/drive/MyDrive/Dataset/Case_Files/FAF/Case_19.flac'
Audio saved as '/content/drive/MyDrive/Dataset/Case_Files/FAF/Case_30.flac'
Audio saved as '/content/drive/MyDrive/Dataset/Case_Files/FAF/Case_1.flac'


Calculates the whole duration of audio files in a folder

In [None]:
import os
from pydub.utils import mediainfo

def get_audio_duration(file_path):
    """
    Returns the duration of the audio file in seconds.
    """
    info = mediainfo(file_path)
    return float(info['duration'])

def calculate_total_duration(folder_path):
    total_duration = 0  # Initialize total duration in seconds

    # Loop through all files in the folder
    for filename in os.listdir(folder_path):
        # Only consider .flac files
        if filename.endswith('.flac'):
            file_path = os.path.join(folder_path, filename)
            total_duration += get_audio_duration(file_path)  # Add duration of this file

    return total_duration

def convert_seconds_to_hms(total_seconds):
    """
    Converts total seconds to hours, minutes, and seconds.
    """
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    return hours, minutes, seconds

# Example usage:
folder_path = '/content/drive/MyDrive/Dataset/Case_Files/FAF'  # Replace with your folder path
total_duration = calculate_total_duration(folder_path)

# Convert total duration to hours, minutes, and seconds
hours, minutes, seconds = convert_seconds_to_hms(total_duration)

print(f"Total duration of all .flac files: {int(hours)} hours, {int(minutes)} minutes, and {int(seconds)} seconds")


Total duration of all .flac files: 5 hours, 20 minutes, and 46 seconds


In [None]:
!pip install pydub pyvad soundfile librosa
!sudo apt-get install ffmpeg

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting pyvad
  Downloading pyvad-0.2.0-py3-none-any.whl.metadata (2.5 kB)
Collecting librosa
  Downloading librosa-0.9.2-py3-none-any.whl.metadata (8.2 kB)
Collecting webrtcvad<3.0.0,>=2.0.10 (from pyvad)
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting resampy>=0.2.2 (from librosa)
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading pyvad-0.2.0-py3-none-any.whl (4.7 kB)
Downloading librosa-0.9.2-py3-none-any.whl (214 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.3/214.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


Whisper Model installation

In [None]:
!pip install whisper
!pip install git+https://github.com/openai/whisper.git

Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py) ... [?25l[?25hdone
  Created wheel for whisper: filename=whisper-1.1.10-py3-none-any.whl size=41120 sha256=1a69e0da355a152a42e44ddc0e5a529bcb6ffd5969a2d6d0a40d0a1e74f620b8
  Stored in directory: /root/.cache/pip/wheels/21/65/ee/4e6672aabfa486d3341a39a04f8f87c77e5156149299b5a7d0
Successfully built whisper
Installing collected packages: whisper
Successfully installed whisper-1.1.10
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-_4rp8ev_
  Running command git clone --filter=bl

Transcribing audio files using Whisper 'Medium' model

In [None]:
import os
import whisper

# Load the Whisper model
model = whisper.load_model("medium")

def transcribe_audio(audio_path):
    """
    Transcribes the audio file using Whisper.
    """
    print(f"Transcribing: {audio_path}")
    result = model.transcribe(audio_path)
    return result['text']

def process_audio_files_in_folder(input_folder, output_folder):
    """
    Processes all audio files in the input folder and saves only the transcriptions in the output folder.
    """
    # Check if the output folder exists, if not, create it
    os.makedirs(output_folder, exist_ok=True)

    # Get all audio files in the input folder (filtering for FLAC files)
    audio_files = [f for f in os.listdir(input_folder) if f.endswith('.flac')]

    # Process each audio file
    for audio_file in audio_files:
        audio_file_path = os.path.join(input_folder, audio_file)

        # Remove the .flac extension from the audio file name to create the output file name
        base_name = os.path.splitext(audio_file)[0]  # Remove the extension

        # Define the output file path for the transcription
        transcription_output_path = os.path.join(output_folder, f"{base_name}.txt")

        # 1. Transcribe Audio using Whisper
        transcribed_text = transcribe_audio(audio_file_path)

        # Save the transcription text to the output folder
        with open(transcription_output_path, 'w', encoding='utf-8') as file:
            file.write(transcribed_text)

        print(f"Processed file {audio_file} and saved transcription in {transcription_output_path}")

# Input folder containing .flac files
input_folder = "/content/drive/MyDrive/Dataset/Case_Files/FAF"  # Update with your input folder path

# Output folder where transcription text files will be saved
output_folder = "/content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files"  # Update with your output folder path

# Process the audio files and save transcriptions only
process_audio_files_in_folder(input_folder, output_folder)


100%|██████████████████████████████████████| 1.42G/1.42G [00:10<00:00, 151MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_33.flac




Processed file Case_33.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_33.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_35.flac




Processed file Case_35.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_35.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_40.flac




Processed file Case_40.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_40.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_9.flac




Processed file Case_9.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_9.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_37.flac




Processed file Case_37.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_37.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_31.flac




Processed file Case_31.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_31.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_44.flac




Processed file Case_44.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_44.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_25.flac




Processed file Case_25.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_25.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_42.flac




Processed file Case_42.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_42.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_24.flac




Processed file Case_24.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_24.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_32.flac




Processed file Case_32.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_32.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_21.flac




Processed file Case_21.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_21.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_18.flac




Processed file Case_18.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_18.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_15.flac




Processed file Case_15.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_15.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_38.flac




Processed file Case_38.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_38.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_27.flac




Processed file Case_27.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_27.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_17.flac




Processed file Case_17.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_17.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_7.flac




Processed file Case_7.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_7.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_39.flac




Processed file Case_39.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_39.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_6.flac




Processed file Case_6.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_6.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_28.flac




Processed file Case_28.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_28.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_4.flac




Processed file Case_4.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_4.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_19.flac




Processed file Case_19.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_19.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_30.flac




Processed file Case_30.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_30.txt
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_1.flac




Processed file Case_1.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files/Case_1.txt


Vosk model installation

In [None]:
!wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
!unzip vosk-model-small-en-us-0.15.zip -d /content
!pip install vosk

--2025-03-11 05:46:59--  https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
Resolving alphacephei.com (alphacephei.com)... 188.40.21.16, 2a01:4f8:13a:279f::2
Connecting to alphacephei.com (alphacephei.com)|188.40.21.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41205931 (39M) [application/zip]
Saving to: ‘vosk-model-small-en-us-0.15.zip’


2025-03-11 05:47:01 (21.4 MB/s) - ‘vosk-model-small-en-us-0.15.zip’ saved [41205931/41205931]

Archive:  vosk-model-small-en-us-0.15.zip
   creating: /content/vosk-model-small-en-us-0.15/
   creating: /content/vosk-model-small-en-us-0.15/am/
  inflating: /content/vosk-model-small-en-us-0.15/am/final.mdl  
   creating: /content/vosk-model-small-en-us-0.15/graph/
  inflating: /content/vosk-model-small-en-us-0.15/graph/disambig_tid.int  
  inflating: /content/vosk-model-small-en-us-0.15/graph/HCLr.fst  
  inflating: /content/vosk-model-small-en-us-0.15/graph/Gr.fst  
   creating: /content/vosk-model-small-

Transcribing audio files using Vosk 'vosk-model-small-en-us-0.15' model

In [None]:
import os
import wave
import json
from pydub import AudioSegment
from vosk import Model, KaldiRecognizer

# Load the Vosk model
model = Model("vosk-model-small-en-us-0.15")  # Make sure to provide the correct model path

def convert_flac_to_wav(input_file, output_file):
    """
    Converts FLAC audio file to WAV format using pydub.
    """
    audio = AudioSegment.from_file(input_file, format="flac")
    audio.export(output_file, format="wav")
    print(f"Converted {input_file} to {output_file}")
    return output_file

def transcribe_audio(audio_path):
    """
    Transcribes the audio file using Vosk.
    """
    print(f"Transcribing: {audio_path}")

    # Open audio file
    wf = wave.open(audio_path, "rb")
    rec = KaldiRecognizer(model, wf.getframerate())

    transcription = []

    # Read audio and process it
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            transcription.append(result.get('text', ''))

    # Final transcription
    final_result = json.loads(rec.FinalResult())
    transcription.append(final_result.get('text', ''))

    return " ".join(transcription)

def process_audio_files_in_folder(input_folder, output_folder):
    """
    Processes the first 10 audio files in the input folder and saves only the transcriptions in the output folder.
    """
    # Check if the output folder exists, if not, create it
    os.makedirs(output_folder, exist_ok=True)

    # Get all audio files in the input folder (filtering for FLAC files)
    audio_files = [f for f in os.listdir(input_folder) if f.endswith('.flac')]

    # Process each audio file
    for audio_file in audio_files:
        audio_file_path = os.path.join(input_folder, audio_file)

        # Remove the .flac extension from the audio file name to create the output file name
        base_name = os.path.splitext(audio_file)[0]  # Remove the extension

        # Define the output file path for the WAV conversion
        wav_file_path = os.path.join(output_folder, f"{base_name}.wav")

        # 1. Convert FLAC to WAV
        convert_flac_to_wav(audio_file_path, wav_file_path)

        # 2. Transcribe Audio using Vosk
        transcribed_text = transcribe_audio(wav_file_path)

        # Define the output file path for the transcription
        transcription_output_path = os.path.join(output_folder, f"{base_name}.txt")

        # Save the transcription text to the output folder
        with open(transcription_output_path, 'w', encoding='utf-8') as file:
            file.write(transcribed_text)

        print(f"Processed file {audio_file} and saved transcription in {transcription_output_path}")

# Input folder containing .flac files
input_folder = "/content/drive/MyDrive/Dataset/Case_Files/FAF"  # Update with your input folder path

# Output folder where transcription text files will be saved
output_folder = "/content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files"  # Update with your output folder path

# Process the first 10 audio files and save transcriptions only
process_audio_files_in_folder(input_folder, output_folder)


Converted /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_33.flac to /content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files/Case_33.wav
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files/Case_33.wav
Processed file Case_33.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files/Case_33.txt
Converted /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_35.flac to /content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files/Case_35.wav
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files/Case_35.wav
Processed file Case_35.flac and saved transcription in /content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files/Case_35.txt
Converted /content/drive/MyDrive/Dataset/Case_Files/FAF/Case_40.flac to /content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files/Case_40.wav
Transcribing: /content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files/Case_40.wav
Processed file Case_40.flac and saved transcription in /

Wav2vac2.0 libraries installation

In [None]:
!pip install torch torchaudio transformers librosa

Transcribing audio files using Wav2vac2.0 model

In [None]:
import os
import torch
import warnings
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf
import librosa
from concurrent.futures import ProcessPoolExecutor
import re

# Suppress warning for weights initialization issues
warnings.filterwarnings("ignore", message="Some weights of Wav2Vec2ForCTC were not initialized")

# Load the Wav2Vec 2.0 model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Define the target sampling rate (16kHz for Wav2Vec2)
TARGET_SAMPLING_RATE = 16000

def transcribe_audio(audio_path):
    """
    Transcribes the audio file using Wav2Vec 2.0.
    """
    try:
        print(f"Transcribing: {audio_path}")

        # Read the audio file and resample to the required sampling rate
        audio_input, original_sr = sf.read(audio_path)
        if original_sr != TARGET_SAMPLING_RATE:
            audio_input = librosa.resample(audio_input, orig_sr=original_sr, target_sr=TARGET_SAMPLING_RATE)

        # Process the audio for the model
        inputs = processor(audio_input, return_tensors="pt", sampling_rate=TARGET_SAMPLING_RATE, padding=True)

        # Get model predictions
        with torch.no_grad():
            logits = model(input_values=inputs.input_values).logits

        # Get the predicted ids from logits
        predicted_ids = torch.argmax(logits, dim=-1)

        # Decode the predicted ids to text
        transcription = processor.decode(predicted_ids[0])

        return transcription
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None

def process_audio_files_in_folder(input_folder, output_folder):
    """
    Processes the audio files in the input folder and saves transcriptions in the output folder.
    """
    # Check if the output folder exists, if not, create it
    os.makedirs(output_folder, exist_ok=True)

    # Get all audio files in the input folder (filtering for FLAC files)
    audio_files = [f for f in os.listdir(input_folder) if f.endswith('.flac')]

    # Using ProcessPoolExecutor to transcribe files in parallel
    with ProcessPoolExecutor(max_workers=1) as executor:  # Reduced max_workers for better stability
        # Process each audio file in parallel
        futures = []
        for audio_file in audio_files:
            audio_file_path = os.path.join(input_folder, audio_file)

            # Remove the .flac extension from the audio file name to create the output file name
            base_name = os.path.splitext(audio_file)[0]  # Remove the extension

            # Define the output file path for the transcription
            transcription_output_path = os.path.join(output_folder, f"{base_name}.txt")

            # Submit transcription task to the executor
            futures.append(executor.submit(transcribe_and_save, audio_file_path, transcription_output_path))

        # Wait for all tasks to complete
        for future in futures:
            try:
                future.result()  # Get the result (or raise any exception)
            except Exception as e:
                print(f"Error processing file: {e}")

def transcribe_and_save(audio_file_path, transcription_output_path):
    """
    Transcribes an audio file and saves the transcription to a file.
    """
    try:
        # 1. Transcribe Audio using Wav2Vec 2.0
        transcribed_text = transcribe_audio(audio_file_path)

        if transcribed_text:  # Check if transcription was successful
            # 2. Convert the transcription to a more readable form (fix case sensitivity)
            corrected_text = correct_case(transcribed_text)

            # 3. Save the transcription text to the output folder
            with open(transcription_output_path, 'w', encoding='utf-8') as file:
                file.write(corrected_text)

            print(f"Processed file {audio_file_path} and saved transcription in {transcription_output_path}")
        else:
            print(f"Skipping {audio_file_path} due to transcription error.")
    except Exception as e:
        print(f"Error transcribing file {audio_file_path}: {e}")

def correct_case(text):
    """
    Corrects the case of the transcription. This function aims to make the transcription more readable.
    It capitalizes the first word of each sentence and leaves the rest in lowercase.
    """
    # Convert to lowercase
    text = text.lower()

    # Capitalize the first letter of each sentence
    text = re.sub(r'([.!?]\s+|^)([a-z])', lambda match: match.group(1) + match.group(2).upper(), text)

    return text

# Input folder containing .flac files
input_folder = "/content/drive/MyDrive/Dataset/Case_Files/final_audio_files"  # Update with your input folder path

# Output folder where transcription text files will be saved
output_folder = "/content/wav2vec2.0_transcribed_text"  # Update with your output folder path

# Process the audio files and save transcriptions only
process_audio_files_in_folder(input_folder, output_folder)

Installation for evaluation metrics

In [None]:
!pip install nltk editdistance sacrebleu
!pip install jiwer

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


Comparing 2 files

In [None]:
import difflib
import jiwer
import nltk
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

# Make sure to download necessary NLTK data
nltk.download('punkt')

def compare_files(file1_path, file2_path):
    """
    Compare two transcription text files and calculate:
    - Similarity ratio (difflib)
    - Word Error Rate (WER)
    - Character Error Rate (CER)
    - BLEU score
    """
    # Read the contents of both files
    with open(file1_path, 'r', encoding='utf-8') as file1:
        text1 = file1.read()

    with open(file2_path, 'r', encoding='utf-8') as file2:
        text2 = file2.read()

    # 1. Similarity ratio using difflib
    sequence_matcher = difflib.SequenceMatcher(None, text1, text2)
    similarity_ratio = sequence_matcher.ratio() * 100
    print(f"Similarity ratio: {similarity_ratio:.2f}%")

    # 2. Word Error Rate (WER) using jiwer
    wer = jiwer.wer(text1, text2)
    print(f"Word Error Rate (WER): {wer:.4f}")

    # 3. Character Error Rate (CER)
    def cer(reference, hypothesis):
        """Calculate the Character Error Rate"""
        ref = list(reference.replace(" ", ""))
        hyp = list(hypothesis.replace(" ", ""))
        distance = np.sum([1 for a, b in zip(ref, hyp) if a != b])
        return distance / float(len(ref))

    cer_value = cer(text1, text2)
    print(f"Character Error Rate (CER): {cer_value:.4f}")

    # 4. BLEU score (using sentence_bleu from nltk)
    reference = text1.split()  # Reference text (split into words)
    hypothesis = text2.split()  # Hypothesis text (split into words)
    bleu_score = sentence_bleu([reference], hypothesis)  # BLEU score computation
    print(f"BLEU score: {bleu_score:.4f}")


# Example usage
file1 = "/content/drive/MyDrive/Dataset/Case_Files/PT/Case_2.txt"  # Replace with the path to your first transcription file
file2 = "/content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files/Case_2.txt"  # Replace with the path to your second transcription file

compare_files(file1, file2)

Similarity ratio: 3.78%
Word Error Rate (WER): 0.2300
Character Error Rate (CER): 0.9232
BLEU score: 0.6239


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Comparing 2 folders (The text is converted to lower text and then compared)(postprocessed text and Wav2vac2.0 transcribed text)

In [None]:
import os
import editdistance
import sacrebleu
import string

# Function to compute Word Error Rate (WER)
def compute_wer(reference, hypothesis):
    reference_words = reference.split()
    hypothesis_words = hypothesis.split()
    return editdistance.eval(reference_words, hypothesis_words) / len(reference_words)

# Function to compute Character Error Rate (CER)
def compute_cer(reference, hypothesis):
    return editdistance.eval(reference, hypothesis) / len(reference)

# Function to compute BLEU score
def compute_bleu(reference, hypothesis):
    bleu = sacrebleu.corpus_bleu([hypothesis], [[reference]])
    return bleu.score

# Function to preprocess the text (remove punctuation and convert to lowercase)
def preprocess_text(text):
    # Convert text to lowercase and remove punctuation
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

# Read the original text and transcribed text
def read_text_files(original_file, transcribed_file):
    with open(original_file, 'r') as f:
        original_text = f.read().strip()

    with open(transcribed_file, 'r') as f:
        transcribed_text = f.read().strip()

    return original_text, transcribed_text

# Function to evaluate metrics for a single file
def evaluate_single_file(reference, hypothesis):
    # Preprocess the reference and hypothesis text
    reference = preprocess_text(reference)
    hypothesis = preprocess_text(hypothesis)

    # Compute WER
    wer = compute_wer(reference, hypothesis)

    # Compute CER
    cer = compute_cer(reference, hypothesis)

    # Compute BLEU score
    bleu_score = compute_bleu(reference, hypothesis)

    return wer, cer, bleu_score

# Main function to evaluate metrics for all matching files in two folders
def evaluate_metrics(input_folder, output_folder):
    # Get all text files in the input folder
    input_files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]

    # Initialize variables to accumulate total scores
    total_wer = 0
    total_cer = 0
    total_bleu = 0
    file_count = 0

    # Loop through each file in the input folder
    for input_file in input_files:
        # Create the corresponding output file path
        input_file_path = os.path.join(input_folder, input_file)
        output_file_path = os.path.join(output_folder, input_file)

        # Check if the corresponding file exists in the output folder
        if os.path.exists(output_file_path):
            # Read original and transcribed text
            reference, hypothesis = read_text_files(input_file_path, output_file_path)

            # Print the file name being processed
            print(f"Evaluating file: {input_file}")

            # Evaluate and accumulate metrics for the current file
            wer, cer, bleu_score = evaluate_single_file(reference, hypothesis)
            total_wer += wer
            total_cer += cer
            total_bleu += bleu_score
            file_count += 1

            # Print evaluation metrics for the current file
            print(f"Word Error Rate (WER): {wer * 100:.2f}%")
            print(f"Character Error Rate (CER): {cer * 100:.2f}%")
            print(f"BLEU score: {bleu_score:.2f}")
            print("-" * 50)
        else:
            print(f"Warning: The file '{input_file}' is missing in the output folder.")

    # Calculate average metrics
    if file_count > 0:
        avg_wer = total_wer / file_count
        avg_cer = total_cer / file_count
        avg_bleu = total_bleu / file_count

        # Print average results
        print("\nAverage Metrics Across All Files:")
        print(f"Average Word Error Rate (WER): {avg_wer * 100:.2f}%")
        print(f"Average Character Error Rate (CER): {avg_cer * 100:.2f}%")
        print(f"Average BLEU score: {avg_bleu:.2f}")
    else:
        print("No files were processed.")

# Example usage:
# Replace '/path/to/extracted_text' and '/path/to/transcribed_text' with actual folder paths
input_folder = '/content/drive/MyDrive/Dataset/Case_Files/PT'  # Folder with original extracted texts
output_folder = '/content/drive/MyDrive/Dataset/Case_Files/wav2vac2.0_text_25Files'  # Folder with transcribed texts

evaluate_metrics(input_folder, output_folder)


Evaluating file: Case_9.txt
Word Error Rate (WER): 79.90%
Character Error Rate (CER): 32.75%
BLEU score: 15.99
--------------------------------------------------
Evaluating file: Case_37.txt
Word Error Rate (WER): 97.54%
Character Error Rate (CER): 61.58%
BLEU score: 20.16
--------------------------------------------------
Evaluating file: Case_31.txt
Word Error Rate (WER): 53.11%
Character Error Rate (CER): 22.72%
BLEU score: 29.24
--------------------------------------------------
Evaluating file: Case_44.txt
Word Error Rate (WER): 63.99%
Character Error Rate (CER): 34.94%
BLEU score: 23.75
--------------------------------------------------
Evaluating file: Case_25.txt
Word Error Rate (WER): 34.09%
Character Error Rate (CER): 16.87%
BLEU score: 51.53
--------------------------------------------------
Evaluating file: Case_42.txt
Word Error Rate (WER): 53.25%
Character Error Rate (CER): 29.33%
BLEU score: 27.92
--------------------------------------------------
Evaluating file: Case_2

Comparing 2 folders (The text is converted to lower text and then compared)(postprocessed text and whisper transcribed text)

In [None]:
import os
import editdistance
import sacrebleu
import string

# Function to compute Word Error Rate (WER)
def compute_wer(reference, hypothesis):
    reference_words = reference.split()
    hypothesis_words = hypothesis.split()
    return editdistance.eval(reference_words, hypothesis_words) / len(reference_words)

# Function to compute Character Error Rate (CER)
def compute_cer(reference, hypothesis):
    return editdistance.eval(reference, hypothesis) / len(reference)

# Function to compute BLEU score
def compute_bleu(reference, hypothesis):
    bleu = sacrebleu.corpus_bleu([hypothesis], [[reference]])
    return bleu.score

# Function to preprocess the text (remove punctuation and convert to lowercase)
def preprocess_text(text):
    # Convert text to lowercase and remove punctuation
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

# Read the original text and transcribed text
def read_text_files(original_file, transcribed_file):
    with open(original_file, 'r') as f:
        original_text = f.read().strip()

    with open(transcribed_file, 'r') as f:
        transcribed_text = f.read().strip()

    return original_text, transcribed_text

# Function to evaluate metrics for a single file
def evaluate_single_file(reference, hypothesis):
    # Preprocess the reference and hypothesis text
    reference = preprocess_text(reference)
    hypothesis = preprocess_text(hypothesis)

    # Compute WER
    wer = compute_wer(reference, hypothesis)

    # Compute CER
    cer = compute_cer(reference, hypothesis)

    # Compute BLEU score
    bleu_score = compute_bleu(reference, hypothesis)

    return wer, cer, bleu_score

# Main function to evaluate metrics for all matching files in two folders
def evaluate_metrics(input_folder, output_folder):
    # Get all text files in the input folder
    input_files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]

    # Initialize variables to accumulate total scores
    total_wer = 0
    total_cer = 0
    total_bleu = 0
    file_count = 0

    # Loop through each file in the input folder
    for input_file in input_files:
        # Create the corresponding output file path
        input_file_path = os.path.join(input_folder, input_file)
        output_file_path = os.path.join(output_folder, input_file)

        # Check if the corresponding file exists in the output folder
        if os.path.exists(output_file_path):
            # Read original and transcribed text
            reference, hypothesis = read_text_files(input_file_path, output_file_path)

            # Print the file name being processed
            print(f"Evaluating file: {input_file}")

            # Evaluate and accumulate metrics for the current file
            wer, cer, bleu_score = evaluate_single_file(reference, hypothesis)
            total_wer += wer
            total_cer += cer
            total_bleu += bleu_score
            file_count += 1

            # Print evaluation metrics for the current file
            print(f"Word Error Rate (WER): {wer * 100:.2f}%")
            print(f"Character Error Rate (CER): {cer * 100:.2f}%")
            print(f"BLEU score: {bleu_score:.2f}")
            print("-" * 50)
        else:
            print(f"Warning: The file '{input_file}' is missing in the output folder.")

    # Calculate average metrics
    if file_count > 0:
        avg_wer = total_wer / file_count
        avg_cer = total_cer / file_count
        avg_bleu = total_bleu / file_count

        # Print average results
        print("\nAverage Metrics Across All Files:")
        print(f"Average Word Error Rate (WER): {avg_wer * 100:.2f}%")
        print(f"Average Character Error Rate (CER): {avg_cer * 100:.2f}%")
        print(f"Average BLEU score: {avg_bleu:.2f}")
    else:
        print("No files were processed.")

# Example usage:
# Replace '/path/to/extracted_text' and '/path/to/transcribed_text' with actual folder paths
input_folder = '/content/drive/MyDrive/Dataset/Case_Files/PT'  # Folder with original extracted texts
output_folder = '/content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files'  # Folder with transcribed texts

evaluate_metrics(input_folder, output_folder)


Evaluating file: Case_9.txt
Word Error Rate (WER): 20.73%
Character Error Rate (CER): 8.78%
BLEU score: 70.78
--------------------------------------------------
Evaluating file: Case_37.txt
Word Error Rate (WER): 23.77%
Character Error Rate (CER): 7.52%
BLEU score: 60.46
--------------------------------------------------
Evaluating file: Case_31.txt
Word Error Rate (WER): 4.94%
Character Error Rate (CER): 1.41%
BLEU score: 91.26
--------------------------------------------------
Evaluating file: Case_44.txt
Word Error Rate (WER): 8.78%
Character Error Rate (CER): 1.89%
BLEU score: 85.11
--------------------------------------------------
Evaluating file: Case_25.txt
Word Error Rate (WER): 3.08%
Character Error Rate (CER): 0.85%
BLEU score: 94.14
--------------------------------------------------
Evaluating file: Case_42.txt
Word Error Rate (WER): 6.24%
Character Error Rate (CER): 1.49%
BLEU score: 88.21
--------------------------------------------------
Evaluating file: Case_24.txt
Word

Comparing 2 folders (The text is converted to lower text and then compared)(postprocessed text and Vosk transcribed text)

In [None]:
import os
import editdistance
import sacrebleu
import string

# Function to compute Word Error Rate (WER)
def compute_wer(reference, hypothesis):
    reference_words = reference.split()
    hypothesis_words = hypothesis.split()
    return editdistance.eval(reference_words, hypothesis_words) / len(reference_words)

# Function to compute Character Error Rate (CER)
def compute_cer(reference, hypothesis):
    return editdistance.eval(reference, hypothesis) / len(reference)

# Function to compute BLEU score
def compute_bleu(reference, hypothesis):
    bleu = sacrebleu.corpus_bleu([hypothesis], [[reference]])
    return bleu.score

# Function to preprocess the text (remove punctuation and convert to lowercase)
def preprocess_text(text):
    # Convert text to lowercase and remove punctuation
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

# Read the original text and transcribed text
def read_text_files(original_file, transcribed_file):
    with open(original_file, 'r') as f:
        original_text = f.read().strip()

    with open(transcribed_file, 'r') as f:
        transcribed_text = f.read().strip()

    return original_text, transcribed_text

# Function to evaluate metrics for a single file
def evaluate_single_file(reference, hypothesis):
    # Preprocess the reference and hypothesis text
    reference = preprocess_text(reference)
    hypothesis = preprocess_text(hypothesis)

    # Compute WER
    wer = compute_wer(reference, hypothesis)

    # Compute CER
    cer = compute_cer(reference, hypothesis)

    # Compute BLEU score
    bleu_score = compute_bleu(reference, hypothesis)

    return wer, cer, bleu_score

# Main function to evaluate metrics for all matching files in two folders
def evaluate_metrics(input_folder, output_folder):
    # Get all text files in the input folder
    input_files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]

    # Initialize variables to accumulate total scores
    total_wer = 0
    total_cer = 0
    total_bleu = 0
    file_count = 0

    # Loop through each file in the input folder
    for input_file in input_files:
        # Create the corresponding output file path
        input_file_path = os.path.join(input_folder, input_file)
        output_file_path = os.path.join(output_folder, input_file)

        # Check if the corresponding file exists in the output folder
        if os.path.exists(output_file_path):
            # Read original and transcribed text
            reference, hypothesis = read_text_files(input_file_path, output_file_path)

            # Print the file name being processed
            print(f"Evaluating file: {input_file}")

            # Evaluate and accumulate metrics for the current file
            wer, cer, bleu_score = evaluate_single_file(reference, hypothesis)
            total_wer += wer
            total_cer += cer
            total_bleu += bleu_score
            file_count += 1

            # Print evaluation metrics for the current file
            print(f"Word Error Rate (WER): {wer * 100:.2f}%")
            print(f"Character Error Rate (CER): {cer * 100:.2f}%")
            print(f"BLEU score: {bleu_score:.2f}")
            print("-" * 50)
        else:
            print(f"Warning: The file '{input_file}' is missing in the output folder.")

    # Calculate average metrics
    if file_count > 0:
        avg_wer = total_wer / file_count
        avg_cer = total_cer / file_count
        avg_bleu = total_bleu / file_count

        # Print average results
        print("\nAverage Metrics Across All Files:")
        print(f"Average Word Error Rate (WER): {avg_wer * 100:.2f}%")
        print(f"Average Character Error Rate (CER): {avg_cer * 100:.2f}%")
        print(f"Average BLEU score: {avg_bleu:.2f}")
    else:
        print("No files were processed.")

# Example usage:
# Replace '/path/to/extracted_text' and '/path/to/transcribed_text' with actual folder paths
input_folder = '/content/drive/MyDrive/Dataset/Case_Files/PT'  # Folder with original extracted texts
output_folder = '/content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files'  # Folder with transcribed texts

evaluate_metrics(input_folder, output_folder)


Evaluating file: Case_9.txt
Word Error Rate (WER): 56.26%
Character Error Rate (CER): 28.37%
BLEU score: 45.28
--------------------------------------------------
Evaluating file: Case_37.txt
Word Error Rate (WER): 84.43%
Character Error Rate (CER): 60.06%
BLEU score: 29.74
--------------------------------------------------
Evaluating file: Case_31.txt
Word Error Rate (WER): 20.53%
Character Error Rate (CER): 15.09%
BLEU score: 72.50
--------------------------------------------------
Evaluating file: Case_44.txt
Word Error Rate (WER): 41.73%
Character Error Rate (CER): 28.91%
BLEU score: 55.69
--------------------------------------------------
Evaluating file: Case_25.txt
Word Error Rate (WER): 21.97%
Character Error Rate (CER): 14.48%
BLEU score: 71.48
--------------------------------------------------
Evaluating file: Case_42.txt
Word Error Rate (WER): 35.99%
Character Error Rate (CER): 26.30%
BLEU score: 51.74
--------------------------------------------------
Evaluating file: Case_2

In [None]:
!pip install pandas openpyxl



Code that saves all the reults in a excel file

In [None]:
import os
import editdistance
import sacrebleu
import string
import pandas as pd

# Function to compute Word Error Rate (WER)
def compute_wer(reference, hypothesis):
    reference_words = reference.split()
    hypothesis_words = hypothesis.split()
    return editdistance.eval(reference_words, hypothesis_words) / len(reference_words)

# Function to compute Character Error Rate (CER)
def compute_cer(reference, hypothesis):
    return editdistance.eval(reference, hypothesis) / len(reference)

# Function to compute BLEU score
def compute_bleu(reference, hypothesis):
    bleu = sacrebleu.corpus_bleu([hypothesis], [[reference]])
    return bleu.score

# Function to preprocess the text (remove punctuation and convert to lowercase)
def preprocess_text(text):
    # Convert text to lowercase and remove punctuation
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

# Read the original text and transcribed text
def read_text_files(original_file, transcribed_file):
    with open(original_file, 'r') as f:
        original_text = f.read().strip()

    with open(transcribed_file, 'r') as f:
        transcribed_text = f.read().strip()

    return original_text, transcribed_text

# Function to evaluate metrics for a single file
def evaluate_single_file(reference, hypothesis):
    # Preprocess the reference and hypothesis text
    reference = preprocess_text(reference)
    hypothesis = preprocess_text(hypothesis)

    # Compute WER
    wer = compute_wer(reference, hypothesis)

    # Compute CER
    cer = compute_cer(reference, hypothesis)

    # Compute BLEU score
    bleu_score = compute_bleu(reference, hypothesis)

    return wer, cer, bleu_score

# Function to evaluate metrics for a single model and store results
def evaluate_metrics(input_folder, output_folder, model_name, evaluation_data):
    # Get all text files in the input folder
    input_files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]

    # Loop through each file in the input folder
    for input_file in input_files:
        # Create the corresponding output file path
        input_file_path = os.path.join(input_folder, input_file)
        output_file_path = os.path.join(output_folder, input_file)

        # Check if the corresponding file exists in the output folder
        if os.path.exists(output_file_path):
            # If the file doesn't exist in evaluation_data, initialize it
            if input_file not in evaluation_data:
                evaluation_data[input_file] = {}

            # Read original and transcribed text
            reference, hypothesis = read_text_files(input_file_path, output_file_path)

            # Evaluate and accumulate metrics for the current file
            wer, cer, bleu_score = evaluate_single_file(reference, hypothesis)

            # Store the result in the evaluation data under the model's columns
            evaluation_data[input_file][model_name] = {
                'WER': wer * 100,
                'CER': cer * 100,
                'BLEU': bleu_score
            }
        else:
            print(f"Warning: The file '{input_file}' is missing in the output folder.")

# Main function to evaluate all models and save results to a single file
def evaluate_all_models(input_folder, whisper_output_folder, vosk_output_folder, wav2vec_output_folder):
    # Initialize an empty dictionary to store evaluation results for all models
    evaluation_data = {}

    # Evaluate results for each model and append to the evaluation_data dictionary
    evaluate_metrics(input_folder, whisper_output_folder, "Whisper", evaluation_data)
    evaluate_metrics(input_folder, vosk_output_folder, "Vosk", evaluation_data)
    evaluate_metrics(input_folder, wav2vec_output_folder, "Wav2Vec2.0", evaluation_data)

    # If we have data to save
    if evaluation_data:
        # Prepare data in the format suitable for creating the DataFrame
        data = []
        for file_name, metrics in evaluation_data.items():
            row = {'File Name': file_name}
            for model, model_metrics in metrics.items():
                row[f'{model} WER'] = model_metrics['WER']
                row[f'{model} CER'] = model_metrics['CER']
                row[f'{model} BLEU'] = model_metrics['BLEU']
            data.append(row)

        # Create a DataFrame from the evaluation data
        df = pd.DataFrame(data)

        # Save the DataFrame to an Excel file
        output_excel_file = '/content/comparison_results.xlsx'
        df.to_excel(output_excel_file, index=False, engine='openpyxl')

        print(f"Results saved to {output_excel_file}")
    else:
        print("No files were processed.")

# Example usage:
# Replace '/path/to/extracted_text' and '/path/to/transcribed_text' with actual folder paths
input_folder = '/content/drive/MyDrive/Dataset/Case_Files/PT'  # Folder with original extracted texts
whisper_output_folder = '/content/drive/MyDrive/Dataset/Case_Files/whisper_text_25Files'  # Folder with Whisper transcribed texts
vosk_output_folder = '/content/drive/MyDrive/Dataset/Case_Files/vosk_text_25Files'  # Folder with Vosk transcribed texts
wav2vec_output_folder = '/content/drive/MyDrive/Dataset/Case_Files/wav2vac2.0_text_25Files'  # Folder with wav2vec transcribed texts

# Evaluate all models and save the results to a single file
evaluate_all_models(input_folder, whisper_output_folder, vosk_output_folder, wav2vec_output_folder)


Results saved to /content/comparison_results.xlsx


In [None]:
import pandas as pd

# Load the dataset
file_path = "/content/comparison_results.xlsx"  # Update with your actual file path
df = pd.read_excel(file_path)

# Display basic statistics
print(df.describe())

              WER         CER  BLEU Score
count  153.000000  153.000000  153.000000
mean    41.020133   22.981340   52.406504
std     25.108442   16.249775   22.530530
min      3.080082    0.845277   10.557728
25%     20.532319    7.584329   32.325785
50%     37.700535   21.417017   52.380023
75%     60.294118   37.411095   70.670207
max     99.478488   61.579347   94.140398


In [None]:
# Find the best performing case based on BLEU Score
best_case = df.loc[df["BLEU Score"].idxmax()]
print("Best Performing Case:")
print(best_case)

Best Performing Case:
File Name     Case_25.txt
Model             Whisper
WER              3.080082
CER              0.845277
BLEU Score      94.140398
Name: 4, dtype: object
