In [1]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
import os
import PyPDF2
import pandas as pd

# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

# Find the line containing the "APPEAL NO." text
def find_appeal_text_in_pdf(text, search_string="APPEAL NO."):
    lines = text.splitlines()
    for line in lines:
        if search_string in line:
            return line.strip()
    return ""

# Extract "REPORTABLE" and "APPELLATE JURISDICTION" information from a PDF
def extract_info_from_pdf(text):
    reportable_found = "NO"
    appellate_line = ""

    lines = text.splitlines()
    for line in lines:
        if "REPORTABLE" in line:
            reportable_found = "YES"
        if "NON-REPORTABLE" in line:
            reportable_found = "NO"

        if "APPELLATE JURISDICTION" in line:
            appellate_line = line.strip()

    return reportable_found, appellate_line

# Find the line with maximum overlap with the filename
def find_max_overlap_line(text, filename):
    lines = text.splitlines()
    max_overlap = 0
    line_to_remove = ""

    for line in lines:
        overlap = len(os.path.commonprefix([line, filename]))
        if overlap > max_overlap:
            max_overlap = overlap
            line_to_remove = line

    return line_to_remove

# Remove a specific line from the extracted text
def remove_line_from_text(text, line_to_remove):
    return text.replace(line_to_remove, "")

# Process all PDF files in a directory
def process_pdfs(directory):
    data = []

    for filename in os.listdir(directory):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(directory, filename)
            text = extract_text_from_pdf(pdf_path)

            # Find and store the "APPEAL NO." line
            appeal_text = find_appeal_text_in_pdf(text)

            # Extract "REPORTABLE" and "APPELLATE JURISDICTION" information
            reportable, appellate_line = extract_info_from_pdf(text)

            # Find the line with maximum overlap with the filename
            line_to_remove = find_max_overlap_line(text, filename)

            # Remove the identified line from the text
            if line_to_remove:
                text = remove_line_from_text(text, line_to_remove)

            # Save updated text to a new file
            updated_text_path = os.path.join(directory, f"{os.path.splitext(filename)[0]}.txt")
            with open(updated_text_path, 'w') as file:
                file.write(text)

            # Add data to the list
            data.append({
                'File Name': filename,
                'Reportable': reportable,
                'Appellate Jurisdiction Line': appellate_line,
                'Appeal Text': appeal_text
            })

    # Create a DataFrame and save it to an Excel file
    df = pd.DataFrame(data)
    df.to_excel('pdf_analysis_results.xlsx', index=False)

    # Print the DataFrame to the console
    print(df)

# Example usage
directory_path = '/content/drive/MyDrive/delete/judgment'
process_pdfs(directory_path)


                                           File Name Reportable  \
0  A_M_Mohan_vs_The_State_Rep_By_Sho_on_20_March_...        YES   
1  Abdul_Jabbar_vs_The_State_Of_Haryana_on_5_Febr...        YES   

       Appellate Jurisdiction Line                         Appeal Text  
0  CRIMINAL APPELLATE JURISDICTION  CRIMINAL APPEAL NO.        OF 2024  
1  CRIMINAL APPELLATE JURISDICTION                                      


# Preprocessing & noise removal on the .txt file

In [3]:
import os
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')

# Paths
input_folder = "/content/drive/MyDrive/delete/judgment"  # Updated input folder containing .txt files
output_folder = "/content/drive/MyDrive/delete/judgment/preprocessed_judgments"

# Create output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r"\xa0", " ", text)
    text = [re.sub(r'[^a-zA-Z0-9.,)\-(/?\t ]', '', sentence) for sentence in text.split("\n")]
    text = [re.sub(r'(?<=[^0-9])/(?=[^0-9])', ' ', sentence) for sentence in text]
    text = [re.sub("\t+", " ", sentence) for sentence in text]
    text = [re.sub(" +", " ", sentence) for sentence in text]
    text = [re.sub("\.\.+", "", sentence) for sentence in text]
    text = [re.sub("\A ?", "", sentence) for sentence in text]
    text = [sentence for sentence in text if (len(sentence) != 1 and not re.fullmatch("(\d|\d\d|\d\d\d)", sentence))]
    text = [re.sub('\A\(?(\d|\d\d\d|\d\d|[a-zA-Z])(\.|\))\s?(?=[A-Z])', '\n', sentence) for sentence in text]
    text = [re.sub("\A\(([ivx]+)\)\s?(?=[a-zA-Z0-9])", '\n', sentence) for sentence in text]
    text = [sentence for sentence in text if not re.search(r"Indian Kanoon", sentence)]
    text = "\n".join(text)
    text = re.sub(r"[()[\]\"]", " ", text)  # removing ()[\]\"
    text = re.sub(r" no\.", " number", text)  # converting no., nos., co., ltd. to number, numbers, company, and limited
    text = re.sub(r" nos\.", " numbers", text)
    text = re.sub(r" co\.", " company", text)
    text = re.sub(r" ltd\.", " limited", text)

    # Remove multiple newlines
    text2 = []
    for index in range(len(text.splitlines())):
        if index > 0 and text.splitlines()[index] == '' and text.splitlines()[index - 1] == '':
            continue
        if index < len(text.splitlines()) - 1 and text.splitlines()[index + 1] != '' and text.splitlines()[index + 1][0] == '\n' and text.splitlines()[index] == '':
            continue
        text2.append(text.splitlines()[index])

    text = "\n".join(text2)

    # Ignore the text before JUDGMENT or ORDER
    lines = text.splitlines()
    start_idx = 0
    for i, line in enumerate(lines):
        if re.search(r"\A(ORDER|JUDGMENT|J U D G M E N T|O R D E R)", line):
            start_idx = i
            break

    # Start from the first occurrence of "ORDER" or "JUDGMENT"
    text = "\n".join(lines[start_idx:])

    sentences = sent_tokenize(text)
    sentences = [sentence.strip() for sentence in sentences]

    # Add a period to any sentence that does not end with one
    sentences = [sentence if sentence.endswith('.') else sentence + '.' for sentence in sentences]

    # Join sentences with a newline
    processed_text = "\n".join(sentences)

    # Check if the number of tokens is sufficient
    num_tokens = len(word_tokenize(processed_text))
    if num_tokens < 100:
        return None

    return processed_text

# Function to process .txt files
def process_txt_files(input_folder, output_folder):
    for filename in os.listdir(input_folder):
        if filename.endswith('.txt'):
            input_file_path = os.path.join(input_folder, filename)

            # Read the content of the txt file
            with open(input_file_path, 'r') as file:
                text = file.read()

            # Preprocess the text
            preprocessed_text = preprocess_text(text)
            #print(preprocessed_text)

            if preprocessed_text:
                # Save the preprocessed text to the output folder
                output_file_path = os.path.join(output_folder, filename)
                with open(output_file_path, 'w') as output_file:
                    output_file.write(preprocessed_text)
                print(f"Processed and saved: {filename}")
            else:
                print(f"Skipped {filename} due to insufficient token count.")

# Example usage
process_txt_files(input_folder, output_folder)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Processed and saved: A_M_Mohan_vs_The_State_Rep_By_Sho_on_20_March_2024.txt
Processed and saved: Abdul_Jabbar_vs_The_State_Of_Haryana_on_5_February_2024.txt


#Finding the abbriviations in the documents

In [38]:
# Define paths
input_folder = "/content/drive/MyDrive/My Dataset/2024/preprocessed_judgments"  # Replace with your input folder path
output_file = "/content/pdf_analysis_results.xlsx"  # Replace with your output file path


In [None]:
import os
import re
import pandas as pd

# Function to find unique abbreviations in a text
def find_unique_abbreviations(text):
    patterns = [
        r'\b(?:[A-Z]\.){2,}',  # U.S.A., Ph.D.
        #r'\b[A-Z]{2,}\b',      # NASA, CEO
        r'\b[A-Z][a-z]*\.',    # Inc., Ltd., Co.
        r'\b[A-Z][a-z]+\.'     # Mr., Dr., Ms.
    ]

    combined_pattern = '|'.join(patterns)
    abbreviations = re.findall(combined_pattern, text)
    unique_abbreviations = set(abbreviations)

    return unique_abbreviations

# Function to process all text files and save the abbreviations in a DataFrame
def process_text_files(input_folder, output_file):
    data = []

    # Iterate through each file in the folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_folder, filename)

            # Read the content of the file
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            # Find unique abbreviations in the text
            abbreviations = find_unique_abbreviations(text)

            # Add the results to the data list
            data.append({
                "File Name": filename,
                "Abbreviations": ", ".join(abbreviations)
            })
            print(f"Processed and saved: {filename}")

    # Create a DataFrame from the data list
    df = pd.DataFrame(data)

    # Save the DataFrame to an Excel sheet
    df.to_excel(output_file, index=False)

# Process the text files and save the results
process_text_files(input_folder, output_file)


# Finding unique abbriviations


In [43]:
import pandas as pd

# Define paths
input_file = '/content/abbriviation list.xlsx'  # Replace with your input file path
output_file = 'output_file.xlsx'  # Replace with your output file path

# Read the Excel file into a DataFrame
df = pd.read_excel(input_file)

# Ensure the column containing abbreviations is a string
df['Abbreviations'] = df['Abbreviations'].astype(str)

# Combine all abbreviations into a single list
all_abbreviations = ' '.join(df['Abbreviations'].tolist())

# Split the abbreviations by delimiter (assuming comma or space as delimiter)
abbreviation_list = all_abbreviations.split(', ')  # Adjust the delimiter if necessary

# Remove duplicates by converting to a set and then back to a list
unique_abbreviations = list(set(abbreviation_list))

# Create a DataFrame for the unique abbreviations
unique_df = pd.DataFrame(unique_abbreviations, columns=['Unique Abbreviations'])

# Save the unique abbreviations to a new Excel file
unique_df.to_excel(output_file, index=False)

print(f"Unique abbreviations have been saved to {output_file}")


Unique abbreviations have been saved to output_file.xlsx


In [42]:
unique_df

Unnamed: 0,Unique Abbreviations
0,Orchard.
1,Mussoorie.
2,Tanjore.
3,Speech.
4,B.V. T.D.R.
...,...
12208,Presidency.
12209,Sambamurthy.
12210,Sahitya.
12211,Nahri.


# Making sentence including abbriviations for folder+sentence and token count

In [67]:
import os
import re
import pandas as pd
import nltk

nltk.download('punkt')

def load_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def save_text(file_path, text):
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

def process_sentences(text, abbr_set):
    lines = text.splitlines()
    processed_lines = []
    current_sentence = ""
    in_sentence = False

    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue

        words = re.findall(r'\b\w+\b', line)
        if words:
            last_word = words[-1].upper()

            if (
                last_word in abbr_set or
                not line.endswith('.') or
                (line.endswith('No.') and i + 1 < len(lines) and re.match(r'^\d', lines[i + 1].strip()))
            ):
                current_sentence += " " + line
                in_sentence = True
            else:
                if in_sentence:
                    current_sentence += " " + line
                    processed_lines.append(current_sentence.strip())
                    current_sentence = ""
                    in_sentence = False
                else:
                    processed_lines.append(line.strip())
        else:
            if in_sentence:
                current_sentence += " " + line

    if current_sentence:
        processed_lines.append(current_sentence.strip())

    return "\n".join(processed_lines)

def count_tokens_sentences(text):
    sentences = nltk.sent_tokenize(text)
    tokens = nltk.word_tokenize(text)
    return len(tokens), len(sentences)

def process_files(input_folder, output_folder, abbr_set):
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # DataFrame to store the count of tokens and sentences for each file
    data = []

    for file_name in os.listdir(input_folder):
        if file_name.endswith('.txt'):
            input_file_path = os.path.join(input_folder, file_name)
            output_file_path = os.path.join(output_folder, file_name)

            text = load_text(input_file_path)
            processed_text = process_sentences(text, abbr_set)
            save_text(output_file_path, processed_text)

            # Count tokens and sentences
            token_count, sentence_count = count_tokens_sentences(processed_text)

            # Append results to the data list
            data.append({
                'File Name': file_name,
                'Token Count': token_count,
                'Sentence Count': sentence_count
            })

            print(f"Processed and saved: {file_name}")

    # Convert data list to DataFrame and save as Excel or CSV
    df = pd.DataFrame(data)
    output_stats_file = os.path.join(output_folder, 'file_stats.xlsx')
    df.to_excel(output_stats_file, index=False)

    print(f"Token and sentence counts have been saved to {output_stats_file}")

# Example usage
abbreviation_file = '/content/unique abbriviation.xlsx'  # Replace with your file path
df_abbr = pd.read_excel(abbreviation_file)
abbreviations = set(df_abbr['Unique Abbreviations'].str.strip().str.upper())

input_folder = '/content/drive/MyDrive/My Dataset/preprocessed_judgments'  # Replace with your input folder path
output_folder = '/content/drive/MyDrive/My Dataset/preprocessed_judgments/Final_judgments'  # Replace with your output folder path

process_files(input_folder, output_folder, abbreviations)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed and saved: Birla_Institute_Of_Technology_vs_The_State_Of_Jharkhand_on_7_March_2019.txt
Processed and saved: Bir_Singh_vs_Mukesh_Kumar_on_6_February_2019.txt
Processed and saved: Bishambhar_Prasad_vs_M_S_Arfat_Petrochemicals_Private_on_20_April_2023.txt
Processed and saved: Board_Of_Governors_In_Supersession_Of_vs_Priyambada_Sharma_on_17_October_2022.txt
Processed and saved: Binay_Kumar_Dalei_vs_The_State_Of_Odisha_on_2_March_2022.txt
Processed and saved: Biltu_Bhattacharya_vs_The_State_Of_West_Bengal_on_10_May_2022.txt
Processed and saved: Biraji_Brijraji_vs_Surya_Pratap_And_Ors_on_3_November_2020.txt
Processed and saved: Bikram_Chatterji_vs_Union_Of_India_on_29_June_2021.txt
Processed and saved: Bina_Basak_And_Ors_vs_Sri_Bipul_Kanti_Basak_And_Ors_on_21_March_2024.txt
Processed and saved: Birbal_Nath_vs_The_State_Of_Rajasthan_on_30_October_2023.txt
Processed and saved: Bikram_Chatterji_vs_Union_Of_India_on_7_Nov