In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler, random_split
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk

import random
import time
import datetime
import shap

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yvarajan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv("./Final_Full_Dataset.csv")

df = df.dropna(subset=["Risk Level"])

label_encoder_level = LabelEncoder()
label_encoder_category = LabelEncoder()

df['Risk Level'] = label_encoder_level.fit_transform(df['Risk Level'])
df['Risk Category'] = label_encoder_category.fit_transform(df['Risk Category'])


risk_level_mapping = {i: label for i, label in enumerate(label_encoder_level.classes_)}
risk_category_mapping = {i: label for i, label in enumerate(label_encoder_category.classes_)}


In [4]:
print("Risk Level Encodings:")
for encoded, original in risk_level_mapping.items():
    print(f"{original}: {encoded}")

print("\nRisk Category Encodings:")
for encoded, original in risk_category_mapping.items():
    print(f"{original}: {encoded}")

Risk Level Encodings:
High: 0
Low: 1
Medium: 2

Risk Category Encodings:
Accelerated Payment Obligation: 0
Acceptance: 1
Acceptance Deadline: 2
Acceptance Period: 3
Acceptance Process: 4
Acceptance and Returns: 5
Access Control: 6
Access Restrictions: 7
Access Rights: 8
Access to Business Records: 9
Access to Financial Records: 10
Access to Information: 11
Access to Records: 12
Accountability Obligation: 13
Accounting: 14
Accounting & Compliance: 15
Acquisition & Ownership: 16
Acquisition & Ownership Change: 17
Acquisition Clause: 18
Acquisition Restriction: 19
Additional Charges: 20
Additional Compliance Requirements: 21
Additional Fees: 22
Additional Financial Burden: 23
Additional Insurance Requirement: 24
Additional Insured: 25
Additional Insured Coverage: 26
Additional Insured Requirement: 27
Additional Termination Fees: 28
Administrative Burden: 29
Administrative Compliance: 30
Advance Notice: 31
Advance Notice Obligation: 32
Advance Notice Requirement: 33
Advertisement Control: 

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z?.!,Â¿]+", " ", text)
    text = re.sub(r"http\S+", "", text)
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p, '')
    sw = stopwords.words('english')
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    text = " ".join(text)
    return text

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True, model_max_length=512)

In [7]:
class BertForMultiTaskClassification(BertForSequenceClassification):
    def __init__(self, config, num_labels_level, num_labels_category):
        super().__init__(config)
        self.num_labels_level = num_labels_level
        self.num_labels_category = num_labels_category
        self.classifier_level = torch.nn.Linear(config.hidden_size, num_labels_level)
        self.classifier_category = torch.nn.Linear(config.hidden_size, num_labels_category)

    def forward(self, input_ids, attention_mask=None, labels_level=None, labels_category=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]

        logits_level = self.classifier_level(pooled_output)
        logits_category = self.classifier_category(pooled_output)

        loss = None
        if labels_level is not None and labels_category is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss_level = loss_fct(logits_level.view(-1, self.num_labels_level), labels_level.view(-1))
            loss_category = loss_fct(logits_category.view(-1, self.num_labels_category), labels_category.view(-1))
            loss = loss_level + loss_category

        return {
            "logits_level": logits_level,
            "logits_category": logits_category,
            "loss": loss
        }

In [8]:
model = BertForMultiTaskClassification.from_pretrained(
    "bert-large-uncased",
    num_labels_level=len(label_encoder_level.classes_),
    num_labels_category=len(label_encoder_category.classes_)
)

Some weights of BertForMultiTaskClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'classifier_category.bias', 'classifier_category.weight', 'classifier_level.bias', 'classifier_level.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [11]:
model.load_state_dict(torch.load('Risk_analysis_BERT.pth',map_location=torch.device('cpu')))
model.eval() 
print("Model loaded successfully.")

  model.load_state_dict(torch.load('Risk_analysis_BERT.pth',map_location=torch.device('cpu')))


Model loaded successfully.


In [12]:
def predict_risk(text):
    cleaned_text = clean_text(text)
    encoded_dict = tokenizer.encode_plus(
        cleaned_text, 
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
  
    predicted_level = np.argmax(outputs["logits_level"].cpu().numpy(), axis=1).flatten()[0]
    predicted_category = np.argmax(outputs["logits_category"].cpu().numpy(), axis=1).flatten()[0]

    return (
        risk_level_mapping.get(predicted_level, "Unknown"),
        risk_category_mapping.get(predicted_category, "Unknown")
    )

def predict(texts):
    if isinstance(texts, str):
        texts = [texts]
    elif isinstance(texts, np.ndarray):
        texts = list(texts)

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        logits_level = model(input_ids, attention_mask=attention_mask)["logits_level"]
    
    return torch.softmax(logits_level, dim=-1).cpu().numpy()

In [None]:
input_text = input("Enter text: ")
risk_level, risk_category = predict_risk(input_text)
print(f"\nPredicted Risk Level: {risk_level}")
print(f"Predicted Risk Category: {risk_category}")

explainer = shap.Explainer(predict, tokenizer)
shap_values = explainer([input_text])
shap.plots.text(shap_values)



In [None]:
import torch
from transformers import BertTokenizer
import numpy as np
import shap


tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

def split_into_overlapping_chunks(text, max_length=512, overlap=256):
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0

    while start < len(tokens):
        end = start + max_length
        chunk = tokens[start:end]
        chunks.append(chunk)
        start += (max_length - overlap)  

    
    chunk_texts = [tokenizer.convert_tokens_to_string(chunk) for chunk in chunks]
    return chunk_texts

def predict_risk_for_chunks(chunks):
    risk_levels = []
    risk_categories = []
    shap_values_list = []


    explainer = shap.Explainer(predict, tokenizer)

    for chunk in chunks:

        risk_level, risk_category = predict_risk(chunk)
        risk_levels.append(risk_level)
        risk_categories.append(risk_category)

        shap_values = explainer([chunk])
        shap_values_list.append(shap_values)

    return risk_levels, risk_categories, shap_values_list


def display_results(chunks, risk_levels, risk_categories, shap_values_list):
    print("\nResults for Each Chunk:")
    for i, (chunk, risk_level, risk_category) in enumerate(zip(chunks, risk_levels, risk_categories)):
        print(f"\nChunk {i + 1}:")
        print(f"Text: {chunk}")
        print(f"Predicted Risk Level: {risk_level}")
        print(f"Predicted Risk Category: {risk_category}")
        print("SHAP Text Plot:")
        shap.plots.text(shap_values_list[i])

input_document = input("Enter the full document text: ")
if input_document.strip():
    chunks = split_into_overlapping_chunks(input_document, max_length=512, overlap=256)
    risk_levels, risk_categories, shap_values_list = predict_risk_for_chunks(chunks)

    display_results(chunks, risk_levels, risk_categories, shap_values_list)
else:
    print("Input text cannot be empty. Please try again.")

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:50, 50.28s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.37s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.42s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.39s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.77s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.59s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.47s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.96s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.84s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.65s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.46s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:45, 45.11s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:41, 41.10s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

In [None]:
import torch
from transformers import BertTokenizer
import numpy as np
import shap

# Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

# Function to split a document into overlapping chunks
def split_into_overlapping_chunks(text, max_length=512, overlap=256):
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0

    while start < len(tokens):
        end = start + max_length
        chunk = tokens[start:end]
        chunks.append(chunk)
        start += (max_length - overlap)  # Move start by (max_length - overlap)

    # Convert chunks back to text
    chunk_texts = [tokenizer.convert_tokens_to_string(chunk) for chunk in chunks]
    return chunk_texts

# Function to predict risk level and category for each chunk
def predict_risk_for_chunks(chunks):
    risk_levels = []
    risk_categories = []
    shap_values_list = []

    # Initialize SHAP explainer
    explainer = shap.Explainer(predict, tokenizer)

    for chunk in chunks:
        # Predict risk level and category
        risk_level, risk_category = predict_risk(chunk)
        risk_levels.append(risk_level)
        risk_categories.append(risk_category)

        # Generate SHAP values for the chunk
        shap_values = explainer([chunk])
        shap_values_list.append(shap_values)

    return risk_levels, risk_categories, shap_values_list

# Function to combine and display results
def display_results(chunks, risk_levels, risk_categories, shap_values_list):
    print("\nResults for Each Chunk:")
    for i, (chunk, risk_level, risk_category) in enumerate(zip(chunks, risk_levels, risk_categories)):
        print(f"\nChunk {i + 1}:")
        print(f"Text: {chunk}")
        print(f"Predicted Risk Level: {risk_level}")
        print(f"Predicted Risk Category: {risk_category}")
        print("SHAP Text Plot:")
        shap.plots.text(shap_values_list[i])

# Example usage
input_document = input("Enter the full document text: ")
if input_document.strip():
    # Split the document into overlapping chunks
    chunks = split_into_overlapping_chunks(input_document, max_length=512, overlap=256)

    # Predict risk level and category for each chunk
    risk_levels, risk_categories, shap_values_list = predict_risk_for_chunks(chunks)

    # Display results for each chunk
    display_results(chunks, risk_levels, risk_categories, shap_values_list)
else:
    print("Input text cannot be empty. Please try again.")

# first html file with risk output with chunck

In [None]:
import torch
from transformers import BertTokenizer
import numpy as np
import shap
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet

# Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

# Function to split a document into overlapping chunks
def split_into_overlapping_chunks(text, max_length=512, overlap=256):
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0

    while start < len(tokens):
        end = start + max_length
        chunk = tokens[start:end]
        chunks.append(chunk)
        start += (max_length - overlap)  # Move start by (max_length - overlap)

    # Convert chunks back to text
    chunk_texts = [tokenizer.convert_tokens_to_string(chunk) for chunk in chunks]
    return chunk_texts

# Function to predict risk level and category for each chunk
def predict_risk_for_chunks(chunks):
    risk_levels = []
    risk_categories = []
    shap_values_list = []

    # Initialize SHAP explainer
    explainer = shap.Explainer(predict, tokenizer)

    for chunk in chunks:
        # Predict risk level and category
        risk_level, risk_category = predict_risk(chunk)
        risk_levels.append(risk_level)
        risk_categories.append(risk_category)

        # Generate SHAP values for the chunk
        shap_values = explainer([chunk])
        shap_values_list.append(shap_values)

    return risk_levels, risk_categories, shap_values_list

# Function to save results as a single HTML file
def save_results_as_html(chunks, risk_levels, risk_categories, shap_values_list, filename="output.html"):
    html_content = """
    <html>
    <head>
        <title>Legal Risk Analysis Report</title>
        <style>
            body { font-family: Arial, sans-serif; }
            h1 { color: #2c3e50; }
            h2 { color: #34495e; }
            h3 { color: #7f8c8d; }
            iframe { border: 1px solid #ddd; margin-bottom: 20px; }
        </style>
    </head>
    <body>
        <h1>Legal Risk Analysis Report</h1>
    """

    for i, (chunk, risk_level, risk_category) in enumerate(zip(chunks, risk_levels, risk_categories)):
        html_content += f"<p><strong>Predicted Risk Level:</strong> {risk_level}</p>"
        html_content += f"<p><strong>Predicted Risk Category:</strong> {risk_category}</p>"

        # Generate SHAP plot as HTML and embed it directly
        shap_html = shap.plots.text(shap_values_list[i], display=False) 
        html_content += shap_html

    html_content += """
    </body>
    </html>
    """

    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"Results saved to {filename}")

# Example usage
input_document = input("Enter the full document text: ")
if input_document.strip():
    # Split the document into overlapping chunks
    chunks = split_into_overlapping_chunks(input_document, max_length=512, overlap=256)

    # Predict risk level and category for each chunk
    risk_levels, risk_categories, shap_values_list = predict_risk_for_chunks(chunks)

    # Save results as a single HTML file
    save_results_as_html(chunks, risk_levels, risk_categories, shap_values_list, filename="legal_risk_analysis_report.html")
else:
    print("Input text cannot be empty. Please try again.")


In [None]:
%pip install groq

In [14]:
from groq import Groq

In [15]:
groq_API_KEY  = "YOUR API KEY"
client =Groq(api_key=groq_API_KEY)


# risk html with sugession for high riisk alone 

In [12]:
import torch
from transformers import BertTokenizer
import numpy as np
import shap

# Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

# Function to split a document into overlapping chunks
def split_into_overlapping_chunks(text, max_length=512, overlap=256):
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0

    while start < len(tokens):
        end = start + max_length
        chunk = tokens[start:end]
        chunks.append(chunk)
        start += (max_length - overlap)  # Move start by (max_length - overlap)

    # Convert chunks back to text
    chunk_texts = [tokenizer.convert_tokens_to_string(chunk) for chunk in chunks]
    return chunk_texts

# Function to predict risk level and category for each chunk
def predict_risk_for_chunks(chunks):
    risk_levels = []
    risk_categories = []
    shap_values_list = []

    # Initialize SHAP explainer
    explainer = shap.Explainer(predict, tokenizer)

    for chunk in chunks:
        # Predict risk level and category
        risk_level, risk_category = predict_risk(chunk)
        risk_levels.append(risk_level)
        risk_categories.append(risk_category)

        # Generate SHAP values for the chunk
        shap_values = explainer([chunk])
        shap_values_list.append(shap_values)

    return risk_levels, risk_categories, shap_values_list

# Function to modify high-risk text
def modifier_model(text, risk_level, risk_category):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"""
                You are a legal risk mitigation expert. Your task is to modify the given contract text by removing or rewording high-risk clauses alone while making them favorable for the company side alone. and dont change remaing text\n
                Risk Level :\n {risk_level}\n
                Risk category :\n {risk_category}\n

                Modify the following text accordingly:
                {text}
                
                Provide only final text with modification and exclude explanation
                """
            }
        ],
        model="llama-3.3-70b-versatile",
    )
    return chat_completion.choices[0].message.content.strip()

# Function to save results as a single HTML file
def save_results_as_html(chunks, risk_levels, risk_categories, shap_values_list, filename="output.html"):
    html_content = """
    <html>
    <head>
        <title>Legal Risk Analysis Report</title>
        <style>
            body { font-family: Arial, sans-serif; }
            h1 { color: #2c3e50; }
            h2 { color: #34495e; }
            h3 { color: #7f8c8d; }
            .suggestion { background-color: #f9f9f9; padding: 10px; border-left: 4px solid #ccc; margin-top: 10px; }
            .modified-text { background-color: #e8f5e9; padding: 10px; border-left: 4px solid #4caf50; margin-top: 10px; }
        </style>
    </head>
    <body>
        <h1>Legal Risk Analysis Report</h1>
    """

    for i, (chunk, risk_level, risk_category) in enumerate(zip(chunks, risk_levels, risk_categories)):
        html_content += f"<h2>Chunk {i + 1}</h2>"
        html_content += f"<p><strong>Text:</strong> {chunk}</p>"
        html_content += f"<p><strong>Predicted Risk Level:</strong> {risk_level}</p>"
        html_content += f"<p><strong>Predicted Risk Category:</strong> {risk_category}</p>"

        # Generate SHAP plot as HTML and embed it directly
        shap_html = shap.plots.text(shap_values_list[i], display=False)
        html_content += "<h3>SHAP Text Plot</h3>"
        html_content += shap_html

        # Provide suggestions and modified text for high-risk chunks
        if risk_level == "High":
            suggestion = "This text is identified as high-risk. Consider modifying the following clauses:"
            modified_text = modifier_model(chunk, risk_level, risk_category)

            html_content += f"<div class='suggestion'><strong>Suggestion:</strong> {suggestion}</div>"
            html_content += f"<div class='modified-text'><strong>Modified Text:</strong> {modified_text}</div>"

    html_content += """
    </body>
    </html>
    """

    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"Results saved to {filename}")

# Example usage
input_document = input("Enter the full document text: ")
if input_document.strip():
    # Split the document into overlapping chunks
    chunks = split_into_overlapping_chunks(input_document, max_length=512, overlap=256)

    # Predict risk level and category for each chunk
    risk_levels, risk_categories, shap_values_list = predict_risk_for_chunks(chunks)

    # Save results as a single HTML file
    save_results_as_html(chunks, risk_levels, risk_categories, shap_values_list, filename="legal_risk_analysis_report.html")
else:
    print("Input text cannot be empty. Please try again.")

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:49, 49.47s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:43, 43.52s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:43, 43.75s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:43, 43.87s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.03s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:43, 43.88s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:43, 43.86s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:43, 43.89s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:43, 43.80s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:32, 32.22s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

Results saved to legal_risk_analysis_report.html


In [16]:
import torch
from transformers import BertTokenizer
import numpy as np
import shap

# Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

# Function to split a document into overlapping chunks
def split_into_overlapping_chunks(text, max_length=512, overlap=256):
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0

    while start < len(tokens):
        end = start + max_length
        chunk = tokens[start:end]
        chunks.append(chunk)
        start += (max_length - overlap)  # Move start by (max_length - overlap)

    # Convert chunks back to text
    chunk_texts = [tokenizer.convert_tokens_to_string(chunk) for chunk in chunks]
    return chunk_texts

# Function to predict risk level and category for each chunk
def predict_risk_for_chunks(chunks):
    risk_levels = []
    risk_categories = []
    shap_values_list = []

    # Initialize SHAP explainer
    explainer = shap.Explainer(predict, tokenizer)

    for chunk in chunks:
        # Predict risk level and category
        risk_level, risk_category = predict_risk(chunk)
        risk_levels.append(risk_level)
        risk_categories.append(risk_category)

        # Generate SHAP values for the chunk
        shap_values = explainer([chunk])
        shap_values_list.append(shap_values)

    return risk_levels, risk_categories, shap_values_list

# Function to modify high or medium-risk text
#You are a legal risk mitigation expert. Your task is to modify the given contract text by removing or rewording high or medium-risk clauses alone while making them favorable for the company side alone. and dont change remaing text\n

def modifier_model(text, risk_level, risk_category):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"""
                You are a legal risk mitigation expert. Your task is to modify the given contract text by removing or rewording high or medium-risk clauses alone while making them balanced for both side. and dont change remaing text\n
                Risk Level :\n {risk_level}\n
                Risk category :\n {risk_category}\n

                Modify the following text accordingly:
                {text}
                
                Provide only final text with modification and exclude explanation
                """
            }
        ],
        model="llama-3.3-70b-versatile",
    )
    return chat_completion.choices[0].message.content.strip()

# Function to save results as a single HTML file
def save_results_as_html(chunks, risk_levels, risk_categories, shap_values_list, filename="output.html"):
    html_content = """
    <html>
    <head>
        <title>Legal Risk Analysis Report</title>
        <style>
            body { font-family: Arial, sans-serif; }
            h1 { color: #2c3e50; }
            h2 { color: #34495e; }
            h3 { color: #7f8c8d; }
            .suggestion { background-color: #f9f9f9; padding: 10px; border-left: 4px solid #ccc; margin-top: 10px; }
            .modified-text { background-color: #e8f5e9; padding: 10px; border-left: 4px solid #4caf50; margin-top: 10px; }
            .original-text { background-color: #fff3e0; padding: 10px; border-left: 4px solid #ffa726; margin-top: 10px; }
            .full-document { background-color: #e3f2fd; padding: 10px; border-left: 4px solid #42a5f5; margin-top: 20px; }
        </style>
    </head>
    <body>
        <h1>Legal Risk Analysis Report</h1>
    """

    full_document_original = []
    full_document_modified = []

    for i, (chunk, risk_level, risk_category) in enumerate(zip(chunks, risk_levels, risk_categories)):
        html_content += f"<h2>Chunk {i + 1}</h2>"
        html_content += f"<p><strong>Text:</strong> {chunk}</p>"
        html_content += f"<p><strong>Predicted Risk Level:</strong> {risk_level}</p>"
        html_content += f"<p><strong>Predicted Risk Category:</strong> {risk_category}</p>"

        # Generate SHAP plot as HTML and embed it directly
        shap_html = shap.plots.text(shap_values_list[i], display=False)
        html_content += "<h3>SHAP Text Plot</h3>"
        html_content += shap_html

        # Provide suggestions and modified text for high or medium-risk chunks
        if risk_level in ["High", "Medium"]:
            suggestion = f"This text is identified as {risk_level.lower()}-risk. Consider modifying the following clauses:"
            modified_text = modifier_model(chunk, risk_level, risk_category)

            html_content += f"<div class='suggestion'><strong>Suggestion:</strong> {suggestion}</div>"
            html_content += f"<div class='modified-text'><strong>Modified Text:</strong> {modified_text}</div>"
        else:
            modified_text = chunk  # No modification for low-risk chunks

        # Append to full document text
        full_document_original.append(chunk)
        full_document_modified.append(modified_text)

    # Combine full document text
    full_document_original = " ".join(full_document_original)
    full_document_modified = " ".join(full_document_modified)

    # Add full document text to the HTML report
    html_content += "<h2>Full Document Text</h2>"
    html_content += "<div class='full-document'>"
    html_content += "<h3>Original Text</h3>"
    html_content += f"<div class='original-text'>{full_document_original}</div>"
    html_content += "<h3>Modified Text</h3>"
    html_content += f"<div class='modified-text'>{full_document_modified}</div>"
    html_content += "</div>"

    html_content += """
    </body>
    </html>
    """

    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"Results saved to {filename}")

# Example usage
input_document = input("Enter the full document text: ")
if input_document.strip():
    # Split the document into overlapping chunks
    chunks = split_into_overlapping_chunks(input_document, max_length=512, overlap=256)

    # Predict risk level and category for each chunk
    risk_levels, risk_categories, shap_values_list = predict_risk_for_chunks(chunks)

    # Save results as a single HTML file
    save_results_as_html(chunks, risk_levels, risk_categories, shap_values_list, filename="legal_risk_analysis_report.html")
else:
    print("Input text cannot be empty. Please try again.")

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.14s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.37s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.42s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.45s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:44, 44.50s/it]               


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:26, 26.46s/it]               


Results saved to legal_risk_analysis_report.html


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)


def split_into_overlapping_chunks(text, max_length=512, overlap=256):
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0

    while start < len(tokens):
        end = start + max_length
        chunk = tokens[start:end]
        chunks.append(chunk)
        start += (max_length - overlap)  


    chunk_texts = [tokenizer.convert_tokens_to_string(chunk) for chunk in chunks]
    return chunk_texts


def predict_risk_for_chunks(chunks):
    risk_levels = []
    risk_categories = []
    shap_values_list = []
    explanations = []


    explainer = shap.Explainer(predict, tokenizer)

    for chunk in chunks:
     
        risk_level, risk_category = predict_risk(chunk)
        risk_levels.append(risk_level)
        risk_categories.append(risk_category)

        shap_values = explainer([chunk])
        shap_values_list.append(shap_values)
        
        explanation = explain_chunk(chunk, risk_level, risk_category)
        explanations.append(explanation)

    return risk_levels, risk_categories, shap_values_list, explanations


def explain_chunk(chunk, risk_level, risk_category):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"""
                You are a legal risk expert. Analyze the following contract text chunk and provide a detailed explanation of:
                1. Why this text was classified as {risk_level} risk
                2. The specific clauses or phrases that contributed to this risk level
                3. The implications of this risk category ({risk_category})
                
                Provide this in clear, concise bullet points.
                
                Text chunk:
                {chunk}

                Exclude explanation
                """
            }
        ],
        model="llama-3.3-70b-versatile",
    )
    return chat_completion.choices[0].message.content.strip()


def modifier_model(text, risk_level, risk_category):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"""
                You are a legal risk mitigation expert. Your task is to modify the given contract text by removing or rewording high or medium-risk clauses alone while making them balanced for both side. and dont change remaing text\n
                Risk Level :\n {risk_level}\n
                Risk category :\n {risk_category}\n

                Modify the following text accordingly:
                {text}
                
                Provide only final text with modification and exclude explanation
                """
            }
        ],
        model="llama-3.3-70b-versatile",
    )
    return chat_completion.choices[0].message.content.strip()

def save_results_as_html(chunks, risk_levels, risk_categories, shap_values_list, explanations, filename="output.html"):
    html_content = """
    <html>
    <head>
        <title>Legal Risk Analysis Report</title>
        <style>
            body { font-family: Arial, sans-serif; }
            h1 { color: #2c3e50; }
            h2 { color: #34495e; }
            h3 { color: #7f8c8d; }
            .explanation { background-color: #f0f7ff; padding: 10px; border-left: 4px solid #3498db; margin-top: 10px; }
            .modified-text { background-color: #e8f5e9; padding: 10px; border-left: 4px solid #4caf50; margin-top: 10px; }
            .original-text { background-color: #fff3e0; padding: 10px; border-left: 4px solid #ffa726; margin-top: 10px; }
            .full-document { background-color: #e3f2fd; padding: 10px; border-left: 4px solid #42a5f5; margin-top: 20px; }
            .risk-high { color: #e74c3c; font-weight: bold; }
            .risk-medium { color: #f39c12; font-weight: bold; }
            .risk-low { color: #2ecc71; font-weight: bold; }
        </style>
    </head>
    <body>
        <h1>Legal Risk Analysis Report</h1>
    """

    full_document_original = []
    full_document_modified = []

    for i, (chunk, risk_level, risk_category, explanation) in enumerate(zip(chunks, risk_levels, risk_categories, explanations)):

        risk_class = f"risk-{risk_level.lower()}"
        
        html_content += f"<h2>Chunk {i + 1}</h2>"
        html_content += f"<div class='original-text'><strong>Original Text:</strong> {chunk}</div>"
        html_content += f"<p><strong>Predicted Risk Level:</strong> <span class='{risk_class}'>{risk_level}</span></p>"
        html_content += f"<p><strong>Predicted Risk Category:</strong> {risk_category}</p>"
        

        html_content += "<h3>Risk Explanation</h3>"
        html_content += f"<div class='explanation'>{explanation}</div>"


        shap_html = shap.plots.text(shap_values_list[i], display=False)
        html_content += "<h3>SHAP Text Plot</h3>"
        html_content += shap_html


        if risk_level in ["High", "Medium", "Low"]:
            modified_text = modifier_model(chunk, risk_level, risk_category)
        else:
            modified_text = chunk  

        full_document_original.append(chunk)
        full_document_modified.append(modified_text)


    full_document_original = " ".join(full_document_original)
    full_document_modified = " ".join(full_document_modified)

    html_content += "<h2>Full Document Text</h2>"
    html_content += "<div class='full-document'>"
    html_content += "<h3>Original Text</h3>"
    html_content += f"<div class='original-text'>{full_document_original}</div>"
    html_content += "<h3>Modified Text</h3>"
    html_content += f"<div class='modified-text'>{full_document_modified}</div>"
    html_content += "</div>"

    html_content += """
    </body>
    </html>
    """

    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"Results saved to {filename}")


input_document = input("Enter the full document text: ")
if input_document.strip():

    chunks = split_into_overlapping_chunks(input_document, max_length=512, overlap=256)


    risk_levels, risk_categories, shap_values_list, explanations = predict_risk_for_chunks(chunks)

    save_results_as_html(chunks, risk_levels, risk_categories, shap_values_list, explanations, filename="legal_risk_analysis_report.html")
else:
    print("Input text cannot be empty. Please try again.")

In [2]:
import requests

url = " https://1774-34-87-10-141.ngrok-free.app/upload"
files = {"file": open("E:/projects/Unisys_Dev/Trying/BackEnd/uploaded_files/SOFTWARE LICENSE AGREEMENT.docx", "rb")}

response = requests.post(url, files=files)
print(response.json())


{'detail': 'There was an error parsing the body'}
