In [28]:
import pandas as pd

# CSV location of the up-sampled clause dataset (under the /content/drive mount).
file_path = '/content/drive/MyDrive/non compete clause project/upsampled_dataset.csv'

# Read the CSV into a DataFrame, then print a plain-text preview of its first rows.
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

                                         clause_text  label
0  If there shall be any change in or affecting s...      0
1  This Agreement shall be binding upon and inure...      0
2  The Grantor irrevocably makes, constitutes and...      0
3  Neither party shall assign its rights or oblig...      0
4  A. CONTRACTOR shall provide Tobacco Use Preven...      0


In [29]:
# Count how many rows carry each value of the `label` column.
df.value_counts(subset='label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,126246
1,29778


In [30]:
# prompt: clean your data by removing irrelevant HTML tags, special characters, or noise that isn't part of the natural language.

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

# Fetch the NLTK corpora that back this cell's `stopwords` / `WordNetLemmatizer`
# imports; `quiet=True` suppresses the download log.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name, quiet=True)

# Re-read the dataset so this cell starts from the raw CSV contents.
file_path = '/content/drive/MyDrive/non compete clause project/upsampled_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

def preprocess_text(text):
    """Return `text` with HTML markup and non-alphanumeric characters removed.

    Args:
        text: The raw clause. Anything that is not a `str` is treated as empty.

    Returns:
        The cleaned string: HTML parsed away with BeautifulSoup (tag boundaries
        become spaces), every character outside letters, digits and whitespace
        deleted, and whitespace runs collapsed to single spaces. Returns ""
        for non-string input.
    """
    if isinstance(text, str):
        # Strip markup first so tag names never leak into the text.
        without_markup = BeautifulSoup(text, "html.parser").get_text(separator=" ")

        # Delete (not replace) punctuation and other special characters.
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_markup)

        # split()/join collapses any run of whitespace and trims both ends.
        return " ".join(alnum_only.split())

    return ""

# Build the modelling frame in a single expression: drop the raw text column and
# attach the cleaned text. `df` itself is left untouched, so it never has to be
# "un-mutated" afterwards and the cell can be re-run safely.
df_processed = df.drop(columns='clause_text').assign(
    processed_text=df['clause_text'].apply(preprocess_text)
)

# Plain-text preview of the first 5 cleaned texts.
print(df_processed['processed_text'].head())

# Rich preview of the resulting frame (last expression, so the notebook displays it).
df_processed.head()

  soup = BeautifulSoup(text, "html.parser")


0    If there shall be any change in or affecting s...
1    This Agreement shall be binding upon and inure...
2    The Grantor irrevocably makes constitutes and ...
3    Neither party shall assign its rights or oblig...
4    A CONTRACTOR shall provide Tobacco Use Prevent...
Name: processed_text, dtype: object


Unnamed: 0,label,processed_text
0,0,If there shall be any change in or affecting s...
1,0,This Agreement shall be binding upon and inure...
2,0,The Grantor irrevocably makes constitutes and ...
3,0,Neither party shall assign its rights or oblig...
4,0,A CONTRACTOR shall provide Tobacco Use Prevent...


In [2]:
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd


In [5]:
# Use the cleaned frame as the modelling dataset.
# `df_processed` must already exist, i.e. the cleaning cell has to be run first.
data = df_processed

In [6]:
# Preview the first five rows of the modelling dataset.
data.head(n=5)

Unnamed: 0,label,processed_text
0,0,If there shall be any change in or affecting s...
1,0,This Agreement shall be binding upon and inure...
2,0,The Grantor irrevocably makes constitutes and ...
3,0,Neither party shall assign its rights or oblig...
4,0,A CONTRACTOR shall provide Tobacco Use Prevent...


In [7]:

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
# NOTE(review): the input file is named "upsampled_dataset" - if up-sampling duplicated
# rows before this split, identical clauses may land in both partitions; verify.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['processed_text'],
    data["label"],
    random_state=42,
    test_size=0.2,
)

# "balanced" class weights, computed from the training labels only.
observed_classes = np.unique(train_labels)
class_weights = compute_class_weight(
    y=train_labels,
    classes=observed_classes,
    class_weight="balanced",
)

In [8]:
# Key each weight by its position in `class_weights` (0, 1, ...).
class_weights_dict = dict(enumerate(class_weights))
print("Class weights:", class_weights_dict)

Class weights: {0: 0.6175550915801661, 1: 2.626662457912458}


In [9]:

# Load the tokenizer that matches the "distilbert-base-uncased" checkpoint.
base_checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(base_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
def tokenize_texts(texts, labels):
    """Tokenize `texts` and pair the encodings with `labels` in a tf.data.Dataset.

    Uses the module-level `tokenizer` with truncation enabled, `padding=True`,
    `max_length=128` and TensorFlow tensors as the return type.

    Args:
        texts: Iterable of strings; it is materialised with `list(...)`.
        labels: Labels sliced alongside the encodings, one per text.

    Returns:
        An unbatched `tf.data.Dataset` of `(encoding_dict, label)` elements.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": True,
        "max_length": 128,
        "return_tensors": "tf",
    }
    encoded = tokenizer(list(texts), **tokenizer_options)

    # The encodings are converted to a plain dict before slicing.
    features = dict(encoded)
    return tf.data.Dataset.from_tensor_slices((features, labels))


In [11]:
# Tokenize both splits and group them into batches of 16 examples.
batch_size = 16
train_dataset = tokenize_texts(train_texts, train_labels).batch(batch_size)
val_dataset = tokenize_texts(val_texts, val_labels).batch(batch_size)

# Load DistilBERT with a 2-label sequence-classification head.
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [12]:
# Training configuration: sparse integer labels, loss computed on raw logits
# (from_logits=True), Adam at a 5e-5 learning rate.
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
)

In [13]:
# Fine-tune for 3 epochs, validating after each one. The per-class weights are
# passed through `class_weight` to account for the imbalanced labels.
fit_options = {
    "validation_data": val_dataset,
    "epochs": 3,
    "class_weight": class_weights_dict,
}
history = model.fit(train_dataset, **fit_options)




Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
save_dir = "/content/drive/MyDrive/non compete clause project/distilbert_model_non_compete"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

# Show the per-epoch metrics recorded during training.
print("Training History:", history.history)

Training History: {'loss': [0.05455024912953377, 0.033995818346738815, 0.04092301428318024], 'sparse_categorical_accuracy': [0.9814772009849548, 0.9892163872718811, 0.9877903461456299], 'val_loss': [0.04444124177098274, 0.04560418054461479, 0.046736154705286026], 'val_sparse_categorical_accuracy': [0.9904822707176208, 0.9905784130096436, 0.9904822707176208]}


# Model Testing

In [32]:

import numpy as np
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer

# Restore the tokenizer and the fine-tuned classifier from the saved directory.
model_path = "/content/drive/MyDrive/non compete clause project/distilbert_model_non_compete"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = TFDistilBertForSequenceClassification.from_pretrained(model_path)

def predict_sentences(sentences):
    """Classify each sentence and split the inputs by predicted class.

    Each sentence is tokenized and run through the module-level `tokenizer`
    and `model` individually; the class with the largest logit wins.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A tuple `(class_1_sentences, class_0_sentences)` of lists, each keeping
        the input order. Any prediction other than 1 goes to the second list.
    """
    buckets = {1: [], 0: []}

    for text in sentences:
        encoded = tokenizer(
            text,
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=128,
        )

        # A single sentence yields one row of logits; argmax selects its class id.
        label = np.argmax(model(encoded).logits, axis=1).item()

        buckets[1 if label == 1 else 0].append(text)

    return buckets[1], buckets[0]





Some layers from the model checkpoint at /content/drive/MyDrive/non compete clause project/distilbert_model_non_compete were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/non compete clause project/distilbert_model_non_compete and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to

In [None]:
# Example usage: classify a single sentence.
# `predict_sentences` takes a list and returns (class-1 sentences, class-0 sentences),
# so wrap the sentence in a list and check which of the two lists it landed in.
sentence_to_predict = "this is company"  # Replace with your sentence
class_1_hits, class_0_hits = predict_sentences([sentence_to_predict])
prediction = 1 if class_1_hits else 0

if prediction == 1:
    print("The model predicts that this sentence is related to a non-compete clause.")
else:
    print("The model predicts that this sentence is NOT related to a non-compete clause.")

The model predicts that this sentence is NOT related to a non-compete clause.


In [33]:
!pip install pymupdf



In [34]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string holding the `get_text()` output of each page, joined
        without any separator.
    """
    doc = fitz.open(pdf_path)
    try:
        # Collect page texts and join once rather than growing a string per page.
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )
    finally:
        # Always release the file handle, even if a page fails to load.
        doc.close()




In [35]:
# Example usage: pull the text out of the sample employment contract and print all of it.
pdf_path = '/content/drive/MyDrive/non compete clause project/EmploymentContract (1).pdf'  # Replace with your PDF file path
extracted_text = extract_text_from_pdf(pdf_path=pdf_path)
print(extracted_text)

 
Meanbee Limited 
30-32 Westgate Buildings 
Bath, 
BA1 1EF 
 
+44 (0)1225 448824 
 
www.meanbee.com 
hello@meanbee.com 
 
Contract of Employment 
 
Incorporating terms and particulars required under Section 1 Employment Rights Act 1996 
 
This Contract is confidential and may not be copied or shown, and the contents may not be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
communicated
to
persons
other
than
the
Employee's
professional
advisers,
elected
 
 
 
 
 
 
 
 
 
 
representative or trade union. 
 
Schedule 
 
The Company/we/us 
Meanbee Limited 
Of 
30-32 Westgate Buildings, Bath, BA1 1EF 
The Employee/you 
 
Of 
 
Job Title 
 
Date of start of this employment 
 
Employment Termination Date (if applicable) 
Not applicable 
Place of Work 
The Meanbee Office 
Normal Hours of Work 
36 hours per week 
Basic rate of pay 
£xx,xxx per annum 
Holiday Entitlement 
28 days per annum including UK public 
holidays  
Pension 
Available on request 
 
Please note that the attached Job Description forms pa

In [36]:
import nltk
from nltk.tokenize import sent_tokenize
import string

# Fetch NLTK's 'punkt_tab' resource (presumably the data `sent_tokenize` relies on - confirm).
nltk.download('punkt_tab')
import re

def preprocess_and_extract_sentences(text):
    """Split `text` into sentences and clean each one.

    Args:
        text: The raw text to split with NLTK's `sent_tokenize`.

    Returns:
        A list with exactly one cleaned string per tokenized sentence, in the
        original order. Cleaning removes `<...>` tags, deletes every character
        that is not a letter, digit or whitespace, and collapses whitespace
        runs to single spaces with both ends trimmed.
    """
    def _clean(raw_sentence):
        # Order matters: tags must be removed before '<' and '>' are stripped
        # by the special-character pass.
        without_tags = re.sub(r'<.*?>', '', raw_sentence)
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_tags)
        return re.sub(r'\s+', ' ', alnum_only).strip()

    return [_clean(raw_sentence) for raw_sentence in sent_tokenize(text)]




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [37]:

# Split the extracted PDF text into cleaned sentences and print them one per line.
sentences = preprocess_and_extract_sentences(extracted_text)
if sentences:
    print("\n".join(sentences))


Meanbee Limited 3032 Westgate Buildings Bath BA1 1EF 44 01225 448824 wwwmeanbeecom hellomeanbeecom Contract of Employment Incorporating terms and particulars required under Section 1 Employment Rights Act 1996 This Contract is confidential and may not be copied or shown and the contents may not be communicated to persons other than the Employees professional advisers elected representative or trade union
Schedule The Companyweus Meanbee Limited Of 3032 Westgate Buildings Bath BA1 1EF The Employeeyou Of Job Title Date of start of this employment Employment Termination Date if applicable Not applicable Place of Work The Meanbee Office Normal Hours of Work 36 hours per week Basic rate of pay xxxxx per annum Holiday Entitlement 28 days per annum including UK public holidays Pension Available on request Please note that the attached Job Description forms part of this contract
Your employment is subject to receipt of satisfactory references and evidence of appropriate qualifications and any 

In [38]:
# Number of cleaned sentences obtained from the extracted PDF text.
len(sentences)

184

In [39]:
# Keep only sentences that contain at least 6 whitespace-separated words.
# NOTE(review): the prompt this cell was generated from asked to remove sentences of
# "more than 10 words", which is not what the code does - confirm the intended rule.
min_words = 6
filtered_sentences = list(
    filter(lambda candidate: len(candidate.split()) >= min_words, sentences)
)

# Print the sentences that survived the filter, one per line.
if filtered_sentences:
    print("\n".join(filtered_sentences))

Meanbee Limited 3032 Westgate Buildings Bath BA1 1EF 44 01225 448824 wwwmeanbeecom hellomeanbeecom Contract of Employment Incorporating terms and particulars required under Section 1 Employment Rights Act 1996 This Contract is confidential and may not be copied or shown and the contents may not be communicated to persons other than the Employees professional advisers elected representative or trade union
Schedule The Companyweus Meanbee Limited Of 3032 Westgate Buildings Bath BA1 1EF The Employeeyou Of Job Title Date of start of this employment Employment Termination Date if applicable Not applicable Place of Work The Meanbee Office Normal Hours of Work 36 hours per week Basic rate of pay xxxxx per annum Holiday Entitlement 28 days per annum including UK public holidays Pension Available on request Please note that the attached Job Description forms part of this contract
Your employment is subject to receipt of satisfactory references and evidence of appropriate qualifications and any 

In [40]:


# Classify the filtered sentences, then show those predicted as class 1.
predicted_groups = predict_sentences(filtered_sentences)
class_1_sentences, class_0_sentences = predicted_groups

print("Class 1 Sentences:", class_1_sentences)


Class 1 Sentences: ['You covenant that you will not during the Restricted Period whether on your own account or on behalf of or in conjunction with any person firm company or other organisation in competition with the Company directly or indirectly i solicit the business of any Customer ii', 'You covenant that you will not during the Restricted Period whether on your own account or on behalf of or in conjunction with any other person firm company or other organisation whatsoever directly or indirectly induce or attempt to induce any Employee to leave his or her employment with the Company', 'Other that in the proper course of your employment you must not either during or at any time after the termination of your employment with the Company use exploit or disclose to anyone or through your negligence or inadvertence allow such use exploitation or disclosure of any Confidential Information of the Company its clients or suppliers and shall further not use any such Confidential Information

In [44]:
import fitz  # PyMuPDF
import nltk
from nltk.tokenize import sent_tokenize
import re


def highlight_sentences_in_pdf(pdf_path, sentences_to_highlight, output_path="highlighted_output.pdf"):
    """Highlight the given cleaned sentences in a PDF and save an annotated copy.

    `sentences_to_highlight` holds sentences in the cleaned form produced by
    `preprocess_and_extract_sentences` (punctuation removed). That form usually
    does not occur verbatim on the page, so each page sentence is matched by
    its cleaned form but located on the page by its original text.

    Args:
        pdf_path: Path of the PDF to annotate.
        sentences_to_highlight: Iterable of cleaned sentence strings.
        output_path: Where the annotated PDF is written. Defaults to
            "highlighted_output.pdf", the previously hard-coded location.

    Note:
        Sentences are tokenized page by page, so a sentence that continues
        onto the next page is not matched.
    """
    targets = set(sentences_to_highlight)

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            page_text = page.get_text("text")

            # Relies on preprocess_and_extract_sentences returning exactly one
            # cleaned string per sent_tokenize sentence, in order, so the two
            # lists line up index by index.
            raw_sentences = sent_tokenize(page_text)
            cleaned_sentences = preprocess_and_extract_sentences(page_text)

            already_searched = set()
            for raw, cleaned in zip(raw_sentences, cleaned_sentences):
                if cleaned not in targets:
                    continue

                # Search for the original wording with whitespace collapsed.
                # NOTE(review): multi-line sentences depend on search_for
                # matching across line breaks - confirm against PyMuPDF docs.
                needle = " ".join(raw.split())
                if not needle or needle in already_searched:
                    continue
                already_searched.add(needle)

                for inst in page.search_for(needle):
                    page.add_highlight_annot(inst)

        # Save the modified PDF with highlights.
        doc.save(output_path)
    finally:
        # Release the file handle whether or not annotation or saving succeeded.
        doc.close()



In [45]:
# Example usage: highlight the sentences predicted as class 1 in the sample contract.
pdf_path = '/content/drive/MyDrive/non compete clause project/EmploymentContract (1).pdf'  # Replace with your PDF file path

highlight_sentences_in_pdf(pdf_path, sentences_to_highlight=class_1_sentences)
