In [None]:
import torch
import pandas as pd
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW          # <-- FIXED IMPORT
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

# Load the legal dataset; later code reads its 'text' and 'category' columns.
df = pd.read_csv('new_legal_data.csv')

# Drop incomplete rows *before* encoding and splitting. prepare_data() drops
# them anyway, so doing it here keeps the label set (and therefore num_labels)
# and the train/test proportions in line with the rows that are actually usable.
df = df.dropna()

# Encode the category names as integer class ids; `le` keeps the mapping so
# predictions can be turned back into names with le.inverse_transform().
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])

# Hold out 20% of the rows; fixed seed so the split is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenizer and classifier, with one output logit per encoded category.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(le.classes_),
)

def prepare_data(df, tokenizer, max_length=128):
    """Turn a labelled dataframe into a TensorDataset for training.

    Rows containing any missing value are discarded, then every entry of the
    'text' column is encoded with ``tokenizer.encode_plus`` (padded and
    truncated to ``max_length``).

    Args:
        df: DataFrame with a 'text' column and a 'category' column.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_length: length every sequence is padded/truncated to.

    Returns:
        TensorDataset of (input_ids, attention_masks, labels), where the first
        two are the per-row encodings concatenated along dim 0.
    """
    complete_rows = df.dropna()

    encodings = [
        tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        for text in complete_rows['text']
    ]

    # assumes each encoding is shaped (1, max_length), so cat along dim 0
    # stacks one row per example -- TODO confirm for custom tokenizers
    id_tensor = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    mask_tensor = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    label_tensor = torch.tensor(list(complete_rows['category']))

    return TensorDataset(id_tensor, mask_tensor, label_tensor)

# Fine-tune the classifier
# Tokenize the training split and wrap it in a shuffling loader (2 rows per batch)
train_dataset = prepare_data(train_df, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

for epoch in range(3):
    model.train()
    progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    for batch in progress:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Standard step: clear old gradients, forward with labels so the model
        # returns its own loss, backpropagate, update the weights
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Interactive loop: recommend a legal category for each description the user
# types, then print the matching section of the laws page.

# Loop-invariant data, built once instead of on every prompt.
legal_keywords = [
    'dowry-deaths', 'adultery', 'bigamy', 'child-marriage', 'domestic-violence',
    'hindu-marriage', 'special-marriage', 'indian-divorce', 'muslim-personal-law',
    'adoption-law', 'family-courts-act'
]
url = 'https://shaktistark.github.io/laws2/'

while True:
    user_input = input("Please describe your legal situation (or 'exit' to quit): ")

    if user_input.lower() == 'exit':
        break

    # Keywords that literally appear in the (lower-cased) user text
    lowered_input = user_input.lower()
    found_categories = [keyword for keyword in legal_keywords if keyword in lowered_input]

    if found_categories:
        # df['category'] was label-encoded earlier, so it holds integer codes,
        # not names. Comparing it with the keyword strings can never match;
        # translate the keywords to codes first.
        # NOTE(review): assumes the keywords are spelled like the dataset's
        # category names -- keywords unknown to the encoder are ignored.
        known_names = set(le.classes_)
        matched = [name for name in found_categories if name in known_names]
        codes = le.transform(matched) if matched else []

        filtered_df = df[df['category'].isin(codes)]
        if not filtered_df.empty:
            # Take the first matching row's category and decode it back to its name
            first_code = filtered_df['category'].iloc[0]
            recommended_legal_case = le.inverse_transform([first_code])[0]
        else:
            print("No matching legal case found in the dataset.")
            continue
    else:
        # No keyword hit: let the fine-tuned model classify the text
        user_tokens = tokenizer.encode_plus(
            user_input,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        model.eval()
        with torch.no_grad():
            input_ids = user_tokens['input_ids'].to(device)
            attention_mask = user_tokens['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            recommended_legal_case = le.inverse_transform(predictions.cpu().numpy())[0]

    # Normalize to str so .lower() below is safe whatever dtype the labels had
    recommended_legal_case = str(recommended_legal_case)
    print(f"Recommended Legal Case: {recommended_legal_case}")

    # Web scraping: a timeout keeps the prompt from hanging on a dead
    # connection, and HTTP/network errors are reported instead of crashing.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # First <div> whose class is the lower-cased category name, if any
    div = soup.find('div', class_=recommended_legal_case.lower())
    div_content = div.text if div is not None else None

    if div_content:
        print(f"Information for {recommended_legal_case}:")
        print(div_content)
    else:
        print("No information found for the recommended legal case.")

print("Exiting the program.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 377/377 [08:46<00:00,  1.40s/it]
Epoch 2: 100%|██████████| 377/377 [08:57<00:00,  1.43s/it]
Epoch 3: 100%|██████████| 377/377 [08:39<00:00,  1.38s/it]


Please describe your legal situation (or 'exit' to quit): exit
Exiting the program.


In [None]:
import pandas as pd
from fastai.text.all import *
from bs4 import BeautifulSoup
import requests

# Load the dataset from the CSV file (read with latin1 encoding)
data = pd.read_csv('LAWS 2.csv', encoding='latin1')

# Encode the 'Output' labels as integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['Output'] = le.fit_transform(data['Output'])

# Integer code -> original label text, used to decode predictions
label_map = dict(enumerate(le.classes_))

# DataLoaders for ULMFiT language-model fine-tuning on the 'Text' column
dls_lm = TextDataLoaders.from_df(data, text_col='Text', is_lm=True, valid_pct=0.1)

# Fine-tune the pretrained AWD_LSTM language model on this corpus
learn_lm = language_model_learner(dls_lm, AWD_LSTM, metrics=[accuracy, Perplexity()])
learn_lm.fine_tune(3)

# Save the encoder so the classifier can reuse it
learn_lm.save_encoder('finetuned_lm')

# Reload the encoder into a fresh language-model learner.
# NOTE(review): this learner is not used by anything visible below; kept so
# the name stays defined for any later cells that rely on it.
learn_lm_loaded = language_model_learner(dls_lm, AWD_LSTM)
learn_lm_loaded = learn_lm_loaded.load_encoder('finetuned_lm')

# DataLoaders for classification. Reuse the language model's vocabulary so the
# token ids the classifier sees are the ones the saved encoder was tuned on.
dls_clas = TextDataLoaders.from_df(
    data,
    text_col='Text',
    label_col='Output',
    valid_pct=0.1,
    text_vocab=dls_lm.vocab,
)

# Text classifier built on the same architecture, reporting accuracy
learn_clas = text_classifier_learner(dls_clas, AWD_LSTM, metrics=accuracy)

# Load the fine-tuned encoder into the classifier
learn_clas = learn_clas.load_encoder('finetuned_lm')

# Train the classifier. Without this step only the encoder carries learned
# weights and the classification head is still at its initial values, so
# predict() below would not reflect the labelled data at all.
learn_clas.fine_tune(3)

# Web scraping function to fetch the <div> content
def scrape_related_div(legal_case, url="https://shaktistark.github.io/laws2/", timeout=10):
    """Return the text of the first <div> on the laws page whose class is `legal_case`.

    Args:
        legal_case: CSS class name to look for (the recommended legal case).
        url: page to fetch; defaults to the laws page used by this notebook.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The matching <div>'s text, or the string
        "Could not retrieve related information." when the request fails or no
        such <div> exists.
    """
    fallback = "Could not retrieve related information."

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # treat HTTP 4xx/5xx as failures too
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return fallback

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # First <div> whose class matches the recommended legal case
    related_div = soup.find('div', class_=legal_case)
    if related_div is None:
        return fallback

    return related_div.get_text()

# Interactive prompt: classify each description and show the related page section
while True:
    user_input = input("Please describe your legal situation (or 'exit' to quit): ")

    # Typing 'exit' (any letter case) ends the session
    if user_input.lower() == 'exit':
        break

    # The first element of the prediction is converted to int and looked up
    # in label_map to recover the label's text
    pred = learn_clas.predict(user_input)
    recommended_legal_case = label_map[int(pred[0])]

    # Fetch the page section for the predicted case
    related_div = scrape_related_div(recommended_legal_case)

    # Guard clause: nothing to show for this case
    if not related_div:
        print(f"No relevant information found for '{recommended_legal_case}'.")
        continue

    print(
        f"Recommended Legal Case: {recommended_legal_case}",
        "\nRelated Div:",
        related_div,
        sep="\n",
    )

print("Exiting the program.")

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.720233,4.750647,0.225543,115.659096,00:26


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.579144,4.453135,0.232337,85.895805,00:26
1,4.417537,4.072173,0.233696,58.684353,00:24
2,4.217729,3.984534,0.234375,53.760208,00:25


Please describe your legal situation (or 'exit' to quit): exit
Exiting the program.
