In [40]:
pip install torch transformers langchain_pinecone datasets pinecone-client langchain-community scikit-learn langchain datasets

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB 320.0 kB/s eta 0:00:05
   - -------------------------------------- 0.1/1.5 MB 544.7 kB/s eta 0:00:03
   -------- ------------------------------- 0.3/1.5 MB 2.3 MB/s eta 0:00:01
   ----------------------------------- ---- 1.3/1.5 MB 7.1 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.9 MB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.8.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
!pip install transformers==4.17

In [None]:

!pip install wandb --upgrade

In [None]:
pip install --upgrade transformers

In [None]:
pip install accelerate -U

In [41]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import Dataset
import pinecone
from langchain_pinecone import PineconeVectorStore  
from pinecone import Pinecone
from transformers import  pipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os
from sklearn.model_selection import train_test_split

# Pinecone

In [42]:
# Initialize Pinecone
# The API key is read from the PINECONE_API_KEY environment variable so the secret
# never lives in the notebook. Any key that has been saved inside a notebook should
# be treated as leaked and rotated.
if not os.environ.get("PINECONE_API_KEY"):
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this notebook."
    )
# NOTE: this rebinds the name `pinecone` (imported above as a module) to the client instance.
pinecone = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Connect
index_name = 'lawllm-unstructured-database'
index = pinecone.Index(index_name)

# Index stats
index.describe_index_stats()

3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)]


## Embeddings

In [None]:
# Embedding model used to vectorise the query (placed on the GPU when CUDA is available)
embed_model_id = 'dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn'
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device='cuda' if torch.cuda.is_available() else 'cpu'),
)

# Question whose supporting documents will be retrieved
query = '¿Qué diferencia hay entre homicidio y asesinato según el Código Penal español, y cuáles son las penas asociadas a cada uno?'

# Embed the question with the model loaded above
query_embedding = embed_model.embed_query(query)

## Retrieve Documents

In [None]:
# Query the Pinecone index with metadata included
similarity_output = index.query(vector=query_embedding, top_k=10, include_metadata=True)

# Keep only the matches that carry a 'text' metadata field, pairing each passage with its score
context_processed = [
    {"context": doc.metadata['text'], "score": doc.score}
    for doc in similarity_output.matches
    if doc.metadata and 'text' in doc.metadata
]

# Show the first few results (a slice stays valid even when fewer matches survive the filter)
print(context_processed[:3])

## Data

In [None]:
# Define the preprocess function
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer-span token labels.

    NOTE: a later cell defines another ``preprocess_function``; once that cell
    has run, it replaces this definition.

    Relies on a module-level ``tokenizer`` that must exist when the function is
    called.

    Args:
        examples: Batch mapping with ``"question"`` and ``"context"`` sequences
            and an ``"answers"`` sequence of dicts, each holding
            ``"answer_start"`` (character offset into the context) and
            ``"text"``.

    Returns:
        The tokenizer output with ``"offset_mapping"`` removed and
        ``"start_positions"`` / ``"end_positions"`` added (one token index per
        example). Both are 0 for an example whose answer does not lie fully
        inside the tokenized context.
    """
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=512,
        truncation="only_second",  # Truncate the context, never the question
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    offset_mapping = inputs.pop("offset_mapping").tolist()
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        # First and last token index that belong to the context (sequence id 1)
        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # The answer is not fully covered by the context tokens that survived
        # tokenization: label the example with index 0 instead of a bogus span.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward over context tokens only. Padding and separator tokens
        # carry (0, 0) offsets, so an unbounded scan would run through them
        # whenever the answer starts in the last context token.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Walk backward over context tokens only. Question tokens use offsets
        # relative to the question string and must not be compared with end_char.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

# Preprocessing function
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer-span token labels.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``"question"`` and ``"context"`` sequences
            and an ``"answers"`` sequence of dicts, each holding
            ``"answer_start"`` (character offset into the context) and
            ``"text"``.

    Returns:
        The tokenizer output with ``"offset_mapping"`` removed and
        ``"start_positions"`` / ``"end_positions"`` added (one token index per
        example). Both are 0 when no context token covers the answer's first
        or last character.
    """
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=512,
        truncation="only_second",  # Truncate the context, not the question
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    offset_mapping = inputs.pop("offset_mapping").tolist()
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        start_char = examples["answers"][i]["answer_start"]
        end_char = start_char + len(examples["answers"][i]["text"])

        # Sequence id per token: 1 marks tokens that come from the context
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end token indices that correspond to the start and end character positions
        start_token_idx = None
        end_token_idx = None

        for j, (start, end) in enumerate(offsets):
            # Only context tokens are candidates. Question tokens have offsets
            # relative to the question string, so matching them against
            # context character positions would yield wrong labels.
            if sequence_ids[j] != 1:
                continue
            if start <= start_char < end:
                start_token_idx = j
            if start < end_char <= end:
                end_token_idx = j
            if start_token_idx is not None and end_token_idx is not None:
                break

        # Answer not locatable in the tokenized context: fall back to index 0
        if start_token_idx is None or end_token_idx is None:
            start_token_idx = 0
            end_token_idx = 0

        start_positions.append(start_token_idx)
        end_positions.append(end_token_idx)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

# Prepare data for training
# Three parallel lists: entry k of "answers" is the answer span for question k within context k.
data = {
    "question": [
        "What is the difference between homicide and murder according to the Spanish Penal Code?",
        "What are the penalties for theft in Spain?",
        "How is fraud defined in Spanish law?",
        "What are the penalties for drug trafficking in Spain?",
        "How does Spanish law define sexual assault?",
        "What are the consequences of driving under the influence in Spain?",
        "What constitutes domestic violence under Spanish law?",
        "What are the penalties for bribery in Spain?",
        "How does Spanish law define and penalize terrorism?",
        "What are the legal consequences of vandalism in Spain?",
        "What are the requirements for obtaining Spanish citizenship?",
        "How are intellectual property rights protected in Spain?",
        "What is the legal process for forming a company in Spain?",
        "What are the regulations regarding environmental protection in Spain?",
        "How does Spanish law address discrimination in the workplace?",
        "What are the legal requirements for renting property in Spain?",
        "How are contracts enforced under Spanish law?",
        "What are the penalties for tax evasion in Spain?",
        "What is the procedure for obtaining a work visa in Spain?",
        "How are disputes resolved through arbitration in Spain?",
        "What are the regulations for starting a small business in Spain?",
        "How does Spanish law define intellectual property?",
        "What are the legal requirements for obtaining a driver's license in Spain?",
        "What are the penalties for cybercrime in Spain?",
        "How are contracts terminated under Spanish law?",
        "What are the procedures for filing for bankruptcy in Spain?",
        "How does Spanish law protect consumer rights?",
        "What are the legal grounds for divorce in Spain?",
        "What are the regulations for importing goods into Spain?",
        "How does Spanish law handle disputes over inheritance?"
    ],
    "context": [
        "According to the Spanish Penal Code, homicide is the act of killing a person without premeditation, while murder involves premeditation and other aggravating factors. The penalties for homicide range from 10 to 15 years of imprisonment, whereas murder can result in a sentence of 15 to 25 years.",
        "The Spanish Penal Code states that theft is unlawfully taking someone else's property with the intent to permanently deprive them of it. Penalties range from 6 to 18 months of imprisonment for basic theft, but can increase depending on aggravating circumstances.",
        "Fraud in Spanish law is defined as deceiving someone to gain a financial advantage. The penalties for fraud can range from 6 months to 6 years of imprisonment, depending on the severity and amount involved.",
        "Drug trafficking in Spain is a serious offense with penalties ranging from 3 to 6 years of imprisonment for basic offenses. Larger quantities and organized crime involvement can increase the penalties significantly.",
        "Sexual assault in Spanish law includes any non-consensual sexual act. The penalties for sexual assault can range from 1 to 5 years of imprisonment, but can be higher depending on the circumstances and harm caused.",
        "Driving under the influence (DUI) in Spain is defined as operating a vehicle with a blood alcohol content over the legal limit. Penalties include fines, loss of driving privileges, and imprisonment for up to 6 months.",
        "Domestic violence under Spanish law includes physical, psychological, or economic abuse by a family member or partner. Penalties can range from restraining orders and fines to several years of imprisonment.",
        "Bribery in Spain involves offering or accepting something of value to influence the actions of a public official. Penalties can range from 3 to 6 years of imprisonment, along with fines and disqualification from public office.",
        "Terrorism under Spanish law is defined as committing violent acts with the intent to disrupt public order or coerce authorities. Penalties are severe, ranging from 10 to 30 years of imprisonment.",
        "Vandalism in Spain is the intentional destruction or damage of property. Penalties can include fines, community service, and imprisonment for up to 3 years, depending on the extent of the damage.",
        "To obtain Spanish citizenship, one usually needs to reside in Spain for a certain period, demonstrate knowledge of the Spanish language and culture, and have a clean criminal record. There are exceptions for certain cases such as being married to a Spanish citizen.",
        "In Spain, intellectual property rights are protected through various laws and regulations, including patents, trademarks, and copyrights. Violations of these rights can lead to legal action and penalties.",
        "Forming a company in Spain involves several steps, including choosing a business structure, registering the company with the Commercial Registry, obtaining a tax identification number, and fulfilling any industry-specific requirements.",
        "Spain has strict regulations aimed at protecting the environment, covering areas such as pollution control, waste management, and conservation of natural resources. Violations of environmental laws can result in fines and other penalties.",
        "Discrimination in the workplace is prohibited under Spanish law, with specific protections for gender, age, disability, sexual orientation, and other characteristics. Employers found guilty of discrimination can face fines and legal action.",
        "Renting property in Spain requires a written contract between the landlord and tenant, outlining terms such as rent amount, duration, and responsibilities of both parties. Certain regulations also apply to rent increases and eviction procedures.",
        "Contracts in Spain are enforced through the courts, with parties able to seek damages or specific performance for breach of contract. Alternative dispute resolution methods such as arbitration are also available.",
        "Tax evasion in Spain is a criminal offense punishable by fines, penalties, and potential imprisonment. The severity of penalties depends on factors such as the amount of taxes evaded and the intent of the taxpayer.",
        "Obtaining a work visa in Spain typically involves securing a job offer from a Spanish employer, applying for the visa through the Spanish consulate or embassy, and meeting requirements such as proof of qualifications and medical examinations.",
        "Disputes in Spain can be resolved through arbitration, a private process where a neutral arbitrator or panel renders a decision. Arbitration offers a faster and more flexible alternative to traditional litigation.",
        "Starting a small business in Spain involves registering with the appropriate authorities, obtaining necessary licenses and permits, and complying with tax and labor regulations. There are different requirements depending on the type of business and its activities.",
        "Intellectual property in Spain encompasses copyrights, trademarks, patents, and trade secrets. These rights are protected through legislation and enforcement measures to prevent infringement.",
        "Obtaining a driver's license in Spain requires passing both theoretical and practical exams administered by the Directorate General of Traffic (DGT). Additional requirements may apply depending on the applicant's age and previous driving experience.",
        "Cybercrime in Spain covers offenses such as hacking, online fraud, identity theft, and cyberbullying. Penalties for cybercrime can include fines, imprisonment, and confiscation of electronic devices.",
        "Contracts can be terminated under Spanish law through mutual agreement, fulfillment of contractual obligations, or legal remedies for breach of contract. Specific termination procedures may vary depending on the type of contract and applicable laws.",
        "Filing for bankruptcy in Spain involves submitting an application to the commercial court, which evaluates the financial situation of the debtor and determines the appropriate proceedings. Bankruptcy proceedings can result in debt restructuring or liquidation of assets.",
        "Consumer rights in Spain are protected by legislation that regulates product quality, safety, advertising, and consumer contracts. Consumers have the right to refunds, repairs, and compensation for defective or misrepresented products.",
        "Divorce in Spain can be granted on grounds such as mutual consent, separation for a certain period, or fault-based reasons like infidelity or abuse. The legal process may involve mediation and division of assets and custody arrangements for any children.",
        "Importing goods into Spain requires compliance with customs regulations, including documentation, tariffs, and import duties. Certain products may be subject to additional regulations or restrictions.",
        "Disputes over inheritance in Spain are resolved through civil law procedures, including probate court proceedings and distribution of assets according to the deceased's will or intestacy laws. Legal challenges to wills or claims by heirs can prolong the process."
    ],
    "answers": [
        {"answer_start": 33, "text": "homicide is the act of killing a person without premeditation, while murder involves premeditation and other aggravating factors"},
        {"answer_start": 70, "text": "theft is unlawfully taking someone else's property with the intent to permanently deprive them of it"},
        {"answer_start": 32, "text": "deceiving someone to gain a financial advantage"},
        {"answer_start": 48, "text": "penalties ranging from 3 to 6 years of imprisonment for basic offenses"},
        {"answer_start": 32, "text": "any non-consensual sexual act"},
        {"answer_start": 32, "text": "operating a vehicle with a blood alcohol content over the legal limit"},
        {"answer_start": 27, "text": "physical, psychological, or economic abuse by a family member or partner"},
        {"answer_start": 0, "text": "offering or accepting something of value to influence the actions of a public official"},
        {"answer_start": 60, "text": "committing violent acts with the intent to disrupt public order or coerce authorities"},
        {"answer_start": 20, "text": "the intentional destruction or damage of property"},
        {"answer_start": 3, "text": "reside in Spain for a certain period, demonstrate knowledge of the Spanish language and culture, and have a clean criminal record"},
        {"answer_start": 7, "text": "protected through various laws and regulations, including patents, trademarks, and copyrights"},
        {"answer_start": 12, "text": "choosing a business structure, registering the company with the Commercial Registry, obtaining a tax identification number, and fulfilling any industry-specific requirements"},
        {"answer_start": 15, "text": "pollution control, waste management, and conservation of natural resources"},
        {"answer_start": 0, "text": "Discrimination in the workplace is prohibited under Spanish law, with specific protections for gender, age, disability, sexual orientation, and other characteristics"},
        {"answer_start": 0, "text": "requires a written contract between the landlord and tenant, outlining terms such as rent amount, duration, and responsibilities of both parties"},
        {"answer_start": 0, "text": "enforced through the courts, with parties able to seek damages or specific performance for breach of contract"},
        {"answer_start": 0, "text": "a criminal offense punishable by fines, penalties, and potential imprisonment"},
        {"answer_start": 0, "text": "typically involves securing a job offer from a Spanish employer, applying for the visa through the Spanish consulate or embassy, and meeting requirements such as proof of qualifications and medical examinations"},
        {"answer_start": 0, "text": "a private process where a neutral arbitrator or panel renders a decision"},
        {"answer_start": 0, "text": "registering with the appropriate authorities, obtaining necessary licenses and permits, and complying with tax and labor regulations"},
        {"answer_start": 0, "text": "encompasses copyrights, trademarks, patents, and trade secrets"},
        {"answer_start": 0, "text": "passing both theoretical and practical exams administered by the Directorate General of Traffic (DGT)"},
        {"answer_start": 0, "text": "covers offenses such as hacking, online fraud, identity theft, and cyberbullying"},
        {"answer_start": 0, "text": "through mutual agreement, fulfillment of contractual obligations, or legal remedies for breach of contract"},
        {"answer_start": 0, "text": "involves submitting an application to the commercial court, which evaluates the financial situation of the debtor and determines the appropriate proceedings"},
        {"answer_start": 0, "text": "protected by legislation that regulates product quality, safety, advertising, and consumer contracts"},
        {"answer_start": 0, "text": "granted on grounds such as mutual consent, separation for a certain period, or fault-based reasons like infidelity or abuse"},
        {"answer_start": 0, "text": "compliance with customs regulations, including documentation, tariffs, and import duties"},
        {"answer_start": 0, "text": "resolved through civil law procedures, including probate court proceedings and distribution of assets according to the deceased's will or intestacy laws"}
    ]
}

# Many of the hand-entered answer_start values above do not point at the answer
# text inside the matching context (for example the first answer is listed at 33
# although it begins at character 37, and several entries use 0 for answers that
# start mid-sentence). Re-derive each offset from the context whenever the answer
# text occurs there verbatim; entries whose text is not found are left untouched.
data["answers"] = [
    {**answer, "answer_start": context.find(answer["text"])}
    if answer["text"] in context
    else answer
    for context, answer in zip(data["context"], data["answers"])
]

# Create a Dataset object
dataset = Dataset.from_dict(data)

# Split data into train and evaluation sets.
# A fixed seed keeps the split identical on every run, and the two splits are
# looked up by name instead of relying on the order of the returned mapping.
dataset_splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

# Apply the preprocessing function
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Fine-Tuning Model

In [None]:
# Define the model
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): recent transformers releases name this argument `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",  # Specify the directory for logging
    # 24 training examples at batch size 2 for 3 epochs is 36 optimizer steps on a
    # single device, so an interval of 100 would never log the training loss.
    logging_steps=10,
)

In [None]:
# Build the Trainer from the model, the training arguments and the two tokenized splits
trainer = Trainer(
    model,
    training_args,
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)

# Run fine-tuning
trainer.train()

In [None]:
from transformers import pipeline

# Wrap the model and tokenizer in a question-answering pipeline
qa_pipeline = pipeline(
    task="question-answering",
    tokenizer=tokenizer,
    model=model,
)

In [None]:
import torch
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

# Use the GPU when CUDA is available, otherwise the CPU, and place the model there
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Build the question-answering pipeline with the matching device index (0 for CUDA, -1 otherwise)
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=0 if device.type == "cuda" else -1,
)

# Helper that asks the QA pipeline a single question
def ask_question(question, context):
    """Return the answer text the module-level ``qa_pipeline`` extracts from ``context``.

    Args:
        question: The question to ask.
        context: The passage the answer is taken from.

    Returns:
        The value stored under ``"answer"`` in the pipeline's result.
    """
    prediction = qa_pipeline(context=context, question=question)
    answer_text = prediction["answer"]
    return answer_text

# Demo: one question together with the passage that contains its answer
question = "What is the difference between homicide and murder according to the Spanish Penal Code?"
context = "According to the Spanish Penal Code, homicide is the act of killing a person without premeditation, while murder involves premeditation and other aggravating factors. The penalties for homicide range from 10 to 15 years of imprisonment, whereas murder can result in a sentence of 15 to 25 years."

# Ask the model and print the question next to the extracted answer
answer = ask_question(question=question, context=context)
print(f"Question: {question}")
print(f"Answer: {answer}")