In [1]:
import os, torch, logging
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from evaluate import load

In [2]:
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16


In [3]:
torch.cuda.empty_cache()
# torch.cuda.set_per_process_memory_fraction(0.5, 0)
# torch.cuda.set_per_process_memory_fraction(0.5, 1)

In [4]:
rouge_metric = load("rouge")
rouge_metric

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [5]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)


In [6]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    #quantization_config=quant_config,
    device_map='auto'
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
base_model = PeftModel.from_pretrained(
    base_model,
    os.path.join("model", "Llama-ft_1")
    # os.path.join('./results_modified_sarvam', "checkpoint-5000")
)


In [8]:
# query = '''Explain how the given case could be applied to a hypothetical fact pattern.	
# A government-owned company, XYZ Ltd., terminates the services of a permanent employee, Mr. A, without assigning any reason by invoking a clause in the employment contract that allows such termination. Mr. A challenges the termination and the validity of the clause in the High Court under Article 226.'''
# text_gen = pipeline(task="text-generation", model=base_model, tokenizer=llama_tokenizer, max_length=200,
#                   do_sample=True,
#                   top_k=10,
#                   num_beams=5,
#                   top_p=0.9,
#                   temperature=0.7,
#                   repetition_penalty=100.0,
#                   num_return_sequences=1)
# output = text_gen(f"{query}")
# print(output[0]['generated_text'])

In [18]:
# val_data = load_dataset('nisaar/LLAMA2_Legal_Dataset_4.4k_Instructions', split='train[70%:]')
val_data = load_dataset('csv', data_files='dataset/Q&A-singleturn.csv', split=['train'])
val_data = val_data[0].shuffle(seed=42)
display(val_data)
val_data[0]

Dataset({
    features: ['Unnamed: 0', 'prompt', 'response'],
    num_rows: 1320
})

{'Unnamed: 0': 697,
 'prompt': " No, that's all for now. Thanks.\n\n",
 'response': "You're welcome. If you have further questions or need more guidance later, don't hesitate to reach out. Good luck with your complaint.\n\n\n\n\n\n"}

In [19]:
len(val_data["prompt"])

1320

In [11]:
# generated_texts = []
# batch_size = 2
# for i in range(0, len(val_data["prompt"]), batch_size):
#     batch_prompts = val_data["prompt"][i:i+batch_size]
#     input_ids_batch = llama_tokenizer(batch_prompts, padding=True, return_tensors="pt", truncation=True)["input_ids"].to('cuda')
    
#     # Generate text for the batch
#     with torch.no_grad():
#         output_batch = base_model.generate(input_ids_batch)
    
#     # Decode the generated texts
#     for j in range(len(batch_prompts)):
#         generated_text = llama_tokenizer.decode(output_batch[j], skip_special_tokens=True)
#         generated_texts.append(generated_text)
#     print("Batch ", i//batch_size, " of ", len(val_data["prompt"])/8, " done")
# reference_texts = val_data["text"]      #Replace reference column name according to dataset
# rouge_scores = rouge_metric.compute(predictions=generated_texts[0: len(reference_texts)], 
#                                        references=reference_texts,
#                                       use_aggregator=True)

# print("Rouge Score:", rouge_scores)

In [12]:
# reference_texts = val_data["text"]      #Replace reference column name according to dataset
# rouge_scores = rouge_metric.compute(predictions=generated_texts, 
#                                        references=reference_texts[0: len(generated_texts)],
#                                       use_aggregator=True)

# print("Rouge Score:", rouge_scores)

In [21]:
generated_texts = []
base_model.eval()
count = 0
for example in val_data["prompt"][:50]:
    # Tokenize the input query
    input_text = example
    input_ids = llama_tokenizer(input_text, return_tensors="pt")["input_ids"].to('cuda')
    
    # Generate text using the model
    output = base_model.generate(input_ids)
    
    # Decode the generated text
    generated_text = llama_tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Append the generated text to the list
    generated_texts.append(generated_text)
    count+=1
    print("Inference", count, "is done")

#reference_texts = val_data["text"][:100]      #Replace reference column name according to dataset
reference_texts = val_data["response"][:50]      #Replace reference column name according to dataset
rouge_scores = rouge_metric.compute(predictions=generated_texts[0: len(reference_texts)], 
                                       references=reference_texts,
                                      use_aggregator=True)

print("Rouge Score:", rouge_scores)

Inference 1 is done
Inference 2 is done
Inference 3 is done
Inference 4 is done
Inference 5 is done
Inference 6 is done
Inference 7 is done
Inference 8 is done
Inference 9 is done
Inference 10 is done
Inference 11 is done
Inference 12 is done
Inference 13 is done
Inference 14 is done
Inference 15 is done
Inference 16 is done
Inference 17 is done
Inference 18 is done
Inference 19 is done
Inference 20 is done
Inference 21 is done
Inference 22 is done
Inference 23 is done
Inference 24 is done
Inference 25 is done
Inference 26 is done
Inference 27 is done
Inference 28 is done
Inference 29 is done
Inference 30 is done
Inference 31 is done
Inference 32 is done
Inference 33 is done
Inference 34 is done
Inference 35 is done
Inference 36 is done
Inference 37 is done
Inference 38 is done
Inference 39 is done
Inference 40 is done
Inference 41 is done
Inference 42 is done
Inference 43 is done
Inference 44 is done
Inference 45 is done
Inference 46 is done
Inference 47 is done
Inference 48 is done
I

In [14]:
reference_texts = val_data["text"][:100]      #Replace reference column name according to dataset
rouge_scores = rouge_metric.compute(predictions=generated_texts[0: len(reference_texts)], 
                                       references=reference_texts,
                                      use_aggregator=True)

print("Rouge Score:", rouge_scores)

Rouge Score: {'rouge1': 0.5497576292122677, 'rouge2': 0.3433784023641112, 'rougeL': 0.41054266855935584, 'rougeLsum': 0.5043940905243857}


In [None]:
# rouge_scores = rouge_metric.compute(predictions=generated_texts[0: len(reference_texts)], 
#                                        references=reference_texts,
#                                       use_aggregator=True)

# print("Rouge Score:", rouge_scores)

Rouge Score: {'rouge1': 0.42768532563811956, 'rouge2': 0.35632321992518257, 'rougeL': 0.39870255084373896, 'rougeLsum': 0.40893938205307623}


On nisaar legal dataset
Finetuned(nisaar) model(100): Rouge Score: {'rouge1': 0.5497576292122677, 'rouge2': 0.3433784023641112, 'rougeL': 0.41054266855935584, 'rougeLsum': 0.5043940905243857}
Without finetuning(100): Rouge Score: {'rouge1': 0.5601444346317581, 'rouge2': 0.35125095545555085, 'rougeL': 0.41783181053062163, 'rougeLsum': 0.5121870455981233}
Without finetuning(1000): Rouge Score: {'rouge1': 0.5908812129578511, 'rouge2': 0.4445513273550393, 'rougeL': 0.5108490265999285, 'rougeLsum': 0.5437536631076831}

On q_a dataset:
Finetuned(nissar): Rouge Score: {'rouge1': 0.06925840430674518, 'rouge2': 0.00974999329424761, 'rougeL': 0.050987059847001595, 'rougeLsum': 0.05898872485373344}


In [None]:
# reference_texts

['The case R. Rajagopal vs State Of T.N, (1994) 6 SCC 632, has had a significant influence on the principles of stare decisis in India. Stare decisis is a legal principle that refers to the doctrine of precedent, which means that courts should follow the decisions of higher courts in similar cases. In this case, the Supreme Court of India established the right to privacy as a fundamental right under Article 21 of the Constitution. This landmark decision has set a precedent for future cases involving the right to privacy and has become a binding authority for lower courts. As a result, any future cases involving the right to privacy will have to consider and apply the principles laid down in the R. Rajagopal case. This case has strengthened the principle of stare decisis in India by establishing a clear and authoritative precedent on the right to privacy, which must be followed by all courts in the country.',
 "In the case of Reserve Bank of India vs Palai Central Bank Ltd, the High Cou