### **Install Ludwig and Ludwig's LLM related dependencies.**


In [None]:
!pip uninstall -y tensorflow --quiet
!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install --upgrade git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/ludwig-ai/ludwig.git@master --quiet

# !pip show torch
# !pip show transformers

!pip install --upgrade datasets
!pip install py7zr
!pip install xformers
!pip install accelerate
# !pip install -i https://pypi.org/simple/ bitsandbytes
!pip install dask[dataframe]
!pip install -U bitsandbytes

!pip uninstall -y torch torchvision torchaudio torchtext --quiet
!pip install torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/cu118

!pip install pymongo

In [None]:
pip install --upgrade --force-reinstall scipy pandas matplotlib

In [None]:
import os
import copy
import gc
from typing import Any, Callable
import time
from functools import wraps
from inspect import ( BoundArguments, signature )
from collections import OrderedDict
from google.colab import data_table
import yaml
import numpy as np
import pandas as pd
import torch
from torch import Tensor
import datasets
from datasets import load_dataset, Dataset, DatasetDict
import transformers
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, LlamaForCausalLM, MistralForCausalLM, AutoTokenizer, LlamaTokenizerFast, GenerationConfig, TextGenerationPipeline, BatchEncoding
from transformers.generation.utils import GreedySearchDecoderOnlyOutput
from peft import PeftModel, PeftModelForCausalLM, PeftConfig, LoraConfig
from ludwig.api import LudwigModel, TrainingResults
import logging

import datasets
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from google.colab import drive

import requests
import csv

import pymongo
from pymongo import MongoClient


os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Enable text wrapping so we don't have to scroll horizontally and create a function to flush CUDA cache.

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset
import json

# Settings for the Hugging Face dataset download.
DATASET_NAME = "FreedomIntelligence/medical-o1-reasoning-SFT"
CONFIG_NAME = "en"  # one of: 'en', 'zh', 'en_mix', 'zh_mix'
OUTPUT_FILE = f"medical{CONFIG_NAME}.json"

# No split is requested, so load_dataset returns a mapping of split name -> data.
print(f"Loading dataset: {DATASET_NAME} with config: {CONFIG_NAME}")
dataset = load_dataset(DATASET_NAME, CONFIG_NAME)

# Each split is written to its own file: <OUTPUT_FILE without extension>_<split>.json
output_stem = OUTPUT_FILE.rsplit('.', 1)[0]
for split_name, split_data in dataset.items():
    split_output_file = f"{output_stem}_{split_name}.json"
    split_data.to_json(split_output_file)

print("Done.")

In [None]:
import time
import csv
from datasets import load_dataset

DELAY_SECONDS = 1
# Dump every split of `dataset` as CSV rows into OUTPUT_FILE.
# The file is opened once, outside the split loop: opening it in "w" mode per
# split (as before) truncated the file each time, so only the last split survived.
# NOTE(review): OUTPUT_FILE ends in ".json" although the content written here is CSV;
# the name is kept because a later cell reads the same path with csv.reader.
with open(OUTPUT_FILE, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    headers = None

    for split_name, split in dataset.items():
        if headers is None:
            # The header row comes from the first split and is written only once.
            headers = split.column_names
            writer.writerow(headers)

        for example in split:
            # Missing columns are written as empty cells.
            writer.writerow([example.get(col, "") for col in headers])

print(f"Extraction complete! Data saved to {OUTPUT_FILE}")

### Update Dataset to MongoDB

- Generate the dataset in questionnaire form (question/answer pairs) using a "question-generation" model.
- Push the current abstracts to MongoDB so the dataset can be reused from persistent storage.
- Grow the dataset by collecting new abstracts weekly.

In [None]:
import getpass

# Never keep credentials in the notebook: read the connection string from the
# MONGODB_URI environment variable, and fall back to a hidden interactive prompt.
mongodb_uri = os.environ.get("MONGODB_URI") or getpass.getpass("MongoDB connection URI: ")
client = pymongo.MongoClient(mongodb_uri)

# Database and collection that hold the dataset records.
db = client["dataset_collection"]
collection = db["data"]

# Lazily created pipeline shared by every generate_qa_pairs call.
_QA_GENERATION_PIPELINE = None


def generate_qa_pairs(abstract):
    """Generate a question/answer pair for ``abstract`` and format it as text.

    Args:
        abstract: Source text passed to the "question-generation" pipeline.

    Returns:
        A string of the form ``"input: <question>\\nanswer: <answer>"``.

    The pipeline is built on first use and then reused; constructing it on every
    call (once per dataset row) reloads the model each time.
    """
    global _QA_GENERATION_PIPELINE
    if _QA_GENERATION_PIPELINE is None:
        # NOTE(review): "question-generation" does not look like a built-in
        # transformers pipeline task — confirm the task/model intended here.
        _QA_GENERATION_PIPELINE = transformers.pipeline("question-generation")

    # Assumes the pipeline returns a mapping with "question" and "answer" keys — TODO confirm.
    qa_pairs = _QA_GENERATION_PIPELINE(abstract)
    return "input: "+qa_pairs["question"] + '\n' + "answer: "+qa_pairs["answer"]

# Read the exported CSV back and store one MongoDB document per data row.
with open(OUTPUT_FILE, mode="r", newline="", encoding="utf-8") as file:
    csv_reader = csv.reader(file)
    # The export step writes the column names as the first row; skip it so the
    # header is not inserted as if it were a record.
    next(csv_reader, None)
    for row in csv_reader:
        # NOTE(review): positional access assumes at least four columns in this
        # order — verify against the header written by the export cell.
        train = row[0]
        Complex_CoT = row[1]
        question = row[2]
        response = row[3]
        document = {
            "train": train,
            "Complex_CoT": Complex_CoT,
            "question": question,
            "response": response,
            # The original passed an undefined name (`abstract`), which raised
            # NameError on the first row.
            # NOTE(review): Complex_CoT is used as the source text because later
            # cells treat it as the context — confirm this is the intended field.
            "input": generate_qa_pairs(Complex_CoT)
        }
        collection.insert_one(document)

In [None]:
def predict(model: LudwigModel, df_test: pd.DataFrame) -> list[list[str]]:
    """Run ``model.predict`` on ``df_test`` and return its ``answer_response`` column as a list."""
    # model.predict returns a tuple; its first element holds the predictions frame.
    predictions_frame = model.predict(df_test)[0]
    answer_column = predictions_frame["answer_response"]
    return answer_column.tolist()

### **Import Dataset** 📋




#### Import Dataset from MongoDB and connect to Google Drive

- The current dataset acts as the training dataset for the model that is generated and stored in Google Drive.

In [None]:
drive.mount('/content/drive')

In [None]:
drive_path = '/content/drive/MyDrive/project-medichat/'
content_path = '/mnt/medical.json'

In [None]:
import json

# Pull every document out of the MongoDB collection into a plain list.
cursor = collection.find()
data = []
for doc in cursor:
    doc["train"] = str(doc["train"])
    data.append(doc)

# Report a count instead of printing every document (keeps the cell output small).
print("# Documents exported from MongoDB:", len(data))

# default=str converts values json cannot encode natively — in particular the
# ObjectId stored under "_id", which otherwise makes json.dump raise TypeError.
with open(content_path, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4, default=str)

#### Preparing Dataset

train, test, validation, evaluation

- Observations:

     1. The maximum training set size that Google Colab Pro can accommodate is about 25k abstract records, depending on the trainer configuration (effective_batch_size, epochs, train_steps).
     2. Above 25k records the run goes out of memory: the CUDA GPU cannot fit the training process, which requires more than 40 GB of memory.

In [None]:
import json
# Path to your file
content_path = '/mnt/medical.json'

with open(content_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

# The export cell writes an indented JSON array, which cannot be parsed one line
# at a time. Accept both layouts: a single JSON array, or JSON Lines
# (one object per line, blank lines ignored).
if raw_text.lstrip().startswith('['):
    alldata = json.loads(raw_text)
else:
    alldata = [json.loads(line) for line in raw_text.splitlines() if line.strip()]

print("# Total alldata samples:", len(alldata))

In [None]:
total_samples = len(alldata)
# Fractions for train / validation / test; the test part is whatever remains.
train_split, val_split, test_split = 0.7, 0.15, 0.15

# Slice boundaries (truncated to whole records).
train_end = int(train_split * total_samples)
val_end = train_end + int(val_split * total_samples)

# Contiguous, order-preserving slices of the record list.
train_dataset, validation_dataset, test_dataset = (
    alldata[:train_end],
    alldata[train_end:val_end],
    alldata[val_end:],
)

# Report how many records landed in each part.
for label, subset in (
    ("train_dataset", train_dataset),
    ("validation_dataset", validation_dataset),
    ("test_dataset", test_dataset),
):
    print(f"# {label} samples:", len(subset))

In [None]:
from pathlib import Path
current_directory = Path.cwd()
print(current_directory)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Convert lists to dataframes
df_train = pd.DataFrame(train_dataset)
df_test = pd.DataFrame(test_dataset)
df_validation = pd.DataFrame(validation_dataset)
df_evaluation = pd.DataFrame(test_dataset)  # testset

### combining dataset to train, test, validation as df_dataset

As the initial process, we train the Mistral-7B base model with the complete dataset of abstracts. Then we fine-tune the model with the individual train and validation datasets for the questionnaire use case.

In [None]:
from google.colab import data_table;
data_table.enable_dataframe_formatter()

import numpy as np
np.random.seed(123)

In [None]:
# adding split column to train, test and validation
df_train["split"] = np.zeros(df_train.shape[0])
df_test["split"] = np.ones(df_test.shape[0])
df_validation["split"] = np.full(df_validation.shape[0], 2)

# creating a dataset dataframe
df_dataset = pd.concat([df_train, df_test, df_validation])

In [None]:
df_validation.head()

In [None]:
def _add_prompt_columns(frame):
    """Add `context`, `question`, `answer` and `input` columns to ``frame`` in place.

    The first three are read from the record stored in the `train` column
    (`Complex_CoT`, `Question` and `Response` respectively); `input` combines
    the question and the context into one text field.
    """
    for column, record_key in (
        ('context', 'Complex_CoT'),
        ('question', 'Question'),
        ('answer', 'Response'),
    ):
        frame[column] = frame['train'].apply(lambda record, key=record_key: record[key])
    frame['input'] = 'question: "' + frame['question'] + '" \n context: "' + frame['context'] + '"'


_add_prompt_columns(df_dataset)
_add_prompt_columns(df_evaluation)

In [None]:
df_dataset["split"] = df_dataset["split"].astype(int)

In [None]:
# Positional 70/15/15 split of the combined frame.
# NOTE(review): df_dataset was built earlier by concatenating the train, test and
# validation frames in that order, so slicing it by position here does not
# reproduce the earlier validation/test assignment — confirm this is intended.
total_samples = len(df_dataset)
train_split = 0.7  # 70% training
val_split = 0.15   # 15% validation
test_split = 0.15  # 15% testing (not used below; the test part is the remainder)

# Slice boundaries, truncated to whole rows
train_end = int(train_split * total_samples)
val_end = train_end + int(val_split * total_samples)

# Contiguous row slices of df_dataset
train_dataset = df_dataset[:train_end]
validation_dataset = df_dataset[train_end:val_end]
test_dataset = df_dataset[val_end:]

# Print the sizes
print("# train_dataset samples:", len(train_dataset))
print("# validation_dataset samples:", len(validation_dataset))
print("# test_dataset samples:", len(test_dataset))

In [None]:
# Convert lists to dataframes
df_train = pd.DataFrame(train_dataset)
df_test = pd.DataFrame(test_dataset)
df_validation = pd.DataFrame(validation_dataset)
df_evaluation = pd.DataFrame(test_dataset)  # testset

In [None]:
# adding split column to train, test and validation
df_train["split"] = np.zeros(df_train.shape[0])
df_test["split"] = np.ones(df_test.shape[0])
df_validation["split"] = np.full(df_validation.shape[0], 2)

# creating a dataset dataframe
df_dataset = pd.concat([df_train, df_test, df_validation])

### Data visualization

In [None]:
df_dataset.shape

In [None]:
df_train.head(2)

In [None]:
df_test.head(2)

In [None]:
df_validation.head(2)

In [None]:
df_dataset.head(3)

In [None]:
# Calculating the length of each cell in each column
df_dataset['num_characters_context'] = df_dataset['context'].apply(lambda x: len(x))
df_dataset['num_characters_question'] = df_dataset['question'].apply(lambda x: len(x))
df_dataset['num_characters_answer'] = df_dataset['answer'].apply(lambda x: len(x))

# Show Distribution
df_dataset.hist(column=['num_characters_context', 'num_characters_question', 'num_characters_answer'])

# Calculating the average
average_chars_context = df_dataset['num_characters_context'].mean()
average_chars_question = df_dataset['num_characters_question'].mean()
average_chars_answer = df_dataset['num_characters_answer'].mean()

print(f'Average number of tokens in the context column: {(average_chars_context / 3):.0f}')
print(f'Average number of tokens in the question column: {(average_chars_question / 3):.0f}')
print(f'Average number of tokens in the answer column: {(average_chars_answer / 3):.0f}')

In [None]:
df_evaluation.head(2)

## Use base model to Inference

1. The model is a sharded version of Mistral-7B: its 7 billion parameters are divided into 8 separate parts (shards).

2. This give us an advantage on efficient processing and training of very large models by distributing the computational load, especially when dealing with memory constraints on a single device in Google Colab Pro Subscription.

3. With the parameters (tensors) stored in a 16-bit float representation, the model requires a 40 GB NVIDIA A100 GPU.


### load base model

In [None]:
bnb_config_base_model: BitsAndBytesConfig = BitsAndBytesConfig(
    load_in_8bit=True,
)

In [None]:
mistral_7b_sharded_base_model_name: str = "alexsherstinsky/Mistral-7B-v0.1-sharded"

In [None]:
# Load the tokenizer that belongs to the sharded base model; padding is applied on the left.
base_model_tokenizer: LlamaTokenizerFast = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=mistral_7b_sharded_base_model_name, trust_remote_code=True, padding_side="left")
print(base_model_tokenizer.eos_token)
# Reuse the end-of-sequence token as the padding token.
base_model_tokenizer.pad_token = base_model_tokenizer.eos_token

In [None]:
# Load the base model with the quantization settings from bnb_config_base_model,
# letting device_map="auto" decide placement and using "offload" as the offload folder.
base_model: MistralForCausalLM = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=mistral_7b_sharded_base_model_name, device_map="auto", torch_dtype=torch.float16, offload_folder="offload", trust_remote_code=True, low_cpu_mem_usage=True, quantization_config=bnb_config_base_model)

### Inference on Base Model

- The purpose of this step is to understand how well the base model understands the context of the text in the abstracts.
- The base model is the starting point for transfer learning: the new model is obtained by training it on the new abstract data.

In [None]:
df_inference_evaluation: pd.DataFrame = df_evaluation.head(10).copy()

In [None]:
prompt_template_inference: str = """
[INST] <<SYS>>
You are a helpful, detailed, and polite AI assistant.
Answer the question using only the provided context.
<</SYS>>

### Input: {input}

### Answer:
[/INST]
"""

In [None]:
df_inference_evaluation["prompt"] = df_inference_evaluation["input"].apply(lambda x: prompt_template_inference.format(**{"input": x}))

In [None]:
# Text-generation pipeline wrapping the already-loaded base model and its tokenizer.
# NOTE(review): torch_dtype and device_map are passed here although `base_model`
# is an already-instantiated model — confirm they have any effect in this case.
base_model_sequences_generator: TextGenerationPipeline = transformers.pipeline(
    task="text-generation",
    tokenizer=base_model_tokenizer,
    model=base_model,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
# Generate one sampled continuation for each prompt in the inference sample.
# NOTE(review): both max_length and max_new_tokens are passed; presumably
# max_new_tokens decides the generated length — confirm how the two interact.
base_model_sequence = base_model_sequences_generator(
    text_inputs=df_inference_evaluation["prompt"].to_list(),
    do_sample=True,  # sample instead of greedy decoding
    top_k=50,
    num_return_sequences=1,
    eos_token_id=base_model_tokenizer.eos_token_id,
    max_length=512,  # passed together with max_new_tokens (see note above)
    max_new_tokens=100,  # upper bound on newly generated tokens
    truncation=True,  # truncate over-long inputs
    return_text=True,
)


In [None]:
# Print every generated answer, each wrapped in its own BEGIN/END markers.
# Previously the BEGIN marker was printed once before the loop, so only
# example 0 had one.
print(f'\n[BASE_MODEL_EVALUATION_BEGIN]')
for idx, answer in enumerate(base_model_sequence):
  print(f'\n[=============EXAMPLE_{idx}_BEGIN=============]')
  # Each pipeline result is a list of candidates; one sequence was requested per prompt.
  print(f'\n[BASE_MODEL_EVALUATION] GENERATED_ANSWER:\n{answer[0]["generated_text"]}')
  print(f'\n[=============EXAMPLE_{idx}_END=============]')

print(f'\n[BASE_MODEL_EVALUATION_END]')

## finetuning process.

1. Temperature in generation: the lower the temperature, the more conservative and deterministic the generated text is, because the model is more likely to select the highest-probability word as the next word. The higher the temperature, the more diverse and less deterministic the generated text is, because the model may select lower-probability words (more random sampling).

2. Adapter is used for fine tuning, which allows the model to learn additional knowledge on a specific task or data set while maintaining minor modifications to the overall structure of the model. Adapters can be added to individual layers of a pretrained model to allow fine-tuning or scaling without affecting the overall parameters of the model.

3. This part is: `qlora_fine_tuning_config: dict = yaml.safe_load(...)`.
The argument is a YAML-formatted configuration string that contains the settings for the transfer-learning (fine-tuning) task.

In [None]:
# Ludwig fine-tuning configuration, parsed from an inline YAML document.
# It sets: the sharded Mistral-7B base model, one text input and one text output
# feature, a prompt template, a LoRA adapter, 8-bit quantization, a fixed
# dataset split, and the trainer settings.
# NOTE(review): the split type is "fixed", so the training frame presumably has
# to carry a split column — confirm its name and codes against the Ludwig docs.
# NOTE(review): both train_steps and epochs are set in the trainer section —
# confirm which one bounds training.
qlora_fine_tuning_config: dict = yaml.safe_load(
"""
model_type: llm
base_model: alexsherstinsky/Mistral-7B-v0.1-sharded

input_features:
  - name: prompt
    type: text
    preprocessing:
      max_sequence_length: 256

output_features:
  - name: answer
    type: text
    preprocessing:
      max_sequence_length: 256

prompt:
  template: |
    [INST] <<SYS>>
    You are a helpful, detailed, and polite AI assistant.
    Answer the question using only the provided context.
    <</SYS>>

    ### Question: {question}
    ### Context: {context}

    ### Answer:
    [/INST]

generation:
  temperature: 0.8
  # max_new_tokens: 128
  max_new_tokens: 150  # The max_token=177 of the data set answer is expected to be within this range.

adapter:
  type: lora
  postprocessor:
    merge_adapter_into_base_model: true
    progressbar: true

quantization:
  bits: 8

preprocessing:
  global_max_sequence_length: 256
  split:
    # type: random
    # probabilities: [0.7, 0.1, 0.2]  Originally 90% for training, 5% for validation, 5% for testing
    type: fixed

trainer:
  type: finetune
  train_steps: 50    # 3 individual epoch. train_steps * gradient_accumulation_steps * batch size = epoch * sample_train
  epochs: 3
  batch_size: 4
  # steps_per_checkpoint: 500 # A total of 15 checkpoints are saved (originally 500)
  checkpoints_per_epoch: 1
  # eval_steps: 500
  eval_batch_size: 8
  early_stop: 3
  gradient_accumulation_steps: 2  # effective batch size = batch size * gradient_accumulation_steps

  learning_rate: 2.0e-4
  enable_gradient_checkpointing: true
  learning_rate_scheduler:
    decay: cosine
    warmup_fraction: 0.03
    reduce_on_plateau: 0
  use_mixed_precision: true
  validation_field: combined
  validation_metric: loss
  enable_profiling: true  #Enable training process profiling using torch.profiler.profile
  profiler:
     wait: 1
     warmup: 1
     active: 3
     repeat: 5
     skip_first: 0
  skip_all_evaluation: false
"""
)

### Use LudwigModel for fine-tuning,

LudwigModel is the entry point of Ludwig, a library used to train models and to use them for prediction and evaluation. It is based on datatype abstraction, so the same preprocessing and postprocessing is applied to different datasets that share datatypes, and the same encoding and decoding models can be reused across several tasks.

1. load the configuration file `qlora_fine_tuning_config` and build and train the model based on the parameters defined in it.


In [None]:
model: LudwigModel = LudwigModel(config=qlora_fine_tuning_config, logging_level=logging.INFO)

Check GPU usage and clear CUDA before finetuning

In [None]:
import torch

def _report_cuda_memory(heading):
    """Print ``heading`` followed by the current and peak allocated CUDA memory."""
    bytes_per_gb = 1024**3
    print(heading)
    print(f"Current CUDA memory allocated: {torch.cuda.memory_allocated() / bytes_per_gb:.2f} GB")
    print(f"Max CUDA memory allocated: {torch.cuda.max_memory_allocated() / bytes_per_gb:.2f} GB")


# Memory figures before and after releasing the cached CUDA memory.
_report_cuda_memory("\nBefore clearing CUDA cache:")
torch.cuda.empty_cache()
_report_cuda_memory("\nAfter clearing CUDA cache:")

# How many GPUs are visible.
num_gpus = torch.cuda.device_count()
print("\nNumber of available GPUs:", num_gpus)

# Total memory of each visible GPU.
for i in range(num_gpus):
    gpu_properties = torch.cuda.get_device_properties(i)
    print(f"GPU {i} - Total memory: {gpu_properties.total_memory / 1024**3:.2f} GB")


The train method of the LudwigModel object is called to train the model using the given data set df_dataset.

In [None]:
import gc # Replace with your actual variable names
gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [None]:
results: TrainingResults = model.train(dataset=df_dataset,llm_int8_enable_fp32_cpu_offload=True, device_map="from_pretrained")   # Will save relevant files in current path and create a ./results folder in current path

In [None]:
# Rebuild the `input` column from the 'Question' and 'Complex_CoT' entries of the
# record stored in each row's 'train' column.

# Assumes every value in 'train' is a dict-like record — TODO confirm.
df_dataset['input'] = df_dataset.apply(
    lambda row: f'Question: "{row["train"]["Question"]}" \n context: "{row["train"]["Complex_CoT"]}"', axis=1
)

# Same derivation for df_evaluation
df_evaluation['input'] = df_evaluation.apply(
    lambda row: f'Question: "{row["train"]["Question"]}" \n context: "{row["train"]["Complex_CoT"]}"', axis=1
)


# Template with {input} and {answer} placeholders
prompt_template = "Given the question: {input}, the expected answer is: {answer}"

# NOTE(review): this is a second model.train call on the same `model` object (the
# previous cell already trains it) — confirm both runs are wanted.
# NOTE(review): verify that LudwigModel.train actually consumes the
# llm_int8_enable_fp32_cpu_offload, device_map and prompt_template keyword arguments.
results: TrainingResults = model.train(
    dataset=df_dataset,
    llm_int8_enable_fp32_cpu_offload=True,
    device_map="from_pretrained",
    prompt_template=prompt_template  # see the note above
)

### Saving model to Drive

In [None]:
import shutil

# Folder on Drive that will receive the training results.
destination_path = os.path.join(drive_path, 'mistral-7b-ml')

# Make sure the parent folder exists. The destination itself must NOT exist when
# the results are copied, so it is no longer created just to be deleted again.
os.makedirs(os.path.dirname(destination_path), exist_ok=True)

# Remove the results of a previous run, if any.
if os.path.exists(destination_path):
    shutil.rmtree(destination_path)

In [None]:
# Copy the results folder to the specified path
shutil.copytree('./results', destination_path)  #Manually add the path created by the model


### Perform Inference（after fine-tuning）

We can now use the model we finetuned above to make predictions on some test examples to see whether finetuning the large language model improve its ability to follow instructions/the tasks we're asking it to perform.

Use the trained Ludwig model to predict the evaluation data set df_evaluation

Use the `model.predict` method to make predictions on the evaluation data set. The returned result, `predictions_and_probabilities`, is a tuple: its first element is the DataFrame containing the prediction results, and its second element is the path of the output directory (Ludwig documents `predict` as returning `(predictions, output_directory)`).

In [None]:
df_evaluation_1 = df_evaluation.head(20)

In [None]:
# Element 0 is the predictions DataFrame; Ludwig documents the second element of
# the returned tuple as the output directory path.
predictions_and_probabilities: tuple[pd.DataFrame, str] = model.predict(df_evaluation_1)

Extract the DataFrame where the prediction results are located from the tuple predictions_and_probabilities and assign it to the variable df_predictions

In [None]:
df_predictions: pd.DataFrame = predictions_and_probabilities[0]

In [None]:
df_predictions.columns

In [None]:
# Show each evaluation input next to the answer generated for it.
print("\n\n")
for model_input, generated_answers in zip(df_evaluation_1['input'], df_predictions['answer_response']):
  # Each prediction cell is indexable; its first entry is printed.
  first_answer = generated_answers[0]
  print(f"Input:\n{model_input}")
  print(f"Generated Answer:\n{first_answer}")
  print("\n\n")


Evaluate:

In [None]:
!pip install rouge
!pip install bert-score

In [None]:
answer = df_predictions['answer_response'].apply(lambda x: x[0])  # Generated answer
ground_truth = df_evaluation.head(20)['answer']  # Refer to answer

In [None]:
!pip install rouge
!pip install bert-score

import nltk

# Download the NLTK resources used by the metrics cell. Each resource is
# requested once; 'punkt' was previously downloaded twice.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Optimized code
# semantic similarity (METEOR and BERTScore)
# word and phrase level overlap (BLEU and ROUGE scores)

import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
from bert_score import score



def calculate_scores(base_answers, predict_answers):
    """Average BERTScore-F1, METEOR, BLEU and ROUGE-F between predictions and references.

    Args:
        base_answers: Iterable of reference (ground-truth) texts.
        predict_answers: Iterable of generated texts, aligned with ``base_answers``.

    Returns:
        A tuple ``(bert_f1, meteor, bleu, rouge)`` where the first three are floats
        and ``rouge`` is a dict with the keys ``'rouge-1'``, ``'rouge-2'`` and
        ``'rouge-l'``. Every value is the mean over the pairs that were scored.

    Pairs in which either text is empty are skipped and do not count towards the
    averages (they used to stay in the denominator and pulled every average down).
    If no pair can be scored, all averages are ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    rouge_metrics = ('rouge-1', 'rouge-2', 'rouge-l')

    # Keep only the pairs where both the prediction and the reference are non-empty.
    scored_pairs = [
        (answer, ground_truth)
        for answer, ground_truth in zip(predict_answers, base_answers)
        if answer and ground_truth
    ]
    num_results = len(scored_pairs)
    if num_results == 0:
        return 0.0, 0.0, 0.0, {metric: 0.0 for metric in rouge_metrics}

    hypotheses = [answer for answer, _ in scored_pairs]
    references = [ground_truth for _, ground_truth in scored_pairs]

    # BERTScore: one batched call for all pairs instead of one call per pair.
    _, _, F1 = score(hypotheses, references, lang='en', verbose=False)
    total_bert_score = F1.sum().item()

    total_meteor_score = 0
    total_bleu_score = 0
    total_rouge_score = {metric: 0.0 for metric in rouge_metrics}
    rouge = Rouge()  # one scorer reused for every pair

    for answer, ground_truth in scored_pairs:
        # Tokenize hypothesis and reference
        hypothesis_tokens = word_tokenize(answer)
        reference_tokens = word_tokenize(ground_truth)

        # METEOR
        total_meteor_score += meteor_score([reference_tokens], hypothesis_tokens)

        # BLEU: uniform 1- to 4-gram weights, no smoothing
        total_bleu_score += sentence_bleu(
            [reference_tokens],
            hypothesis_tokens,
            weights=(0.25, 0.25, 0.25, 0.25),
            smoothing_function=None,
        )

        # ROUGE: accumulate the F-measure of each variant
        rouge_scores = rouge.get_scores(answer, ground_truth)[0]
        for metric, scores in rouge_scores.items():
            total_rouge_score[metric] += scores['f']

    average_bert_score = total_bert_score / num_results
    average_meteor_score = total_meteor_score / num_results
    average_bleu_score = total_bleu_score / num_results
    average_rouge_score = {metric: total / num_results for metric, total in total_rouge_score.items()}

    return average_bert_score, average_meteor_score, average_bleu_score, average_rouge_score


def load_results(file_path):
    """Read and return the JSON content of ``file_path``.

    A message is printed and an empty list is returned when the file does not
    exist or does not contain valid JSON.
    """
    try:
        handle = open(file_path, "r")
    except FileNotFoundError:
        print("File not found:", file_path)
        return []

    with handle:
        try:
            return json.load(handle)
        except json.JSONDecodeError:
            print("Invalid JSON format in file:", file_path)
            return []


average_bert_score, average_meteor_score, average_bleu_score, average_rouge_score = calculate_scores(ground_truth, answer)

# Print average scores
print("Average BERTScore:", average_bert_score)
print("Average METEOR score:", average_meteor_score)
print("Average BLEU score:", average_bleu_score)
print("Average Rouge score:", average_rouge_score)



Analyze and evaluate this result


These results are an evaluation of the model performance. The following is an analysis and evaluation for each indicator:

Average BERTScore (0.84):
BERTScore is a metric used to measure the semantic similarity between the generated text and the reference text. It uses the pre-trained BERT model to encode the sentences and calculate the similarity score between them. The average BERTScore here is 0.84, indicating that the semantic similarity between the text generated by the model and the reference text is high.

Average METEOR Score (0.32):
The METEOR score is another metric for evaluating the quality of machine translation. It takes into account word-level alignment as well as sentence-level semantic similarity. The average METEOR score is 0.32, indicating that the text generated by the model is consistent with the reference text to a certain extent.

Average BLEU Score (0.0868):
The BLEU score is used to evaluate the quality of machine translation, and its range is between 0 and 1, where 1 indicates a perfect match. The average BLEU score here is about 0.0868, which means that the match between the text generated by the model and the reference text is relatively low. Possible reasons include differences in vocabulary selection, syntactic structure, etc.

Average Rouge Score:
ROUGE scores are used to evaluate the degree of overlap between the generated text and the reference text, including word-level and sentence-level overlap. The average scores of the three ROUGE indicators are provided here:

rouge-1: The average value is about 0.321, indicating that the overlap between the single words generated by the model and the single words in the reference text is good.

rouge-2: The average value is about 0.124, indicating that the overlap between the phrases composed of two words generated by the model and the phrases in the reference text is low.

rouge-l: The average value is about 0.295, indicating that the length of the longest common subsequence between the text generated by the model and the reference text is high, that is, the overlap at the sentence level is good.

Overall, the model performs well in terms of semantic similarity (high METEOR and BERTScore), but there may be room for improvement in terms of word and phrase-level overlap (relatively low BLEU and ROUGE scores). Possible improvements include model tuning, better training data, improved generation strategies, etc.

## Use model for question answering

In [None]:
prompt_template: str = """
You are a helpful, respectful and honest assistant. \
Your task is to generate an answer to the given question. \
And your answer should be based on the provided context only.

### input: {prompt}

### Answer:
"""

In [None]:
# Load the fine-tuned weights from the folder this notebook copies ./results to
# (drive_path + 'mistral-7b-ml'). The previous hardcoded path pointed at a
# different Drive project folder ('project-kalki') than the one used for saving.
model_path = os.path.join(drive_path, 'mistral-7b-ml', 'api_experiment_run', 'model', 'model_weights')
tokenizer: LlamaTokenizerFast = AutoTokenizer.from_pretrained(
  pretrained_model_name_or_path = model_path,
  trust_remote_code=True,
  padding_side="left"
)

# Default bitsandbytes configuration (no options set).
bnb_config_samsum_fine_tuned_model: BitsAndBytesConfig = BitsAndBytesConfig()

model_load: MistralForCausalLM = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_path,
    # torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config_samsum_fine_tuned_model,
    # low_cpu_mem_usage=True
)

# Text-generation pipeline around the fine-tuned model; used by the question-answering cell.
generator: TextGenerationPipeline = transformers.pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model_load,
    # torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
def infer(user_input):
  """Answer ``user_input`` with the fine-tuned model.

  The question is inserted into ``prompt_template``, the resulting prompt is
  printed, and the text generated for that prompt is returned.
  """
  prompt = prompt_template.format(prompt=user_input)
  print(prompt)
  # Send the formatted prompt to the model. The raw user_input was passed before,
  # so the instructions in the template never reached the model.
  return generator(prompt)[0]['generated_text']

# Interactive loop: keep answering questions until the user types 'exit'.
for user_input in iter(lambda: input('Please enter question for an article: '), 'exit'):
  print(infer(user_input))