In [1]:
!pip install -q gradio
!pip install -q unsloth

In [2]:
from unsloth import FastLanguageModel
from transformers import TextStreamer
import random
import gradio as gr
import torch

max_seq_length = 3072
dtype = torch.bfloat16

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:

immigration_model, immigration_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "gdevakumar/llama3_1_immigration_lora",
    max_seq_length = max_seq_length,
    dtype = dtype
)

FastLanguageModel.for_inference(immigration_model)


insurance_model, insurance_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "gdevakumar/llama3_1_insurance_lora",
    max_seq_length = max_seq_length,
    dtype = dtype
)

FastLanguageModel.for_inference(insurance_model)



==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Device does not support bfloat16. Will change to float16.
Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Device does not support bfloat16. Will change to float16.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
      

In [4]:

insurance_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: You are an Expert Insurance agent with real world expertise in Auto, Medical, Health, Property insurances. Based on the given input question, return a simple, clear, accurate and professional response that answers the question.

### Input:
{}

### Response: """


immigration_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: You are an Expert US Immigration agent with real world expertise in dealing immigration based legal cases and consultations in the USA. Based on the given input question, return a simple, clear, accurate and professional response that answers the question.

### Input:
{}

### Response: """



In [5]:
def get_insurance_response(message, history):
    inputs = insurance_tokenizer(
    [
        insurance_prompt.format(
            message
        )
    ], return_tensors = "pt").to("cuda")
    outputs = insurance_model.generate(**inputs, max_new_tokens = 128, use_cache = True)
    answer = insurance_tokenizer.batch_decode(outputs)
    return answer[0].split("Response:")[-1]


def get_immigration_response(message, history):
    inputs = immigration_tokenizer(
    [
        immigration_prompt.format(
            message
        )
    ], return_tensors = "pt").to("cuda")
    outputs = immigration_model.generate(**inputs, max_new_tokens = 128, use_cache = True)
    answer = immigration_tokenizer.batch_decode(outputs)
    return answer[0].split("Response:")[-1]


In [6]:
inputs = insurance_tokenizer(
    [
        insurance_prompt.format(
            "Why is auto insurance important?"
        )
    ], return_tensors = "pt").to("cuda")
outputs = insurance_model.generate(**inputs, max_new_tokens = 128, use_cache = True,)
answer = insurance_tokenizer.batch_decode(outputs)

In [7]:
answer[0].split("Response:")[-1]

' 1. Often times an auto insurance policy is required by the state in which you reside. Thus, viewing ownership of an automobile as a privilege rather than a right, it is important to comply with state laws in order to maintain good standing with in the community in which you live. 2. In addition to state mandated regulations, it is important to have auto insurance in order to protect yourself against financial devistations should you experience a covered loss. Auto insurance can provide you with financial security should your car be involved in an accident, be damaged due to a fire or theft or be involved in any other covered event. <|end_of_text|>'

In [8]:
inputs = immigration_tokenizer(
    [
        immigration_prompt.format(
            "What is H1b visa?"
        )
    ], return_tensors = "pt").to("cuda")
outputs = immigration_model.generate(**inputs, max_new_tokens = 128, use_cache = True,)
answer = immigration_tokenizer.batch_decode(outputs)


In [None]:
answer[0].split("Response:")[-1]

In [10]:
insurance = gr.ChatInterface(get_insurance_response)
immigration = gr.ChatInterface(get_immigration_response)

demo = gr.TabbedInterface([insurance, immigration], ["Insurance", "Immigration"],)

In [11]:

if __name__ == "__main__":
    demo.launch(share=True, debug=True, )


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://49bf123296a3e39d1b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://49bf123296a3e39d1b.gradio.live
