In [1]:
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)
import transformers
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, prepare_model_for_kbit_training
import torch
import time

bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# torch.set_default_device("cuda")

model_id = "internlm/internlm2-chat-20b"

tokenizer = AutoTokenizer.from_pretrained(model_id,trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map ='auto',
                                             quantization_config=bnb_config,
                                             use_cache=True,
                                             trust_remote_code=True
                                            )
model = model.eval()


Loading checkpoint shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [3]:
model

InternLM2ForCausalLM(
  (model): InternLM2Model(
    (tok_embeddings): Embedding(92544, 6144, padding_idx=2)
    (layers): ModuleList(
      (0-47): 48 x InternLM2DecoderLayer(
        (attention): InternLM2Attention(
          (wqkv): Linear4bit(in_features=6144, out_features=8192, bias=False)
          (wo): Linear4bit(in_features=6144, out_features=6144, bias=False)
          (rotary_emb): InternLM2RotaryEmbedding()
        )
        (feed_forward): InternLM2MLP(
          (w1): Linear4bit(in_features=6144, out_features=16384, bias=False)
          (w3): Linear4bit(in_features=6144, out_features=16384, bias=False)
          (w2): Linear4bit(in_features=16384, out_features=6144, bias=False)
          (act_fn): SiLU()
        )
        (attention_norm): InternLM2RMSNorm()
        (ffn_norm): InternLM2RMSNorm()
      )
    )
    (norm): InternLM2RMSNorm()
  )
  (output): Linear(in_features=6144, out_features=92544, bias=False)
)

In [2]:
from datasets import Dataset
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

CUTOFF_LEN = 1024

config  = LoraConfig(
    r=128,
    lora_alpha=256,
    lora_dropout=0.1,
    target_modules = ["wqkv","wo"],
    bias = 'none',
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

In [3]:
from datasets import load_dataset

dataset = load_dataset('json', data_files='ft_dataset.json')

In [8]:
dataset['train'][0]['conversation']

[{'input': 'How was COVID-19 initially confirmed in the index patient?',
  'output': "The initial confirmation of COVID-19 was made by pan-coronavirus conventional polymerase chain reaction assay and sequencing of the PCR amplicons using a throat swab.This Solution proposed in 'Case of the Index Patient Who Caused Tertiary Transmission of Coronavirus Disease 2019 in Korea: the Application of Lopinavir/Ritonavir for the Treatment of COVID-19 Pneumonia Monitored by Quantitative RT-PCR' at 2020-2-14.",
  'system': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n'}]

In [4]:
def generate_prompt(x):
    chat = [
      {"role": "user", "content":x['conversation'][0]['input']},
      {"role": "assistant", "content": x['conversation'][0]['output']},
    ]
    # print(len(p))
    return chat

def tokenize(prompt):
    msg = tokenizer.apply_chat_template(conversation=prompt, tokenize=True, add_generation_prompt=False, return_tensors='pt')
    return {'input_ids':msg[0]}

train_data = dataset.shuffle().map(lambda x: tokenize(generate_prompt(x)),remove_columns=['conversation'])

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

In [5]:
import transformers

trainer = Trainer(
    model=model,
    train_dataset=train_data['train'],
    args=TrainingArguments(
        per_device_train_batch_size=5,
        # gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=1e-4,
        logging_steps=2,
        optim="adamw_torch",
        save_strategy="epoch",
        output_dir="internel_20B_FT"
    ),
    # remove_unused_columns=False,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a InternLM2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
2,3.47
4,2.6788
6,2.6132
8,2.2539
10,1.9924
12,1.747
14,1.7787
16,1.718
18,1.6916
20,1.6826


Checkpoint destination directory internel_20B_FT/checkpoint-98 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=294, training_loss=1.163477728966953, metrics={'train_runtime': 3940.2794, 'train_samples_per_second': 0.371, 'train_steps_per_second': 0.075, 'total_flos': 2.3539863680815104e+16, 'train_loss': 1.163477728966953, 'epoch': 3.0})

In [6]:
new_model = "knowLLM/internel_20B_FT_r128_alpha_256"
# Save trained model
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('knowLLM/internel_20B_FT_r128_alpha_256/tokenizer_config.json',
 'knowLLM/internel_20B_FT_r128_alpha_256/special_tokens_map.json',
 'knowLLM/internel_20B_FT_r128_alpha_256/./tokenizer.model',
 'knowLLM/internel_20B_FT_r128_alpha_256/added_tokens.json',
 'knowLLM/internel_20B_FT_r128_alpha_256/tokenizer.json')

In [7]:
# 训练完成后，可以查看trainer.state.log_history中的损失值
for log in trainer.state.log_history:
    if 'loss' in log:
        print(f"Step {log['step']}: Loss = {log['loss']}")

Step 2: Loss = 3.47
Step 4: Loss = 2.6788
Step 6: Loss = 2.6132
Step 8: Loss = 2.2539
Step 10: Loss = 1.9924
Step 12: Loss = 1.747
Step 14: Loss = 1.7787
Step 16: Loss = 1.718
Step 18: Loss = 1.6916
Step 20: Loss = 1.6826
Step 22: Loss = 1.7628
Step 24: Loss = 1.4948
Step 26: Loss = 1.6369
Step 28: Loss = 1.4175
Step 30: Loss = 1.5117
Step 32: Loss = 1.7931
Step 34: Loss = 1.3572
Step 36: Loss = 1.8822
Step 38: Loss = 1.4807
Step 40: Loss = 1.5111
Step 42: Loss = 1.2653
Step 44: Loss = 1.3173
Step 46: Loss = 1.388
Step 48: Loss = 1.3958
Step 50: Loss = 1.4917
Step 52: Loss = 1.4885
Step 54: Loss = 1.4581
Step 56: Loss = 1.2637
Step 58: Loss = 1.2961
Step 60: Loss = 1.2549
Step 62: Loss = 1.2552
Step 64: Loss = 1.3906
Step 66: Loss = 1.4101
Step 68: Loss = 1.2968
Step 70: Loss = 1.5025
Step 72: Loss = 1.3821
Step 74: Loss = 1.34
Step 76: Loss = 1.4451
Step 78: Loss = 1.2859
Step 80: Loss = 1.384
Step 82: Loss = 1.3613
Step 84: Loss = 1.4069
Step 86: Loss = 1.2854
Step 88: Loss = 1.3271


# Inference

In [1]:
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)
import transformers
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, prepare_model_for_kbit_training
import torch
import time

bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)


# torch.set_default_device("cuda")

model_id = "internlm/internlm2-chat-20b"

tokenizer = AutoTokenizer.from_pretrained(model_id,trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map ='auto',
                                             # quantization_config=bnb_config,
                                             use_cache=True,
                                             trust_remote_code=True
                                            )
model = model.eval()



Loading checkpoint shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [2]:
from datasets import load_dataset

dataset = load_dataset('json', data_files='ft_dataset.json')

# total_input = []
# total_label = []
total_resp = []
p = 0
for this_data in dataset['train']:

    this_input = this_data['conversation'][0]['input']
    this_label = this_data['conversation'][0]['output']
    
    messages = [
    {"role": "user", "content": this_input}
    ]
    input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=False, return_tensors='pt')
    output_ids = model.generate(input_ids.to('cuda'),max_new_tokens=1024)
    response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

    # total_input.append(this_input)
    # total_label.append(this_label)
    total_resp.append(response)
    p+=1
    print(p)

    if len(total_resp) == 310:
        break


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101


KeyboardInterrupt: 

In [6]:
import pickle

with open("test_result.pkl", 'rb') as file:
    result = pickle.load(file)

# result['input'] = total_input
# result['label'] = total_label
result['internlm_original'] = total_resp +  ["_"] * 209


with open("test_result.pkl", 'wb') as file:
    pickle.dump(result, file)

In [5]:
result

Unnamed: 0,input,label,internlm_r32_alpha128,internlm_r128_alpha256,internlm_original
0,How was COVID-19 initially confirmed in the in...,The initial confirmation of COVID-19 was made ...,\nassistant\nThe index patient was confirmed t...,assistant\nThe initial confirmation of COVID-1...,The initial confirmation of COVID-19 in the in...
1,What were the symptoms and progression of the ...,The index patient initially experienced chills...,"\nThe index patient had a dry cough, fever, an...",assistant\nThe index patient was a 68-year-old...,The index patient was a 32-year-old male who p...
2,What were the methods of viral load measuremen...,Viral loads were measured using quantitative r...,\nRT-PCR and qRT-PCR assays were used to measu...,assistant\nViral load was measured using quant...,The index patient was a man who was diagnosed ...
3,What treatment was administered to the index p...,The patient was treated with lopinavir/ritonav...,\nThe index patient was treated with lopinavir...,assistant\nThe patient was treated with lopina...,The index patient was administered a combinati...
4,Did lopinavir/ritonavir administration clearly...,While there was a noticeable decline in viral ...,assistant\nThe paper does not provide a defini...,[UNUSED_TOKEN_4]assistant\nThe study did not p...,"Yes, lopinavir/ritonavir (LPV/r) administratio..."
...,...,...,...,...,...
305,What is the focus of forward supply chain opti...,They focus on delivering medical services and ...,\nassistant\nForward supply chain optimization...,[UNUSED_TOKEN_4]assistant\nForward supply chai...,_
306,What are the research gaps identified in the l...,No research work has been conducted to design ...,\nThe literature review identified gaps in res...,[UNUSED_TOKEN_4]assistant\nThe literature lack...,_
307,What are the characteristics of the reverse lo...,The reverse logistics system involves a much s...,assistant\nThe reverse logistics system for me...,[UNUSED_TOKEN_4]assistant\nThe reverse logisti...,_
308,What constitutes the proposed optimization mod...,The model is a multi-period multi-objective mi...,assistant\nThe model consists of three sub-pro...,[UNUSED_TOKEN_4]assistant\nThe model is a mixe...,_
