In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
def print_nparams(model):
    """Print how many parameters a model holds in total.

    Args:
        model: Any object exposing ``parameters()``, an iterable of items
            that each provide ``numel()`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. The count is written to standard output only.
    """
    total = 0
    # Accumulate the element count of every parameter yielded by the model.
    for param in model.parameters():
        total += param.numel()
    print(f"The total number of parameters is: {total}")

In [None]:
# Local checkpoint directory of the base model used throughout the next cells.
model_name = "/root/share/new_models/Shanghai_AI_Laboratory/internlm2_5-1_8b"

# Load the tokenizer (the checkpoint ships custom code, hence trust_remote_code).
tokenizer_20b = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the model first, then move it to the GPU as a separate step.
model_20b = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model_20b = model_20b.to("cuda")

In [None]:
# Print the loaded model's repr to inspect its structure before any layers are removed.
print(model_20b)

In [None]:
# Report the parameter count of the full model, before it is down-scaled below.
print_nparams(model_20b)

In [None]:
# Optional cleanup, left disabled because the cells below still use model_20b:
# import gc
# del model_20b
# gc.collect()

# 🧑‍💻 Down-scale your model

In [None]:
from transformers import AutoTokenizer, AutoConfig

# Keep a handle on the complete decoder stack; a later cell slices it again.
layers = model_20b.model.layers

# Shrink the model in place to the first four plus the last four blocks.
model_20b.model.layers = layers[:4] + layers[-4:]

# Load the checkpoint's config, overriding its depth with the truncated length.
config = AutoConfig.from_pretrained(
    model_name,
    trust_remote_code=True,
    num_hidden_layers=len(model_20b.model.layers),
)

In [None]:

# Attach the reduced-depth config built in the previous cell to the model.
model_20b.config = config
# Recorded from an earlier run: 1008248832 parameters, i.e. ~1.0B (not 0.1B).
# NOTE(review): that figure may stem from a different layer selection — re-run to confirm.
print_nparams(model_20b)  # 1008248832 => ~1.0B

In [None]:
from copy import deepcopy

# Rebuild the decoder stack from independent copies of the first five and the
# last five original blocks, so the down-scaled model owns its own parameters
# instead of sharing them with the modules still referenced by `layers`.
model_20b.model.layers = deepcopy(layers[:5]) + deepcopy(layers[-5:])

# Give the model private copies of its token embedding and output head too.
model_20b.model.tok_embeddings = deepcopy(model_20b.model.tok_embeddings)
model_20b.output = deepcopy(model_20b.output)

# The config attached earlier was built for a different stack size. Re-sync its
# depth with the stack installed above; otherwise the printed (and later saved)
# config would describe fewer layers than the weights actually contain.
# NOTE(review): if the remote modeling code tracks a per-layer index for its KV
# cache, the kept layers may also need renumbering — confirm against that code.
model_20b.config.num_hidden_layers = len(model_20b.model.layers)
print(model_20b.config)

In [None]:
from transformers import TextStreamer

# Run a simple inference pass to see how the down-scaled model behaves
# before it has been trained any further.
prompt = "你好，我是"

# Tokenize into PyTorch tensors, then move them onto the model's device.
inputs = tokenizer_20b(prompt, return_tensors="pt")
inputs = inputs.to(model_20b.device)



In [None]:
# Streamer that prints decoded text as it is generated; the prompt itself and
# special tokens are left out of the printed text.
streamer = TextStreamer(tokenizer_20b, skip_prompt=True, skip_special_tokens=True)

In [None]:
# Generate up to 64 new tokens with sampling disabled, streaming text as it is
# produced and reusing the key/value cache between decoding steps.
generation_kwargs = dict(
    streamer=streamer,
    use_cache=True,
    max_new_tokens=64,
    do_sample=False,
)
outputs = model_20b.generate(**inputs, **generation_kwargs)

In [None]:
import os

# Single output directory shared by the model and the tokenizer.
save_dir = './data/InternLM-community'
os.makedirs(save_dir, exist_ok=True)

# Write the down-scaled model and its tokenizer into that directory.
model_20b.save_pretrained(save_dir)
tokenizer_20b.save_pretrained(save_dir)

In [None]:
import gc

# Drop every remaining reference to the old weights before the next model is
# loaded. `layers` must go as well: it still points at the complete original
# decoder stack, so deleting only `model_20b` would keep those layers alive.
del model_20b, layers
gc.collect()

# Hand the now-unused cached CUDA blocks back to the driver so the next load
# starts with as much free GPU memory as possible.
torch.cuda.empty_cache()

In [None]:
# Checkpoint directory under the xtuner work dir, loaded for comparison.
model_name = "/root/code/xtuner/work_dirs/wanjuan_hf"

# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the model first, then move it to the GPU as a separate step.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model = model.to("cuda")

In [None]:
# Report the parameter count of the checkpoint loaded from the xtuner work directory.
print_nparams(model)

In [None]:
from transformers import TextStreamer

# Run a simple inference pass against the reloaded checkpoint.
prompt = "Hello"

# Tokenize into PyTorch tensors, then move them onto the model's device.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)

# Print decoded text as it is generated, omitting the prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Generate up to 64 new tokens with sampling disabled.
outputs = model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=64,
    use_cache=True,
    streamer=streamer,
)