In [5]:
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

In [6]:
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [7]:
# 2) 디바이스 설정: CUDA가 사용 가능하면 GPU('cuda')를, 그렇지 않으면 CPU('cpu')를 사용
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"
print(f"Using device: {device}")  # 선택된 디바이스(예: cuda 또는 cpu)를 콘솔에 출력하여 확인

Using device: cuda


In [21]:
BASE_PATH = "/home/j-i13a103/tts/run/training"

# xtts_config 경로 설정
CONFIG_PATH = BASE_PATH + "/GPT_XTTS_v2.0_SSOKDAK_FT-August-04-2025_08+49AM-0000000/config.json"

# 학습된 모델의 vocab.json 설정
TOKENIZER_PATH = BASE_PATH + "/XTTS_v2.0_original_model_files/vocab.json"

# 양자화 모델 가중치 가져오기
NEW_MODEL = "/home/j-i13a103/tts/finetuning-result/model/xtts_quantized_state_dict_v2.pth"

# 레퍼런스 넣기
SPEAKER_REFERENCE = "/home/j-i13a103/tts/korean-single-speaker-datasets/wavs/1_0000.wav"



In [22]:
print("Loading model...")

config = XttsConfig()
config.load_json(CONFIG_PATH)
model = Xtts.init_from_config(config)
model_state_dict = torch.load(NEW_MODEL)
model.load_state_dict(state_dict=model_state_dict, strict=False)
# model.load_checkpoint(config, checkpoint_path=NEW_MODEL, vocab_path=TOKENIZER_PATH, use_deepspeed=False)
model.to(device)
# model.eval()

Loading model...


Xtts(
  (gpt): GPT(
    (conditioning_encoder): ConditioningEncoder(
      (init): Conv1d(80, 1024, kernel_size=(1,), stride=(1,))
      (attn): Sequential(
        (0): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttentionLegacy()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (1): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttentionLegacy()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (2): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttentionLegacy()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(

In [26]:
torch.save({
    'model': model.state_dict(),
    'config': config,
}, '/home/j-i13a103/tts/finetuning-result/model/xtts_quantized_model.pth')

In [27]:

print("양자화 모델 :", os.path.getsize("/home/j-i13a103/tts/finetuning-result/model/xtts_quantized_model.pth")/1e9, "GB")

양자화 모델 : 1.868022894 GB


In [28]:
QUANTIZED_MODEL = "/home/j-i13a103/tts/finetuning-result/model/xtts_quantized_model.pth"

quantized_model = Xtts.init_from_config(config)
quantized_model.load_checkpoint(config, checkpoint_path=QUANTIZED_MODEL, vocab_path=TOKENIZER_PATH, use_deepspeed=False)


In [29]:
quantized_model.to(device)

Xtts(
  (gpt): GPT(
    (conditioning_encoder): ConditioningEncoder(
      (init): Conv1d(80, 1024, kernel_size=(1,), stride=(1,))
      (attn): Sequential(
        (0): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttentionLegacy()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (1): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttentionLegacy()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (2): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttentionLegacy()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(

In [None]:
print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = quantized_model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])

Computing speaker latents...


In [31]:
OUTPUT_WAV_PATH = "/home/j-i13a103/tts/finetuning-result/output_quantized_v2.wav"

print("Inference...")
out = quantized_model.inference(
    "안녕, 만나서 반가워. 오늘 하루 어땠어?",
    "ko",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.7,
)

torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)

Inference...
