In [31]:
import torch
import faiss
import numpy as np
import librosa
import soundfile as sf
import torch.nn as nn
from pydub import AudioSegment

# Example model definition - Replace with your actual model's architecture
class VoiceSynthesizer(nn.Module):
    def __init__(self):
        super(VoiceSynthesizer, self).__init__()
        self.fc = nn.Linear(256, 512)  # Example layer
        self.output = nn.Linear(512, 22050)  # Example output layer

    def forward(self, x):
        x = self.fc(x)
        x = self.output(x)
        return x

# Load model
def load_model(model_path):
    # Load the full model checkpoint, including metadata
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    
    # Extract weights (assuming model weights are in 'weight' or similar)
    if 'weight' in checkpoint:
        state_dict = checkpoint['weight']
    else:
        state_dict = checkpoint  # Use checkpoint directly if it contains weights
    
    # Define the model architecture
    model = VoiceSynthesizer()

    # Load the state_dict into the model (ignoring unexpected keys)
    model.load_state_dict(state_dict, strict=False)  # Set strict=False to ignore missing or unexpected keys
    model.eval()  # Set model to evaluation mode
    print("Model loaded and set to evaluation mode!")
    return model

# Load FAISS index
def load_faiss_index(index_path):
    index = faiss.read_index(index_path)
    print("FAISS index loaded successfully!")
    return index

# Generate voice embeddings (example input-to-vector function)
def text_to_embedding(text, index):
    expected_dim = 256  # Get the dimensionality of the FAISS index (adjusted for model input)
    print(f"Expected vector dimension: {expected_dim}")
    
    # Convert text to a list of ASCII values
    tokens = np.array([ord(c) for c in text], dtype=np.float32)
    
    # Optionally, pad or truncate the tokens to match the expected dimension
    if len(tokens) < expected_dim:
        # If the token vector is shorter than the required dimension, pad with zeros
        tokens = np.pad(tokens, (0, expected_dim - len(tokens)), mode='constant')
    elif len(tokens) > expected_dim:
        # If the token vector is longer than the required dimension, truncate
        tokens = tokens[:expected_dim]
    
    print(f"Generated embedding shape: {tokens.shape}")
    return tokens

# Resize embedding to match model input size
def resize_embedding(embedding, target_dim):
    embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
    # Reshaping the embedding to match the target dimension of 256
    if embedding_tensor.shape[0] != target_dim:
        embedding_tensor = embedding_tensor[:target_dim]  # Truncate if needed
    embedding_tensor = embedding_tensor.view(1, -1)  # Add batch dimension
    return embedding_tensor

# Synthesize voice from embeddings
def synthesize_voice(model, embedding):
    embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
    with torch.no_grad():
        audio = model(embedding_tensor)
    # Ensure audio is in the correct shape (1D array for mono audio)
    audio = audio.squeeze().cpu().numpy()  # Remove extra dimensions and move to CPU
    print(f"Synthesized audio shape: {audio.shape}")
    return audio

# Save synthesized audio
def save_audio(audio, output_path, sample_rate=22050):
    # Check if the audio is 1D (mono) or 2D (stereo)
    if audio.ndim == 1:
        audio = audio.reshape(1, -1)  # Reshape to 2D if mono
    elif audio.ndim == 2 and audio.shape[0] == 1:
        audio = audio.reshape(1, -1)  # In case the audio is 1 channel, reshape for soundfile

    # Save as wav file
    wav_path = output_path.replace(".mp3", ".wav")
    sf.write(wav_path, audio.T, samplerate=sample_rate)  # Transpose if it's 2D (channels x samples)
    print(f"Audio saved to {wav_path}")
    
    # If you want to convert to MP3
    audio_segment = AudioSegment.from_wav(wav_path)
    audio_segment.export(output_path, format="mp3")
    print(f"Audio converted and saved to {output_path}")

# Main script
def main():
    # File paths
    model_path = "Voice/chloe_price_e440_s27280.pth"
    index_path = "Voice/added_IVF1629_Flat_nprobe_1_chloe_price_v2.index"
    output_audio_path = "synthesized_voice.mp3"
    
    # Load model and FAISS index
    model = load_model(model_path)
    index = load_faiss_index(index_path)
    
    # Input text
    input_text = "Hello, this is Chloe's voice."
    embedding = text_to_embedding(input_text, index)
    
    if embedding is not None:
         synthesized_audio = synthesize_voice(model, embedding)
         output_audio_path = "synthesized_voice.mp3"  # Final mp3 output path
         save_audio(synthesized_audio, output_audio_path)

if __name__ == "__main__":
    main()


Model loaded and set to evaluation mode!
FAISS index loaded successfully!
Expected vector dimension: 256
Generated embedding shape: (256,)
Synthesized audio shape: (22050,)
Audio saved to synthesized_voice.wav


FileNotFoundError: [WinError 2] The system cannot find the file specified