In [2]:
import tensorflow as tf
import librosa
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt

class NeuralAudioStyleTransfer:
    def __init__(self):
        # Hyperparameters
        self.n_fft = 2048
        self.hop_length = 512
        self.learning_rate = 0.02
        self.style_weight = 1e-2
        self.content_weight = 1e4
        self.iterations = 100
        
        
        # Style and content layer weights
        self.style_layers = ['block1_conv1', 'block2_conv1', 
                           'block3_conv1', 'block4_conv1', 'block5_conv1']
        self.content_layers = ['block4_conv2']
        
        self.style_weight_per_layer = 1.0 / len(self.style_layers)

        # Initialize VGG19 model
        self.model = self.build_feature_extractor()
        
    def build_feature_extractor(self):
        """Create feature extractor model using VGG19"""
        vgg = tf.keras.applications.VGG19(include_top=False, weights='imagenet')
        vgg.trainable = False
        
        style_outputs = [vgg.get_layer(name).output for name in self.style_layers]
        content_outputs = [vgg.get_layer(name).output for name in self.content_layers]
        
        return tf.keras.Model(vgg.input, style_outputs + content_outputs)
    
    def process_audio(self, file_path):
        """Load and process audio to spectrogram"""
        # Load audio
        audio, sr = librosa.load(file_path, sr=None)
        
        # Compute spectrogram
        stft = librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length)
        spectrogram = np.abs(stft)
        phase = np.angle(stft)
        
        # Convert to dB scale
        spectrogram_db = librosa.amplitude_to_db(spectrogram)
        
        return spectrogram_db, phase, sr
    
    def prepare_spectrogram_for_vgg(self, spectrogram):
        """Convert spectrogram to VGG19 input format"""
        # Normalize to [0, 1]
        spec_norm = (spectrogram - np.min(spectrogram)) / (np.max(spectrogram) - np.min(spectrogram))
        
        # Convert to RGB by repeating channels
        spec_rgb = np.stack([spec_norm] * 3, axis=-1)
        
        # Resize to VGG19 input size
        spec_rgb = tf.image.resize(spec_rgb, (224, 224))
        
        # Add batch dimension and preprocess
        spec_rgb = tf.keras.applications.vgg19.preprocess_input(spec_rgb[np.newaxis, ...])
        
        return spec_rgb
    
    def compute_gram_matrix(self, feature_maps):
        """Compute Gram matrix for style features"""
        # Get shape using tf operations
        shape = tf.shape(feature_maps)
        height = shape[1]
        width = shape[2]
        channels = shape[3]
        
        # Reshape and compute gram matrix
        features = tf.reshape(feature_maps, (height * width, channels))
        gram = tf.matmul(features, features, transpose_a=True)
        
        # Normalize
        normalizer = tf.cast(height * width * channels, tf.float32)
        return gram / normalizer

    
    def compute_style_loss(self, style_targets, style_outputs):
        """Compute style loss using Gram matrices"""
        style_loss = tf.zeros(shape=())
        
        for target, output in zip(style_targets, style_outputs):
            gram_target = self.compute_gram_matrix(target)
            gram_output = self.compute_gram_matrix(output)
            layer_loss = tf.reduce_mean(tf.square(gram_output - gram_target))
            style_loss += layer_loss * self.style_weight_per_layer
            
        return style_loss * self.style_weight

    
    def compute_content_loss(self, content_targets, content_outputs):
        """Compute content loss"""
        content_loss = tf.zeros(shape=())
        
        for target, output in zip(content_targets, content_outputs):
            content_loss += tf.reduce_mean(tf.square(output - target))
            
        return content_loss * self.content_weight

    
    @tf.function
    def train_step(self, generated_spectrogram, style_targets, content_targets):
        """Single training step"""
        with tf.GradientTape() as tape:
            # Get features
            outputs = self.model(generated_spectrogram)
            style_outputs = outputs[:len(self.style_layers)]
            content_outputs = outputs[len(self.style_layers):]
            
            # Compute losses
            style_loss = self.compute_style_loss(style_targets, style_outputs)
            content_loss = self.compute_content_loss(content_targets, content_outputs)
            total_loss = style_loss + content_loss
            
        # Compute gradients
        grads = tape.gradient(total_loss, generated_spectrogram)
        
        # Update generated spectrogram using tf.assign
        generated_spectrogram.assign_add(self.learning_rate * grads)
        
        return total_loss, style_loss, content_loss

    
    def transfer_style(self, content_path, style_path, output_path):
        """Main style transfer function"""
        try:
            # Process audio files
            content_spec, content_phase, sr = self.process_audio(content_path)
            style_spec, _, _ = self.process_audio(style_path)
            
            # Store original content spectrogram shape
            original_shape = content_spec.shape
            
            # Prepare spectrograms for VGG19
            content_input = self.prepare_spectrogram_for_vgg(content_spec)
            style_input = self.prepare_spectrogram_for_vgg(style_spec)
            
            # Initialize generated spectrogram with content
            generated_spectrogram = tf.Variable(content_input)
            
            # Extract style features
            style_features = self.model(style_input)
            style_targets = style_features[:len(self.style_layers)]
            
            # Extract content features
            content_features = self.model(content_input)
            content_targets = content_features[len(self.style_layers):]
            
            # Initialize best results
            best_loss = float('inf')
            best_spectrogram = None
            
            # Optimization loop
            for i in range(self.iterations):
                total_loss, style_loss, content_loss = self.train_step(
                    generated_spectrogram, style_targets, content_targets)
                
                # Save best result
                if total_loss < best_loss:
                    best_loss = total_loss
                    best_spectrogram = generated_spectrogram.numpy().copy()
                
                if i % 100 == 0:
                    tf.print('Iteration:', i)
                    tf.print('Total loss:', total_loss)
                    tf.print('Style loss:', style_loss)
                    tf.print('Content loss:', content_loss)
            
            # Use the best result for reconstruction
            generated_spec = best_spectrogram[0]
            
            # Resize back to original dimensions
            generated_spec = tf.image.resize(
                generated_spec,
                (original_shape[0], original_shape[1])
            ).numpy()
            
            # Denormalize and handle potential infinities
            generated_spec = generated_spec[:, :, 0]  # Take first channel
            generated_spec = np.clip(generated_spec, np.min(content_spec), np.max(content_spec))
            
            # Convert to amplitude
            generated_spec = librosa.db_to_amplitude(generated_spec)
            generated_spec = np.clip(generated_spec, 0, np.max(generated_spec))
            
            # Combine with phase
            stft_matrix = generated_spec * np.exp(1j * content_phase)
            
            # Inverse STFT with validation
            try:
                audio_generated = librosa.istft(
                    stft_matrix,
                    hop_length=self.hop_length,
                    win_length=self.n_fft,
                    length=librosa.get_duration(filename=content_path) * sr
                )
            except librosa.ParameterError:
                # If ISTFT fails, try with different parameters
                audio_generated = librosa.istft(
                    stft_matrix,
                    hop_length=self.hop_length,
                    win_length=self.n_fft
                )
            
            # Ensure finite values and normalize
            audio_generated = np.nan_to_num(audio_generated, 0)
            audio_generated = np.clip(audio_generated, -1, 1)
            
            # Apply smoothing
            audio_smoothed = self.apply_smoothing(audio_generated, sr)
            
            # Final normalization and validation
            audio_smoothed = np.nan_to_num(audio_smoothed, 0)
            max_val = np.max(np.abs(audio_smoothed))
            if max_val > 0:
                audio_smoothed = audio_smoothed / max_val
            audio_smoothed = np.clip(audio_smoothed, -1, 1)
            
            # Validate before saving
            if not np.isfinite(audio_smoothed).all():
                raise ValueError("Generated audio contains invalid values")
            
            # Save output
            sf.write(output_path, audio_smoothed, sr)
            print(f"Output saved to {output_path}")
            
            # Compute spectrograms for visualization with validation
            def safe_spectrogram(audio, sr):
                spec = librosa.stft(audio, n_fft=self.n_fft)
                mag = np.abs(spec)
                mag = np.clip(mag, np.finfo(float).eps, None)  # Avoid log of zero
                return librosa.amplitude_to_db(mag)
            
            # Load original audio for visualization
            content_audio, _ = librosa.load(content_path, sr=sr)
            style_audio, _ = librosa.load(style_path, sr=sr)
            
            return (
                safe_spectrogram(content_audio, sr),
                safe_spectrogram(style_audio, sr),
                safe_spectrogram(audio_smoothed, sr)
            )
            
        except Exception as e:
            print(f"Error in style transfer: {str(e)}")
            raise



    
    def apply_smoothing(self, audio, sr, cutoff_freq=8000, order=5):
        """Apply low-pass filter to smooth the audio with validation"""
        # Ensure finite values
        audio = np.nan_to_num(audio, 0)
        
        # Apply filter
        nyquist = 0.5 * sr
        normalized_cutoff = cutoff_freq / nyquist
        b, a = butter(order, normalized_cutoff, btype='low', analog=False)
        
        # Filter and validate
        smoothed = filtfilt(b, a, audio)
        smoothed = np.nan_to_num(smoothed, 0)
        smoothed = np.clip(smoothed, -1, 1)
        
        return smoothed

    
    def plot_spectrograms(self, content_spec, style_spec, generated_spec):
        """Visualize the spectrograms"""
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
        
        ax1.imshow(content_spec)
        ax1.set_title('Content')
        
        ax2.imshow(style_spec)
        ax2.set_title('Style')
        
        ax3.imshow(generated_spec)
        ax3.set_title('Generated')
        
        plt.tight_layout()
        plt.show()

def main():
    # Set file paths
    content_path = "../audio/fade.mp3"
    style_path = "../audio/dont.mp3"
    output_path = "../audio/output_neural_style.wav"
    
    # Create style transfer object
    transfer = NeuralAudioStyleTransfer()
    
    # Perform style transfer
    content_spec, style_spec, generated_spec = transfer.transfer_style(
        content_path, style_path, output_path)
    
    # Plot results
    transfer.plot_spectrograms(content_spec, style_spec, generated_spec)

if __name__ == "__main__":
    main()


Iteration: 0
Total loss: 0.035532508
Style loss: 0.035532508
Content loss: 0
Error in style transfer: 'float' object cannot be interpreted as an integer


	This alias will be removed in version 1.0.
  length=librosa.get_duration(filename=content_path) * sr


TypeError: 'float' object cannot be interpreted as an integer