In [25]:
from scipy.signal import butter, filtfilt
import tensorflow as tf
import tensorflow.compat.v1 as tf1
tf1.disable_v2_behavior()  # We need this for inception model compatibility
import librosa
import numpy as np
import soundfile as sf
import os   
import matplotlib.pyplot as plt
from io import BytesIO
import requests, zipfile
import PIL.Image
from IPython.display import display, Image


In [26]:
def apply_low_pass_filter(audio, sample_rate, cutoff_frequency=8000, order=5):
    """
    Applies a low-pass Butterworth filter to smooth the audio signal.
    """
    nyquist = 0.5 * sample_rate
    normalized_cutoff = cutoff_frequency / nyquist
    b, a = butter(order, normalized_cutoff, btype='low', analog=False)
    smoothed_audio = filtfilt(b, a, audio)
    return smoothed_audio

# Define paths
content_audio_path = "../audio/dont.mp3"
style_audio_path = "../audio/fade.mp3"
output_audio_path = "../audio/outdont2.wav"


In [27]:
# Constants
N_FFT = 2048  # Changed to match inception input requirements better
N_FILTERS = 4096
ALPHA = 1e-3  # Content loss weight
BETA = 1.0    # Style loss weight
learning_rate = 1e-4
iterations = 100

In [28]:
def download_inception_model():
    url = "https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip"
    local_zip_file = "inception5h.zip"

    if not os.path.exists('tensorflow_inception_graph.pb'):
        print("Downloading Inception model...")
        response = requests.get(url)
        with open(local_zip_file, 'wb') as f:
            f.write(response.content)

        with zipfile.ZipFile(local_zip_file, 'r') as zip_ref:
            zip_ref.extractall()

        os.remove(local_zip_file)
        print("Inception model downloaded and extracted.")

def load_inception_model():
    model_fn = 'tensorflow_inception_graph.pb'
    graph = tf1.Graph()
    sess = tf1.InteractiveSession(graph=graph)
    with tf.io.gfile.GFile(model_fn, 'rb') as f:
        graph_def = tf1.GraphDef()
        graph_def.ParseFromString(f.read())
    t_input = tf1.placeholder(np.float32, name='input')
    imagenet_mean = 117.0
    t_preprocessed = tf1.expand_dims(t_input-imagenet_mean, 0)
    tf1.import_graph_def(graph_def, {'input': t_preprocessed})
    return graph, sess, t_input

In [29]:
def T(layer):
    '''Helper for getting layer output tensor'''
    return graph.get_tensor_by_name(f"import/{layer}:0")

def read_audio_spectrum(filename):
    """Read and process audio file into spectrum"""
    x, fs = librosa.load(filename, sr=None)
    print(f"Sampling rate: {fs}")

    # Compute STFT
    S = librosa.stft(x, n_fft=N_FFT)
    p = np.angle(S)

    # Take log of magnitude
    S = np.log1p(np.abs(S[:,:430]))
    
    # Normalize to [0, 1] range
    S = S / np.max(S)
    
    # Create 3-channel representation (RGB-like)
    S_3channel = np.stack([S] * 3, axis=-1)  # Shape becomes [height, width, 3]
    
    return S_3channel, fs, p, x

In [30]:
def build_model(x):
    """Extract features using inception model"""
    if isinstance(x, np.ndarray):
        x = tf1.convert_to_tensor(x, dtype=tf1.float32)
    
    # Define layer
    content_layer = 'mixed3b_1x1_pre_relu'
    
    # Get the tensor for the layer
    layer_tensor = T(content_layer)
    
    with graph.as_default():
        if isinstance(x, tf.Tensor):
            features = sess.run(layer_tensor, {t_input: sess.run(x)})
        else:
            features = sess.run(layer_tensor, {t_input: x})
    
    return features

def compute_gram_matrix(features):
    """Compute gram matrix from features"""
    # Get shape information
    shape = tf1.shape(features)
    
    # Reshape features
    features_reshaped = tf1.reshape(features, (-1, tf1.shape(features)[-1]))
    # Compute gram matrix
    gram = tf1.matmul(tf1.transpose(features_reshaped), features_reshaped)
    # Normalize
    return gram / tf1.cast(tf1.shape(features_reshaped)[0], tf.float32)

def compute_content_loss(content_features, gen_features):
    """Compute content loss"""
    return ALPHA * tf1.reduce_mean(tf1.square(gen_features - content_features))

def compute_style_loss(style_gram, gen_features):
    """Compute style loss"""
    gen_gram = compute_gram_matrix(gen_features)
    return BETA * tf1.reduce_mean(tf1.square(gen_gram - style_gram))

def train_step(x_gen, content_features, style_gram):
    with graph.as_default():
        # Create the forward pass operations in the graph
        gen_features_tensor = T('mixed3b_1x1_pre_relu')
        
        # Compute features and losses within the graph, not with sess.run
        gen_features = gen_features_tensor
        
        # Compute losses (all operations should be in the graph)
        content_loss = ALPHA * tf1.reduce_mean(tf1.square(gen_features - content_features))
        
        # Compute gram matrix and style loss
        shape = tf1.shape(gen_features)
        features_reshaped = tf1.reshape(gen_features, (-1, shape[-1]))
        gram = tf1.matmul(tf1.transpose(features_reshaped), features_reshaped)
        gram = gram / tf1.cast(tf1.shape(features_reshaped)[0], tf.float32)
        
        style_loss = BETA * tf1.reduce_mean(tf1.square(gram - style_gram))
        
        # Total loss
        total_loss = content_loss + style_loss
        
        # Compute gradients
        gradients = tf1.gradients(total_loss, [x_gen])[0]
        
        if gradients is None:
            raise ValueError("Gradients are None. Check the computation graph connections.")
        
        # Create update operation
        update_op = x_gen.assign_sub(learning_rate * gradients)
        
        # Run the update and get loss values
        _, total_loss_val, content_loss_val, style_loss_val = sess.run(
            [update_op, total_loss, content_loss, style_loss],
            feed_dict={t_input: sess.run(x_gen)}  # Provide the current value of x_gen
        )
        
        return total_loss_val, content_loss_val, style_loss_val


# Main execution


In [22]:
def plot_spectrograms(content_spec, style_spec, output_spec):
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

    ax1.imshow(content_spec, aspect='auto')
    ax1.set_title('Content Audio')

    ax2.imshow(style_spec, aspect='auto')
    ax2.set_title('Style Audio')

    ax3.imshow(output_spec, aspect='auto')
    ax3.set_title('Output Audio')

    plt.tight_layout()
    plt.show()

In [31]:
# Download and load inception model
download_inception_model()
graph, sess, t_input = load_inception_model()

ERROR:tensorflow:An interactive session is already active. This can cause out-of-memory errors or some other unexpected errors (due to the unpredictable timing of garbage collection) in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s). Please use `tf.Session()` if you intend to productionize.


In [32]:
# Load and process audio files
print("Processing content audio...")
content_spectrum, content_fs, content_phase, content_audio = read_audio_spectrum(content_audio_path)
print("Processing style audio...")
style_spectrum, style_fs, _, _ = read_audio_spectrum(style_audio_path)

# Adjust style spectrum to match content dimensions
N_SAMPLES = content_spectrum.shape[1]
N_CHANNELS = content_spectrum.shape[0]
style_spectrum = style_spectrum[:N_CHANNELS, :N_SAMPLES]

    
# Main execution
with graph.as_default():
    # Prepare inputs (make sure these are numpy arrays)
    content_tf = np.expand_dims(content_spectrum, 0).astype(np.float32)
    style_tf = np.expand_dims(style_spectrum, 0).astype(np.float32)
    
    # Extract initial features
    print("Extracting features...")
    content_features = build_model(content_tf)
    style_features = build_model(style_tf)
    
    # Compute style gram matrix
    style_features_reshaped = np.reshape(style_features, (-1, style_features.shape[-1]))
    style_gram = np.matmul(style_features_reshaped.T, style_features_reshaped) / style_features_reshaped.shape[0]
    
    # Convert to TF tensors
    content_features = tf1.convert_to_tensor(content_features)
    style_gram = tf1.convert_to_tensor(style_gram)
    
    # Initialize generated audio
    x_gen = tf1.get_variable('generated', 
        shape=[1, content_tf.shape[1], content_tf.shape[2], 3],
        initializer=tf1.random_normal_initializer(stddev=1e-3))
    
    # Initialize all variables
    sess.run(tf1.global_variables_initializer())
    
    # Training loop
    print("Starting style transfer...")
    for i in range(iterations):
        try:
            total_loss, content_loss, style_loss = train_step(x_gen, content_features, style_gram)
            
            if (i + 1) % 10 == 0:
                print(f'Iteration {i + 1}, '
                      f'Total Loss: {total_loss:.4f}, '
                      f'Content Loss: {content_loss:.4f}, '
                      f'Style Loss: {style_loss:.4f}')
        except Exception as e:
            print(f"Error in iteration {i}: {str(e)}")
            break

# Process result
print("Processing final result...")

result = sess.run(x_gen)

a = np.zeros_like(content_spectrum)
a[:N_CHANNELS,:] = np.exp(result[0,0].T) - 1

# Phase reconstruction
print("Performing phase reconstruction...")
p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
for i in range(500):
    S = a * np.exp(1j*p)
    x = librosa.istft(S, n_fft=N_FFT)
    p = np.angle(librosa.stft(x, n_fft=N_FFT))
    if i % 100 == 0:
        print(f"Phase reconstruction iteration {i}")

# Apply low-pass filter
print("Applying final processing...")
x_rec_smoothed = apply_low_pass_filter(x, content_fs)

# Save result
sf.write(output_audio_path, x_rec_smoothed, content_fs)
print(f"Output audio saved to {output_audio_path}")

# Plot spectrograms
print("Generating spectrograms...")
output_spectrum, _ = read_audio_spectrum(output_audio_path)
plot_spectrograms(content_spectrum, style_spectrum, output_spectrum)

# Display audio players
from IPython.display import Audio
print("\nContent Audio:")
display(Audio(content_audio_path))
print("\nStyle Audio:")
display(Audio(style_audio_path))
print("\nStyled Output:")
display(Audio(output_audio_path))

Processing content audio...
Sampling rate: 44100
Processing style audio...
Sampling rate: 44100
Extracting features...
Starting style transfer...
Error in iteration 0: Gradients are None. Check the computation graph connections.
Processing final result...


ValueError: could not broadcast input array from shape (3,430) into shape (1025,430,3)