<a href="https://colab.research.google.com/github/Tejaswini369-ux/AES-execution/blob/main/Untitled7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Working directories used by the rest of the notebook. Creation is idempotent
# (exist_ok=True), so re-running this cell is harmless.
folders = [
    'audios',
    'normalized_audios',
    'videos',
    'spectrograms',
    'models',
    'embeddings',
    'texture_images',
    'landmarks',
    'source_landmarks',
]
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# Report which of the requested directories are present on disk.
created_folders = list(filter(os.path.exists, folders))
print(created_folders)


['audios', 'normalized_audios', 'videos', 'spectrograms', 'models', 'embeddings', 'texture_images', 'landmarks', 'source_landmarks']


In [None]:
import os

def extract_wav(id, verbose=False, duration=6, sample_rate=16000):
    """
    Extract audio from a video file and save it as a WAV file.

    Reads ``videos/<id>.mp4`` and writes ``audios/<id>.wav`` as mono,
    16-bit PCM. Nothing is done when the WAV file already exists.

    Parameters:
    - id: The ID of the video file without the extension.
    - verbose: Whether to print verbose messages.
    - duration: Duration of the audio to extract in seconds.
    - sample_rate: Sample rate for the audio.

    Returns:
    - 1 when ffmpeg did not produce the WAV file (including the case where
      the ffmpeg executable cannot be started); None otherwise.
    """
    # Local import keeps this cell runnable on its own.
    import subprocess

    if verbose:
        print("-----------extracting audio-------------")

    wavfile = id + ".wav"
    videos_path = "videos/"
    audios_path = "audios/"
    wav_target = audios_path + wavfile

    if os.path.isfile(wav_target):
        if verbose:
            print(f"skipping audio extraction for {id}")
        return None

    # An argument list (no shell) means ids containing spaces or shell
    # metacharacters are passed to ffmpeg verbatim instead of being
    # interpreted. The ffmpeg argument order is the same as before.
    command = [
        "ffmpeg", "-nostats", "-loglevel", "0",
        "-t", str(duration),
        "-stream_loop", "-1",
        "-i", f"{videos_path}{id}.mp4",
        "-vn", "-ac", "1",
        "-ar", str(sample_rate),
        "-acodec", "pcm_s16le",
        wav_target,
    ]
    try:
        subprocess.run(
            command,
            stdin=subprocess.DEVNULL,   # never block waiting for keyboard input
            stdout=subprocess.DEVNULL,
            check=False,                # success is judged by the output file below
        )
    except OSError:
        # ffmpeg is missing or not executable; handled as a failed extraction.
        pass

    if not os.path.isfile(wav_target):
        if verbose:
            print("----------------ffmpeg can't extract audio so deleting --------------")
        return 1
    return None

In [None]:
# Notebook setup: ffmpeg-python provides the `ffmpeg` module imported by the
# normalization cell, and ffmpeg-normalize provides the `ffmpeg-normalize`
# command that normalize_audio runs.
!pip install ffmpeg-python ffmpeg-normalize

Collecting ffmpeg-normalize
  Using cached ffmpeg_normalize-1.32.2-py2.py3-none-any.whl.metadata (32 kB)
Using cached ffmpeg_normalize-1.32.2-py2.py3-none-any.whl (35 kB)
Installing collected packages: ffmpeg-normalize
Successfully installed ffmpeg-normalize-1.32.2


In [None]:
import os
import ffmpeg
import subprocess

def normalize_audio(id):
    """
    Re-encode ``audios/<id>.wav`` and normalize its level.

    The result is written to ``normalized_audios/<id>.wav`` (the directory is
    created when missing). The file is first converted with ffmpeg-python and
    then normalized in place by the ``ffmpeg-normalize`` command.

    Parameters:
    - id: File name of the audio clip without the ``.wav`` extension.

    Returns:
    - Path of the normalized WAV file.

    Raises:
    - subprocess.CalledProcessError: ``ffmpeg-normalize`` exited with a
      non-zero status.
    """
    target_dir = 'normalized_audios'
    os.makedirs(target_dir, exist_ok=True)  # make sure the destination exists

    source_wav = os.path.join('audios', f'{id}.wav')
    output_path = os.path.join(target_dir, f'{id}.wav')

    # Step 1: 16 kHz, single channel, signed 16-bit PCM; ffmpeg output silenced.
    encode_options = dict(ar=16000, acodec='pcm_s16le', ac=1, loglevel='quiet')
    conversion = ffmpeg.input(source_wav).output(output_path, **encode_options)
    conversion.run(overwrite_output=True)

    # Step 2: RMS normalization to a target of -23, writing back onto the same
    # file; -f allows ffmpeg-normalize to overwrite it.
    normalize_command = ['ffmpeg-normalize', output_path]
    normalize_command += ['--normalization-type', 'rms']
    normalize_command += ['-t', '-23']
    normalize_command += ['-o', output_path]
    normalize_command += ['-f']
    subprocess.run(normalize_command, check=True)

    return output_path


In [None]:
import numpy as np

def stretch_audio(waveform, sampling_rate=16000, audio_length=10.26):
    """
    Force a waveform to an exact duration by looping or trimming it.

    A clip shorter than the target is repeated from its start until the
    target length is reached; a longer clip is cut at the target length.

    Parameters:
    - waveform: Sequence of audio samples (indexed along its first axis).
    - sampling_rate: Samples per second of ``waveform``.
    - audio_length: Desired duration in seconds.

    Returns:
    - Array holding exactly ``int(audio_length * sampling_rate)`` samples.

    Raises:
    - ValueError: ``waveform`` is empty, so there is nothing to repeat
      (previously this case looped forever).
    """
    waveform = np.asarray(waveform)
    target_samples = int(audio_length * sampling_rate)
    current_samples = len(waveform)

    if current_samples == 0:
        raise ValueError("stretch_audio: cannot stretch an empty waveform")

    if current_samples < target_samples:
        # Ceil division: number of whole copies needed to cover the target.
        # One concatenate replaces the previous grow-in-a-loop padding.
        copies = -(-target_samples // current_samples)
        waveform = np.concatenate([waveform] * copies, axis=0)

    # Trim to the exact length (also handles inputs that were already longer).
    return waveform[:target_samples]


In [None]:
import torch
# Use the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from transformers import AutoModelForAudioClassification
import torch.nn as nn

# Load base model
# num_labels=4096 replaces the pretrained 527-class head with a 4096-output
# one; ignore_mismatched_sizes=True lets loading continue despite that shape
# mismatch (the resized head starts out newly initialized).
# NOTE(review): the name `ast` is also the name of a standard-library module;
# other cells use this variable, so it is kept as is.
ast = AutoModelForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    num_labels=4096,
    ignore_mismatched_sizes=True
).to(device)

# Modify classifier
# The head is wrapped as Sequential(head, ReLU) BEFORE the checkpoint is
# loaded; presumably the fine-tuned weights were saved with this wrapped
# layout, so the order of these statements matters for the keys to match.
head = ast.classifier
new_head = nn.Sequential(head, nn.ReLU())
ast.classifier = new_head
# NOTE(review): torch.load unpickles the file, which can execute code; only
# load checkpoints that come from a trusted source.
ast_checkpoint = torch.load('models/ast.pt', map_location=device)
# The checkpoint is a dict; its "model_state_dict" entry holds the weights.
ast.load_state_dict(ast_checkpoint["model_state_dict"])

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4096]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4096, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [None]:
# Switch the voice encoder to evaluation mode for inference. eval() returns the
# module itself, so the cell also displays the model structure.
ast.eval()

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=T

In [None]:
import torch.nn as nn


def _dense_bn_relu(n_in, n_out):
    """Build one fully connected stage: Linear -> BatchNorm1d -> ReLU."""
    return nn.Sequential(
        nn.Linear(in_features=n_in, out_features=n_out),
        nn.BatchNorm1d(num_features=n_out),
        nn.ReLU(),
    )


def _deconv_x2(c_in, c_out):
    """Build the 5x5, stride-2 transposed convolution used to double the spatial size."""
    return nn.ConvTranspose2d(
        in_channels=c_in,
        out_channels=c_out,
        kernel_size=5,
        stride=2,
        padding=2,
        output_padding=1,
    )


class FaceDecoder(nn.Module):
    """
    Decode a 4096-dimensional face embedding into landmarks and a texture image.

    A shared fully connected trunk (4096 -> 3072 -> 2048) feeds two heads:

    * landmark head: 2048 -> 1024 -> 512 -> 256 -> 144 values;
    * texture head: 2048 -> 256x14x14, four stride-2 transposed convolutions
      (14 -> 28 -> 56 -> 112 -> 224) and a 1x1 convolution with sigmoid,
      giving a 3-channel image with values in [0, 1].

    Input to the model should be a batch of embeddings of shape (batch, 4096).
    """

    def __init__(self):
        super().__init__()
        # Creation order is kept (trunk, landmarks, texture) so parameter
        # names and initialization order are unchanged.
        self.pre_layers = self.define_pre_layers()
        self.landmark_layers = self.define_landmark_layers()
        self.texture_layers = self.define_texture_layers()

    def define_pre_layers(self):
        """Return the shared trunk as a ModuleList of two dense stages."""
        widths = (4096, 3072, 2048)
        return nn.ModuleList(
            [_dense_bn_relu(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )

    def define_landmark_layers(self):
        """Return the landmark head: three dense stages and a final plain Linear."""
        widths = (2048, 1024, 512, 256)
        stages = [_dense_bn_relu(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        # Output stage has no normalization or activation.
        stages.append(nn.Sequential(nn.Linear(in_features=256, out_features=144)))
        return nn.ModuleList(stages)

    def define_texture_layers(self):
        """Return the texture head: projection, four upsampling stages, RGB output."""
        stages = [
            # Project the trunk features to a flat 256x14x14 feature map.
            nn.Sequential(
                nn.Linear(in_features=2048, out_features=256 * 14 * 14),
                nn.ReLU(),
            ),
            # First upsampling stage also reshapes the flat vector to (256, 14, 14).
            nn.Sequential(
                nn.Unflatten(dim=1, unflattened_size=(256, 14, 14)),
                _deconv_x2(256, 128),
                nn.ReLU(),
            ),
        ]
        for c_in, c_out in ((128, 64), (64, 32), (32, 32)):
            stages.append(nn.Sequential(_deconv_x2(c_in, c_out), nn.ReLU()))
        # 1x1 convolution to three channels, squashed into [0, 1].
        stages.append(
            nn.Sequential(
                nn.Conv2d(in_channels=32, out_channels=3, kernel_size=1, stride=1),
                nn.Sigmoid(),
            )
        )
        return nn.ModuleList(stages)

    def forward(self, x):
        """Return ``(landmarks, texture)`` predicted from the embedding batch ``x``."""
        shared = self.get_predifined_layer_activation(x)

        landmarks = shared
        for stage in self.landmark_layers:
            landmarks = stage(landmarks)

        texture = shared
        for stage in self.texture_layers:
            texture = stage(texture)

        return landmarks, texture

    def get_predifined_layer_activation(self, x):
        """Return the activation of the shared trunk (output of ``pre_layers``) for ``x``."""
        for stage in self.pre_layers:
            x = stage(x)
        return x

In [None]:
# Build the decoder on the selected device, then restore its trained weights.
face_decoder = FaceDecoder().to(device)
# NOTE(review): torch.load unpickles the file, which can execute code; only
# load checkpoints that come from a trusted source.
face_decoder_checkpoint = torch.load('models/face_decoder.pt', map_location=device)
# The checkpoint is a dict; its "model_state_dict" entry holds the weights.
face_decoder.load_state_dict(face_decoder_checkpoint["model_state_dict"])

<All keys matched successfully>

In [None]:
from transformers import ASTFeatureExtractor
import librosa
import torch

# Feature extractor for 16 kHz audio with explicit normalization statistics.
# NOTE(review): mean/std are presumably statistics of the training data — confirm their origin.
feature_extractor = ASTFeatureExtractor(sampling_rate=16000,mean=-5.460994,std=3.1129124)
# Clip to process. NOTE(review): `id` shadows the Python built-in of the same name.
id = "5ablueV_1tw"
# Reads audios/<id>.wav and writes normalized_audios/<id>.wav.
# NOTE(review): no visible cell calls extract_wav, so audios/<id>.wav presumably has to exist already.
normalize_audio(id)
# Load at most 10.26 s of the normalized clip as 16 kHz mono.
waveform, _ = librosa.load(f"normalized_audios/{id}.wav",duration=10.26, sr=16000, mono=True)
# Repeat or trim the clip to stretch_audio's default length (10.26 s at 16 kHz).
waveform = stretch_audio(waveform)
# Same device selection as in the model-loading cell (re-evaluated here).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Features are produced as NumPy arrays, then turned into a tensor on the model's device.
inputs = feature_extractor(waveform, sampling_rate = 16000, padding = "max_length", return_tensors = "np")
input_values = torch.tensor(inputs.input_values).to(device)

In [None]:
# Switch the decoder to evaluation mode before inference so its BatchNorm1d
# layers use their stored running statistics. eval() returns the module itself,
# so the cell also displays the decoder structure.
face_decoder.eval()

FaceDecoder(
  (pre_layers): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=4096, out_features=3072, bias=True)
      (1): BatchNorm1d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (1): Sequential(
      (0): Linear(in_features=3072, out_features=2048, bias=True)
      (1): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (landmark_layers): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=2048, out_features=1024, bias=True)
      (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (1): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (2): Sequential(
      (0): Linear(in_features=512, out_features=256, bias=True)
      (1): BatchNorm1d(

In [None]:
from PIL import Image

# Inference only: no gradients are needed.
with torch.no_grad():
    # The classifier output (logits) of the voice encoder is used as the
    # 4096-dimensional embedding fed to the face decoder.
    voice_encoder_embeddings = ast(input_values).logits
    print(voice_encoder_embeddings.shape)

    landmarks_predicted, images_predicted = face_decoder(voice_encoder_embeddings)
    # Print only the shape: dumping the full image tensor floods the cell
    # output without being readable.
    print(images_predicted.shape)

    # Take the first (only) image of the batch explicitly instead of a bare
    # squeeze(), then reorder channels-first (C, H, W) to (H, W, C) for PIL.
    texture_img = images_predicted[0].cpu().permute(1, 2, 0).numpy()
    # Decoder output is sigmoid-activated, i.e. in [0, 1]; scale to 8-bit.
    texture_img = (texture_img * 255).astype(np.uint8)
    Image.fromarray(texture_img).save(f'texture_images/{id}.jpg')


torch.Size([1, 4096])
tensor([[[[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.2209, 0.2038, 0.1955,  ..., 0.8359, 0.9365, 0.9679],
          [0.2444, 0.2074, 0.1990,  ..., 0.7337, 0.7589, 0.9165],
          [0.2578, 0.2225, 0.2169,  ..., 0.6665, 0.6689, 0.7414]],

         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.2247, 0.2078, 0.2091,  ..., 0.8245, 0.9210, 0.9612],
          [0.2389, 0.2164, 0.2161,  ..., 0.7231, 0.7386, 0.8802],
          [0.2391, 0.2186, 0.2245,  ..., 0.6287, 0.6182, 0.6470]],

         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1