In [24]:
import pandas as pd
import os
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
from PIL import Image
from transformers import CLIPProcessor,CLIPModel
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import timm
from transformers import AutoImageProcessor, ResNetForImageClassification
from transformers import AutoImageProcessor, ResNetModel, AutoModelForCTC

In [2]:
# Shared sample table: one row per sample id with its class label.
# Every feature extractor in this notebook iterates over this frame.
df = pd.read_csv(
    "/hdd/wmh/Cookie/src_data/final/data.csv"
).loc[:, ['sample', 'label']]

In [3]:
# Use the GPU when one is visible to this process; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
def generate_mel_images(src_path: str, save_path: str):
    """Render every sample's audio as a 224x224 log-mel spectrogram PNG.

    For each row of the module-level ``df`` this loads ``{src_path}/{sample}.mp3``,
    computes a mel spectrogram, converts it to dB relative to its maximum, plots it
    with ``librosa.display.specshow`` and writes the resized plot to
    ``{save_path}/{sample}.png``.

    Args:
        src_path: Directory containing the ``<sample>.mp3`` files.
        save_path: Directory that receives the ``<sample>.png`` images; created if missing.

    Raises:
        IOError: If the rendered spectrogram cannot be read back for resizing.
    """
    # Imported locally: the notebook's top import cell provides none of these, and
    # `librosa` is only imported in a later cell.
    import tempfile

    import cv2
    import librosa
    import librosa.display

    os.makedirs(save_path, exist_ok=True)

    # The intermediate plot lives in a throw-away directory instead of the current
    # working directory, so nothing is left behind and parallel runs cannot collide.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_png = os.path.join(tmp_dir, 'mel_spectrogram.png')

        for _, row in df.iterrows():
            sample = row['sample']

            # Load the audio.
            audio_file = f"{src_path}/{sample}.mp3"
            y, sr = librosa.load(audio_file)

            # Mel spectrogram, then convert power to a dB scale.
            mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
            log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

            # Plot the spectrogram; always close the figure so figures do not pile up
            # in memory over the loop.
            fig = plt.figure()
            try:
                librosa.display.specshow(log_mel_spectrogram, sr=sr)
                fig.savefig(tmp_png, bbox_inches='tight')
            finally:
                plt.close(fig)

            # Read the plot back with cv2 and store it at 224x224.
            img = cv2.imread(tmp_png)
            if img is None:
                raise IOError(f"could not read rendered spectrogram for sample {sample}")
            img = cv2.resize(img, (224, 224))
            cv2.imwrite(f"{save_path}/{sample}.png", img)

### Image

In [5]:
def generate_CLIP_features(src_path: str, save_path: str):
    """Extract CLIP image embeddings for every sample and write them to a CSV.

    Iterates over the module-level ``df``, opens ``{src_path}{sample}.png`` (note:
    plain string concatenation, so ``src_path`` must end with a path separator),
    runs the CLIP image tower and stores one row per sample as
    ``[sample, label, *features]``.

    Args:
        src_path: Prefix (directory with trailing separator) of the PNG images.
        save_path: Destination CSV path; written without the index, columns are positional.
    """
    model = CLIPModel.from_pretrained("/hdd/wmh/clip/").to(device)
    processor = CLIPProcessor.from_pretrained("/hdd/wmh/clip/")
    clip_features = []
    for _, row in df.iterrows():
        sample = row['sample']
        label = row['label']
        # Preprocess inside the `with` so the file handle is released right away.
        with Image.open(f"{src_path}{sample}.png") as img:
            inputs = processor(images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            # squeeze() drops the singleton batch dimension: [1, D] --> [D]
            image_features = model.get_image_features(**inputs).squeeze().cpu()
        clip_features.append([sample, label] + list(image_features.numpy()))
    # Keep the result in a local frame: rebinding the global `df` here would destroy
    # the sample/label table that every other extractor in this notebook iterates.
    res_df = pd.DataFrame(clip_features)
    res_df.to_csv(save_path, index=False)

In [6]:
def generate_mvitv_features(src_path: str, save_path: str):
    """Extract pooled MViTv2 image features for every sample and write them to a CSV.

    Iterates over the module-level ``df``, opens ``{src_path}{sample}.png`` (plain
    string concatenation, so ``src_path`` must end with a path separator), applies the
    model's own evaluation transforms and stores one row per sample as
    ``[sample, label, *features]``.

    Args:
        src_path: Prefix (directory with trailing separator) of the PNG images.
        save_path: Destination CSV path; written without the index, columns are positional.
    """
    model = timm.create_model(
        'mvitv2_base_cls.fb_inw21k',
        pretrained=True,
        num_classes=0,  # remove classifier nn.Linear
    ).to(device)
    model = model.eval()
    # get model specific transforms (normalization, resize)
    data_config = timm.data.resolve_model_data_config(model)
    transforms = timm.data.create_transform(**data_config, is_training=False)

    mvit_features = []
    for _, row in df.iterrows():
        sample = row['sample']
        label = row['label']
        with Image.open(f"{src_path}{sample}.png") as img:
            # convert("RGB") is a no-op for RGB files and guards against grayscale/RGBA
            # inputs, which the 3-channel normalization cannot handle.
            batch = transforms(img.convert("RGB")).unsqueeze(0).to(device)
        # Inference only: no autograd graph is needed. A single forward pass is enough;
        # the extra `model(...)` call whose result was immediately discarded is gone.
        with torch.no_grad():
            unpooled = model.forward_features(batch)
            pooled = model.forward_head(unpooled, pre_logits=True).cpu().squeeze()
        mvit_features.append([sample, label] + list(pooled.numpy()))
    # Keep the result in a local frame: rebinding the global `df` here would destroy
    # the sample/label table that every other extractor in this notebook iterates.
    res_df = pd.DataFrame(mvit_features)
    res_df.to_csv(save_path, index=False)

In [7]:
def generate_ResNet_features(src_path: str, save_path: str):
    """Extract pooled ResNet-50 image features for every sample and write them to a CSV.

    Iterates over the module-level ``df``, opens ``{src_path}{sample}.png`` (plain
    string concatenation, so ``src_path`` must end with a path separator), runs
    ``microsoft/resnet-50`` and stores one row per sample as
    ``[sample, label, *features]``.

    Args:
        src_path: Prefix (directory with trailing separator) of the PNG images.
        save_path: Destination CSV path; written without the index, columns are positional.
    """
    processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    model = ResNetModel.from_pretrained("microsoft/resnet-50").to(device)

    resnet_features = []
    for _, row in df.iterrows():
        sample = row['sample']
        label = row['label']
        # Preprocess inside the `with` so the file handle is released right away.
        with Image.open(f"{src_path}{sample}.png") as img:
            inputs = processor(images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            # squeeze() removes every singleton dimension of the pooled output,
            # leaving a flat feature vector for the single input image.
            image_features = model(**inputs).pooler_output.squeeze().cpu()
        resnet_features.append([sample, label] + list(image_features.numpy()))
    # Keep the result in a local frame: rebinding the global `df` here would destroy
    # the sample/label table that every other extractor in this notebook iterates.
    res_df = pd.DataFrame(resnet_features)
    res_df.to_csv(save_path, index=False)

### Text

In [8]:
def mean_pooling(model_output, attention_mask):
    """Attention-mask-weighted mean of the token embeddings, computed on the CPU.

    Args:
        model_output: Indexable model output whose element 0 holds the token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over the
            embedding dimension and cast to float before weighting.

    Returns:
        The masked sum over dimension 1 divided by the mask sum (clamped to at least
        1e-9 so that fully masked rows do not divide by zero).
    """
    hidden = model_output[0]  # first element contains all token embeddings
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float().cpu()
    weighted_sum = (hidden.cpu() * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_count

In [9]:
def generate_text_features(model_path: str, src_path: str,  save_path: str):
    """Embed every sample's text with a Hugging Face model and write the result to a CSV.

    Iterates over the module-level ``df``, reads ``{src_path}/{sample}.txt`` (tabs are
    replaced by spaces), encodes it and stores one row per sample as
    ``[sample, label, *features]``. When ``model_path`` contains ``"mpnet"`` the
    embedding is the mask-weighted mean of the token embeddings (``mean_pooling``);
    otherwise the model's ``pooler_output`` is used.

    Args:
        model_path: Name or path passed to ``AutoModel`` / ``AutoTokenizer``.
        src_path: Directory containing the ``<sample>.txt`` files.
        save_path: Destination CSV path; written without the index, columns are positional.

    Raises:
        Exception: Any error raised while encoding a sample is re-raised unchanged,
            after the offending sample, label and text have been printed.
    """
    # Load the model.
    model = AutoModel.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    use_mean_pooling = "mpnet" in model_path
    text_features = []
    print(device)
    for _, row in tqdm(df.iterrows(), total=len(df)):
        sample = row['sample']
        label = row['label']
        with open(f"{src_path}/{sample}.txt") as f:
            text = f.read().replace("\t", ' ')
        try:
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)
            with torch.no_grad():
                model_output = model(**inputs)
            if use_mean_pooling:
                embeddings = mean_pooling(model_output, inputs['attention_mask'])
            else:
                embeddings = model_output['pooler_output']
        except Exception:
            # Show which sample failed, then re-raise the real error. The previous
            # bare `except:` + `assert 0` also trapped KeyboardInterrupt, hid the
            # cause, and under `python -O` fell through with a stale `embeddings`.
            print(sample)
            print(label)
            print(text)
            raise

        text_features.append([sample, label] + list(embeddings.squeeze().cpu().detach().numpy()))
    res_df = pd.DataFrame(text_features)
    res_df.to_csv(save_path, index=False)

### Audio

In [12]:
import librosa
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoModel, AutoFeatureExtractor

In [13]:
def genereate_audio_features(model_path: str, src_path: str,  save_path: str):
    """Extract clip-averaged audio features with an encoder-decoder speech model.

    Iterates over the module-level ``df``, loads ``{src_path}/{sample}.mp3`` at 16 kHz,
    splits it into consecutive clips of at most 30 s, feeds each clip's
    ``input_features`` to the model together with two decoder start tokens, averages
    ``last_hidden_state`` over dimension 1 per clip and then averages the clip vectors.
    One row ``[sample, label, *features]`` is written per sample.
    (Presumably intended for Whisper-style checkpoints; confirm against the caller.)

    Args:
        model_path: Name or path passed to ``AutoModel`` / ``AutoFeatureExtractor``.
        src_path: Directory containing the ``<sample>.mp3`` files.
        save_path: Destination CSV path; written without the index, columns are positional.
    """
    model = AutoModel.from_pretrained(model_path).to(device)
    processor = AutoFeatureExtractor.from_pretrained(model_path)
    audio_features = []

    max_len = 16000 * 30
    # Loop-invariant, so build it once and place it on the same device as the model.
    decoder_input_ids = (torch.tensor([[1, 1]]) * model.config.decoder_start_token_id).to(device)
    for _, row in tqdm(df.iterrows(), total=len(df)):
        sample = row['sample']
        label = row['label']
        wav, _ = librosa.load(f"{src_path}/{sample}.mp3", sr=16000)
        # Clip start offsets. Stepping with range() avoids the empty trailing clip the
        # old `int(len / max_len) + 1` formula produced whenever the length was an exact
        # multiple of max_len; `or [0]` keeps the single (empty) clip for empty audio.
        wav_length = wav.shape[0]
        clip_starts = list(range(0, wav_length, max_len)) or [0]
        clip_features = []

        for start in clip_starts:
            segment = wav[start: start + max_len]
            inputs = processor(segment, sampling_rate=16000, return_tensors="pt").input_features.to(device)
            with torch.no_grad():
                outputs = model(inputs, decoder_input_ids=decoder_input_ids)
            out_feature = outputs['last_hidden_state'].mean(dim=1)
            # Convert to NumPy explicitly rather than relying on np.mean coercing tensors.
            clip_features.append(out_feature.squeeze().cpu().numpy())
        # Average the features of all clips.
        avg_feature = np.mean(np.stack(clip_features), axis=0)
        audio_features.append([sample, label] + list(avg_feature))
    res_df = pd.DataFrame(audio_features)
    res_df.to_csv(save_path, index=False)

In [14]:
def genereate_hubert_features(model_path: str, src_path: str,  save_path: str):
    """Extract clip-averaged audio features from a waveform-input speech model.

    Iterates over the module-level ``df``, loads ``{src_path}/{sample}.mp3`` at 16 kHz,
    splits it into consecutive clips of at most 30 s, feeds each clip's
    ``input_values`` to the model, averages ``last_hidden_state`` over dimension 1 per
    clip and then averages the clip vectors. One row ``[sample, label, *features]`` is
    written per sample.

    Args:
        model_path: Name or path passed to ``AutoModel`` / ``AutoProcessor``.
        src_path: Directory containing the ``<sample>.mp3`` files.
        save_path: Destination CSV path; written without the index, columns are positional.
    """
    model = AutoModel.from_pretrained(model_path).to(device)
    processor = AutoProcessor.from_pretrained(model_path)
    audio_features = []

    max_len = 16000 * 30
    for _, row in tqdm(df.iterrows(), total=len(df)):
        sample = row['sample']
        label = row['label']
        wav, _ = librosa.load(f"{src_path}/{sample}.mp3", sr=16000)
        # Clip start offsets. Stepping with range() avoids the empty trailing clip the
        # old `int(len / max_len) + 1` formula produced whenever the length was an exact
        # multiple of max_len; `or [0]` keeps the single (empty) clip for empty audio.
        wav_length = wav.shape[0]
        clip_starts = list(range(0, wav_length, max_len)) or [0]
        clip_features = []

        for start in clip_starts:
            segment = wav[start: start + max_len]
            # Follow the configured `device` instead of a hard-coded .cuda().
            inputs = processor(segment, sampling_rate=16000, return_tensors="pt").input_values.to(device)
            with torch.no_grad():
                outputs = model(inputs)
            out_feature = outputs['last_hidden_state'].mean(dim=1)
            # Convert to NumPy explicitly rather than relying on np.mean coercing tensors.
            clip_features.append(out_feature.squeeze().cpu().numpy())
        # Average the features of all clips.
        avg_feature = np.mean(np.stack(clip_features), axis=0)
        audio_features.append([sample, label] + list(avg_feature))
    res_df = pd.DataFrame(audio_features)
    res_df.to_csv(save_path, index=False)

In [64]:
def genereate_wav2vec_features(src_path: str,  save_path: str):
    """Extract clip-averaged features with ``facebook/wav2vec2-base-960h``.

    Iterates over the module-level ``df``, loads ``{src_path}/{sample}.mp3`` at 16 kHz,
    splits it into consecutive clips of at most 30 s, feeds each clip's
    ``input_values`` to the model, averages ``last_hidden_state`` over dimension 1 per
    clip and then averages the clip vectors. One row ``[sample, label, *features]`` is
    written per sample.

    Args:
        src_path: Directory containing the ``<sample>.mp3`` files.
        save_path: Destination CSV path; written without the index, columns are positional.
    """
    model = AutoModel.from_pretrained("facebook/wav2vec2-base-960h").to(device)
    # Only the feature extractor is needed; the tokenizer that used to be loaded here
    # was never used.
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    audio_features = []

    max_len = 16000 * 30
    for _, row in tqdm(df.iterrows(), total=len(df)):
        sample = row['sample']
        label = row['label']
        wav, _ = librosa.load(f"{src_path}/{sample}.mp3", sr=16000)
        # Clip start offsets. Stepping with range() avoids the empty trailing clip the
        # old `int(len / max_len) + 1` formula produced whenever the length was an exact
        # multiple of max_len; `or [0]` keeps the single (empty) clip for empty audio.
        wav_length = wav.shape[0]
        clip_starts = list(range(0, wav_length, max_len)) or [0]
        clip_features = []

        for start in clip_starts:
            segment = wav[start: start + max_len]
            # Follow the configured `device` instead of a hard-coded .cuda().
            inputs = feature_extractor(segment, sampling_rate=16000, return_tensors="pt").input_values.to(device)
            with torch.no_grad():
                outputs = model(inputs)
            out_feature = outputs['last_hidden_state'].mean(dim=1)
            clip_features.append(out_feature.squeeze().cpu().numpy())
        # Average the features of all clips.
        avg_feature = np.mean(np.array(clip_features), axis=0)
        audio_features.append([sample, label] + list(avg_feature))
    res_df = pd.DataFrame(audio_features)
    res_df.to_csv(save_path, index=False)

In [65]:
# Directory holding the per-sample .mp3 files; passed as `src_path` to the extractor below.
srcpath = "/hdd/wmh/Cookie/src_data/audio/"

# Destination CSV for the extracted wav2vec2 features; passed as `save_path`.
savepath = "/hdd/wmh/Cookie/feature/audio-wav2vec-base.csv"

In [66]:
# Run the wav2vec2 extraction over every sample and write the feature CSV.
genereate_wav2vec_features(src_path=srcpath, save_path=savepath)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
549it [03:51,  2.38it/s]


In [47]:
# Scratch/debug setup: loads the same wav2vec2 checkpoint, tokenizer and feature
# extractor at notebook level so the cells below can step through the extraction
# by hand. These names are kernel-global and are read by the following cells.
model = AutoModel.from_pretrained("facebook/wav2vec2-base-960h").to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
# Accumulates [sample, label, *features] rows appended by a later cell.
audio_features = []

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
# Debug run of the wav2vec2 extraction on the first sample only (note the `break`).
# It leaves `sample`, `label` and `clip_features` in the kernel for the cells below.
max_len = 16000 * 30
for _, row in tqdm(df.iterrows()):
    sample = row['sample']
    label = row['label']
    # Uses the notebook-level `srcpath`; `src_path` only exists as a function parameter,
    # so referencing it here raised NameError on a fresh kernel.
    wav, sr = librosa.load(f"{srcpath}/{sample}.mp3", sr=16000)
    # k clips of at most max_len samples. Stepping with range() avoids an empty trailing
    # clip when the length is an exact multiple of max_len.
    wav_length = wav.shape[0]
    clips = [(start, min(start + max_len, wav_length)) for start in (list(range(0, wav_length, max_len)) or [0])]
    clip_features = []

    for clip in clips:
        inputs = feature_extractor(wav[clip[0]: clip[1]], sampling_rate=16000, return_tensors="pt").input_values.to(device)
        with torch.no_grad():
            outputs = model(inputs)
        out_feature = outputs['last_hidden_state'].mean(dim=1)
        print(out_feature.squeeze().shape)
        # Store NumPy vectors so the np.array()/np.mean() cells below work directly.
        clip_features.append(out_feature.squeeze().cpu().numpy())
    break

0it [00:00, ?it/s]

torch.Size([768])
torch.Size([768])
torch.Size([768])


0it [00:00, ?it/s]

torch.Size([768])
torch.Size([768])





In [58]:
# Inspect the shape of each per-clip feature vector collected by the debug loop above.
for clip in clip_features:
    print(clip.shape)

(768,)
(768,)
(768,)
(768,)
(768,)


In [59]:
# Before averaging over all clips: check the shape of the stacked clip features.
print(np.array(clip_features).shape)


(5, 768)


In [60]:
# Average the clip vectors element-wise (axis 0 = clips) and append one
# [sample, label, *features] row, using the `sample`/`label` left in the kernel by the
# debug loop's last processed row.
avg_feature = np.mean(np.array(clip_features), axis=0)
audio_features.append([sample, label] + list(avg_feature))

In [62]:
# Display the shape of the averaged feature vector.
avg_feature.shape

(768,)