In [72]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torchaudio
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report



In [66]:
def load_process(path):
    df = pd.read_csv(f'{path}/dem-info.csv')
    # fix 66* string value in Age column
    df['Age'] = df['Age'].apply(lambda x: x.replace('66*', '66'))
    df['Age'] = df['Age'].astype(int)

    # get_file_names
    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.txt'

    return df

def load_process_cleaned(path):
    df = pd.read_csv(f'{path}/dem-info.csv')
    # fix 66* string value in Age column
    df['Age'] = df['Age'].apply(lambda x: x.replace('66*', '66'))
    df['Age'] = df['Age'].astype(int)

    # get_file_names
    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'_{ext}_clean.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'_{ext}.txt'

    return df

process_path = './data/PROCESS-V1'
df = load_process(process_path)
df_clean = load_process_cleaned(process_path)

In [67]:
w2v2processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')
w2v2model = Wav2Vec2Model.from_pretrained('facebook/wav2vec2-base-960h')

def extract_wav2vec_features(wav_path):
    waveform, sample_rate = torchaudio.load(wav_path)
    inputs = w2v2processor(waveform.squeeze(), sampling_rate=sample_rate, return_tensors='pt', padding=True)
    with torch.no_grad():
        outputs = w2v2model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# df_train_w2v_ctd = [extract_wav2vec_features(a) for a in tqdm(df['CTD_wav'].values)]
# np.savetxt('./data/features/process-ctd-w2v-feature.csv', df_train_w2v_ctd, delimiter=',')
# df_train_w2v_pft = [extract_wav2vec_features(a) for a in tqdm(df['PFT_wav'].values)]
# np.savetxt('./data/features/process-pft-w2v-feature.csv', df_train_w2v_pft, delimiter=',')
# df_train_w2v_sft = [extract_wav2vec_features(a) for a in tqdm(df['SFT_wav'].values)]
# np.savetxt('./data/features/process-sft-w2v-feature.csv', df_train_w2v_sft, delimiter=',')

# df_train_w2v_ctd_clean = [extract_wav2vec_features(a) for a in tqdm(df_clean['CTD_wav'].values)]
# np.savetxt('./data/features/process-ctd-w2v-feature-clean.csv', df_train_w2v_ctd_clean, delimiter=',')
# df_train_w2v_pft_clean = [extract_wav2vec_features(a) for a in tqdm(df_clean['PFT_wav'].values)]
# np.savetxt('./data/features/process-pft-w2v-feature-clean.csv', df_train_w2v_pft_clean, delimiter=',')
# df_train_w2v_sft_clean = [extract_wav2vec_features(a) for a in tqdm(df_clean['SFT_wav'].values)]
# np.savetxt('./data/features/process-sft-w2v-feature-clean.csv', df_train_w2v_sft_clean, delimiter=',')

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
X_ctd = pd.read_csv("./data/features/process-ctd-w2v-feature.csv", header=None)
y_ctd, string_classes = pd.factorize(df['Class'])
X_ctd_train, X_ctd_test, y_ctd_train, y_ctd_test = train_test_split(X_ctd, y_ctd, test_size=0.2, random_state=42)

X_ctd_tensor_train = torch.tensor(X_ctd.values, dtype=torch.float32)
y_ctd_tensor_train = torch.tensor(y_ctd, dtype=torch.long)

In [None]:
dataset = TensorDataset(X_ctd_tensor_train, y_ctd_tensor_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

class FCNN(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(FCNN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        return self.fc(x)

model = FCNN(input_dim=768, num_classes=3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1000):
    for batch in dataloader:
        X_batch, y_batch = batch
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

model_save_path = "./models/fcnn_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Modell gespeichert unter: {model_save_path}")


# print("Numerische Labels:", y_numeric)        # [0, 1, 0, 2, 1]
# print("Eindeutige Klassen:", unique_classes)  # Index(['Class A', 'Class B', 'Class C'], dtype='object')

# # Numerische Labels zurück zu den String-Labels
# y_original = [unique_classes[num] for num in y_numeric]

# print("String-Labels zurück:", y_original)    # ['Class A', 'Class B', 'Class A', 'Class C', 'Class B']

Epoch 1, Loss: 0.9482460618019104
Epoch 2, Loss: 0.9926381707191467
Epoch 3, Loss: 0.9062037467956543
Epoch 4, Loss: 0.8487342596054077
Epoch 5, Loss: 0.9390729069709778
Epoch 6, Loss: 0.9928796887397766
Epoch 7, Loss: 1.0173670053482056
Epoch 8, Loss: 0.8835824131965637
Epoch 9, Loss: 1.018465518951416
Epoch 10, Loss: 0.9356891512870789
Epoch 11, Loss: 0.9003334045410156
Epoch 12, Loss: 0.8684390783309937
Epoch 13, Loss: 0.9368830919265747
Epoch 14, Loss: 0.9130021333694458
Epoch 15, Loss: 0.8713290095329285
Epoch 16, Loss: 0.8983926177024841
Epoch 17, Loss: 0.9464372396469116
Epoch 18, Loss: 0.8155264854431152
Epoch 19, Loss: 0.8201077580451965
Epoch 20, Loss: 1.0310001373291016
Epoch 21, Loss: 0.8982091546058655
Epoch 22, Loss: 0.8911512494087219
Epoch 23, Loss: 0.9564913511276245
Epoch 24, Loss: 0.7938830256462097
Epoch 25, Loss: 0.79862380027771
Epoch 26, Loss: 0.8667285442352295
Epoch 27, Loss: 0.9504789113998413
Epoch 28, Loss: 0.9153608083724976
Epoch 29, Loss: 0.77961003780364

In [73]:
model.eval()

X_ctd_tensor_test = torch.tensor(X_ctd_test.values , dtype=torch.float32)

with torch.no_grad():
    outputs = model(X_ctd_tensor_test)

_, predicted = torch.max(outputs, 1)

print("Accuracy:", accuracy_score(y_ctd_test, predicted))
print("Classification Report:\n", classification_report(y_ctd_test, predicted))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00        20

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32

