## Import

In [1]:
!pip install datasets==2.15.0



In [2]:
import random
import pandas as pd
import numpy as np
import os
import torch
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import librosa
from glob import glob
import datasets #HF
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings(action='ignore') 

  from .autonotebook import tqdm as notebook_tqdm


## Hyperparameter Setting

In [3]:
# Notebook-wide settings, looked up by key in the cells below.
CFG = {
    "SR": 16_000,    # sampling rate handed to the audio feature
    "N_MFCC": 32,    # number of mel-spectrogram vectors to extract (not referenced in the visible cells)
    "SEED": 42,      # seed for the RNGs and the train/validation split
}

## Fixed Random-Seed

In [4]:
def seed_everything(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (via ``torch.manual_seed`` and ``torch.cuda.manual_seed``), exports
    ``PYTHONHASHSEED`` and sets the cuDNN flags to deterministic /
    non-benchmarking.

    Args:
        seed: Integer seed applied to all of the generators above.
    """
    # Interpreter-level and NumPy randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # cuDNN flags: no benchmarking, deterministic mode on.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


seed_everything(CFG['SEED']) # Fix the random seed


## Data Pre-Processing

In [5]:
# Folder containing train.csv / test.csv; the CSV `path` values are prefixed
# with it below so that the audio files resolve from the notebook's location.
data_url = "../data"

# Training metadata.
train_df = pd.read_csv(os.path.join(data_url, 'train.csv'))
train_df['path'] = data_url + os.sep + train_df['path']

# Test metadata.
test_df = pd.read_csv(os.path.join(data_url, 'test.csv'))
test_df['path'] = data_url + os.sep + test_df['path']

test_data = Dataset.from_pandas(test_df)

# Hold out 20% of the training rows; the resulting DatasetDict names the
# held-out part "test" (see the output below).
train_data = Dataset.from_pandas(train_df).train_test_split(
    train_size=0.8,
    seed=CFG['SEED'],
)
train_data

DatasetDict({
    train: Dataset({
        features: ['id', 'path', 'label'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['id', 'path', 'label'],
        num_rows: 1001
    })
})

In [6]:
# Distinct class labels from the training CSV, in ascending order.
labels = np.sort(train_df['label'].unique())

# Two-way lookup between raw labels and class indices. int() converts the NumPy
# integers to plain Python ints; the indices are stored as strings.
label2id = {int(label): str(idx) for idx, label in enumerate(labels)}
id2label = {str(idx): int(label) for idx, label in enumerate(labels)}

In [7]:
# Display the label -> index mapping as a quick sanity check.
label2id

{0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5'}

In [8]:
# Re-type the "path" column as an audio feature at CFG['SR']; indexing a row then
# yields a dict with 'path', 'array' and 'sampling_rate' (see the output below).
train_data = train_data.cast_column("path", datasets.Audio(sampling_rate=CFG['SR']))
# Peek at the first training example.
train_data['train'][0]

{'id': 'TRAIN_0451',
 'path': {'path': '../data/./train/TRAIN_0451.wav',
  'array': array([0.01794434, 0.0173645 , 0.01745605, ..., 0.        , 0.        ,
         0.        ]),
  'sampling_rate': 16000},
 'label': 1}

In [9]:
from transformers import AutoFeatureExtractor
# Checkpoint identifier. NOTE(review): at this point `model` is a string; a later
# cell rebinds the same name to the loaded network.
model = 'facebook/wav2vec2-large-xlsr-53'
# Feature extractor loaded from the same checkpoint id; used by preprocess_function.
feature_extractor = AutoFeatureExtractor.from_pretrained(model)

In [10]:
def preprocess_function(examples, max_duration_s=2.0):
    """Turn a batch of decoded audio rows into feature-extractor outputs.

    Args:
        examples: Batch as passed by ``Dataset.map(..., batched=True)``.
            ``examples["path"]`` must be a list of dicts that each hold the
            waveform under the ``"array"`` key.
        max_duration_s: Longest clip length, in seconds, kept by truncation.
            The default of 2.0 reproduces the previous fixed cap of
            ``16000*2`` samples for a 16 kHz extractor.

    Returns:
        Whatever the module-level ``feature_extractor`` returns for the batch.
    """
    sampling_rate = feature_extractor.sampling_rate
    waveforms = [sample["array"] for sample in examples["path"]]
    return feature_extractor(
        waveforms,
        sampling_rate=sampling_rate,
        # Derive the sample cap from the extractor's own rate instead of a
        # literal 16000 so the two can never drift apart.
        max_length=int(sampling_rate * max_duration_s),
        truncation=True,
    )

In [11]:
# Apply preprocess_function to every split in batches. The "path" column is
# dropped from the result (see the feature list in the output below).
encoded_dataset = train_data.map(
    preprocess_function,
    batched=True,
    remove_columns=["path"],
)
encoded_dataset

Map: 100%|██████████| 4000/4000 [00:03<00:00, 1030.88 examples/s]
Map: 100%|██████████| 1001/1001 [00:00<00:00, 1123.10 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'input_values', 'attention_mask'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['id', 'label', 'input_values', 'attention_mask'],
        num_rows: 1001
    })
})

## Classification Model Fit

In [12]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Keep the checkpoint id under its own name so that `model` only ever refers to
# the network created below (previously one name held first a str, then the model).
model_checkpoint = 'facebook/wav2vec2-large-xlsr-53'
num_labels = len(id2label)

# Load the checkpoint with an audio-classification head sized to num_labels and
# the label maps attached.
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.weight', 'projector.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
!pip show datasets

Name: datasets
Version: 2.15.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /scratch/network/mk8574/.conda/envs/mk8574_3.10/lib/python3.10/site-packages
Requires: aiohttp, dill, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, pyarrow-hotfix, pyyaml, requests, tqdm, xxhash
Required-by: 


In [18]:
# Fine-tuning configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch, with the checkpoint limit set to 1.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Optimisation settings.
    learning_rate=5e-5,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # Seed training from the notebook config rather than the library default,
    # so changing CFG['SEED'] actually changes the training run.
    seed=CFG['SEED'],
    # Logging / reporting.
    logging_steps=10,
    push_to_hub=False,
    report_to="none",
)

In [15]:
def compute_metrics(eval_preds):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Accuracy is computed directly with NumPy instead of calling the deprecated
    ``datasets.load_metric("accuracy")`` on every evaluation, which reloads the
    metric each time and may need network access.

    Args:
        eval_preds: Pair ``(logits, labels)``. ``logits`` holds one score per
            class on the last axis; ``labels`` holds the integer class ids.

    Returns:
        ``{"accuracy": float}``: the fraction of rows whose arg-max class
        equals the label (``nan`` when there are no rows).
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    if labels.size == 0:
        # Accuracy is undefined without samples; avoid NumPy's empty-mean warning.
        return {"accuracy": float("nan")}
    return {"accuracy": float(np.mean(predictions == labels))}

In [16]:
# Wire the model, arguments, data splits and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    # The held-out 20% split (named "test" by train_test_split) is used for evaluation.
    eval_dataset=encoded_dataset["test"],
    # NOTE(review): the feature extractor is passed through the `tokenizer` slot,
    # presumably so the Trainer can pad batches with it; confirm against the
    # installed transformers version.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [31]:

# NOTE(review): this flag is set after `transformers` was already imported in
# earlier cells; confirm that it still takes effect when set this late.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# NOTE(review): the recorded run of this call ended with
# AttributeError: 'AcceleratorState' object has no attribute 'distributed_type'.
trainer.train()

AttributeError: 'AcceleratorState' object has no attribute 'distributed_type'

In [27]:
# `accelerate` is never imported in this notebook, so `accelerate.__version__`
# raises NameError on a fresh kernel. Ask the installed distribution's metadata
# for its version instead; the result is the same version string.
import importlib.metadata

importlib.metadata.version("accelerate")

'0.24.1'

In [29]:
# NOTE(review): neither install is version-pinned, and the first one builds
# accelerate from the GitHub repository rather than a released package.
# The recorded run failed because github.com could not be resolved from this
# machine (see the output below).
!pip install git+https://github.com/huggingface/accelerate
!pip install --upgrade transformers

Collecting git+https://github.com/huggingface/accelerate
  Cloning https://github.com/huggingface/accelerate to /tmp/pip-req-build-g5npp8qu
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate /tmp/pip-req-build-g5npp8qu
  fatal: unable to access 'https://github.com/huggingface/accelerate/': Could not resolve host: github.com
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mgit clone --[0m[32mfilter[0m[32m=[0m[32mblob[0m[32m:none --quiet [0m[4;32mhttps://github.com/huggingface/accelerate[0m[32m [0m[32m/tmp/[0m[32mpip-req-build-g5npp8qu[0m did not run successfully.
  [31m│[0m exit code: [1;36m128[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mgit clone --[0m[32mfilter[0m[32m=[0m[32mblob[0m[32m:none --quiet [0m[4;32mhttps:

## Inference

In [None]:
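# TODO(review): placeholder that cannot run as written. `test_x` is not defined
# anywhere in this notebook, and `test_data` is never cast or encoded. Inference
# presumably has to go through the trainer on an encoded test set; confirm.
# preds = model.predict(test_x)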

## Submission

In [None]:
# submission = pd.read_csv(os.path.join(data_url, 'sample_submission.csv'))
# submission['label'] = preds
# submission.to_csv('./baseline_submission.csv', index=False)