In [1]:
import pandas as pd
import numpy as np
import librosa
import seaborn as sns
import os
import json
import IPython.display as ipd
import soundfile as sf
import math
import h5py

from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt
from itertools import chain
from os.path import join as pjoin
from shutil import copyfile
from copy import deepcopy

from code_base.utils import parallel_librosa_load, write_json, load_json
from code_base.models.blocks import TraceableMelspec
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


`speechbrain` was not imported
`LEAF` was not imported




# Input Data

In [None]:
glob("/home/vova/data/exps/birdclef_2024/birdclef_2024/*")

In [None]:
eBird_Taxonomy_v2021 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/eBird_Taxonomy_v2021.csv")
sample_submission = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/sample_submission.csv")
train_metadata = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended.csv", converters={"secondary_labels":eval, "all_labels": eval})

In [None]:
train_metadata.head()

In [None]:
eBird_Taxonomy_v2021.head()

In [None]:
scored_birds = sample_submission.columns[1:]
len(scored_birds)

In [None]:
train_metadata["primary_label"].isin(scored_birds).sum() / train_metadata.shape[0]

In [None]:
train_metadata["primary_label"].value_counts()

In [None]:
train_metadata

In [None]:
sample_submission

In [None]:
glob("/home/vova/data/exps/birdclef_2024/birdclef_2024/test_soundscapes/*")

In [None]:
train_metadata.columns

In [None]:
(set(train_metadata["primary_label"]) | set(chain(*train_metadata["secondary_labels"]))) - set(scored_birds)

In [None]:
set(scored_birds)

In [None]:
write_json(
    "/home/vova/data/exps/birdclef_2024/scored_birds/sb_2024.json",
    scored_birds.to_list()
)

# Dataset Description

Your challenge in this competition is to identify which birds are calling in recordings made in a Global Biodiversity Hotspot in the Western Ghats. This is an important task for scientists who monitor bird populations for conservation purposes. More accurate solutions could enable more comprehensive monitoring.

This competition uses a hidden test set. When your submitted notebook is scored, the actual test data will be made available to your notebook.

# Files

**train_audio/** The training data consists of short recordings of individual bird calls generously uploaded by users of xenocanto.org. These files have been downsampled to 32 kHz where applicable to match the test set audio and converted to the ogg format. The training data should have nearly all relevant files; we expect there is no benefit to looking for more on xenocanto.org and appreciate your cooperation in limiting the burden on their servers.

**test_soundscapes/** When you submit a notebook, the test_soundscapes directory will be populated with approximately 1,100 recordings to be used for scoring. They are 4 minutes long and in ogg audio format. The file names are randomized but have the general form of soundscape_xxxxxx.ogg. It should take your submission notebook approximately five minutes to load all of the test soundscapes.

**unlabeled_soundscapes/** Unlabeled audio data from the same recording locations as the test soundscapes.

**train_metadata.csv** A wide range of metadata is provided for the training data. The most directly relevant fields are:

- `primary_label` - a code for the bird species. You can review detailed information about the bird codes by appending the code to `https://ebird.org/species/`, such as `https://ebird.org/species/amecro` for the American Crow. Not all species have their own pages; some links will fail.
- `latitude & longitude`: coordinates for where the recording was taken. Some bird species may have local call 'dialects,' so you may want to seek geographic diversity in your training data.
- `author` - The user who provided the recording.
- `filename`: the name of the associated audio file.

**sample_submission.csv** A valid sample submission.

- `row_id`: A slug of `soundscape_[soundscape_id]_[end_time]` for the prediction.
- `[bird_id]`: There are 182 bird ID columns. You will need to predict the probability of the presence of each bird for each row.

**eBird_Taxonomy_v2021.csv** - Data on the relationships between different species.

# Fix Duplicates

In [2]:
def plot_n_spectograms_with_librosa(
    pathes,
    sr: int,
    n_mels: int,
    fmin: int,
    fmax: int,
    hop_length: int,
    n_fft: int,
    n_cols: int = 5,
    figsize: tuple = (5, 5),
    save_path: str = None,
):
    """
    Plot n spectograms from df using librosa
    """
    import librosa
    import librosa.display

    fig, axs = plt.subplots(
        len(pathes), 1, figsize=figsize, sharex=True, sharey=True
    )
    axs = axs.flatten()
    for i, path in enumerate(pathes):
        y, sr = librosa.load(f"/home/vova/data/exps/birdclef_2024/birdclef_2024/train_audio/{path}", sr=sr)
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y, hop_length=hop_length)),ref=np.max)
        librosa.display.specshow(D, sr=sr, hop_length=hop_length,x_axis='time', ax=axs[i])
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path)
    plt.show()

In [None]:
train_metadata = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended.csv")
train_metadata.head()

In [3]:
def merge_duplicates(full_df, dupl_idxs, with_labels_merge=True):
    if len(dupl_idxs) > 2 and with_labels_merge:
        raise ValueError("Can not handle more than 2 dupls")
    dupl_df = full_df.loc[dupl_idxs]
    if with_labels_merge:
        dupl_df["all_labels"] = dupl_df["all_labels"].apply(eval)
        if set(dupl_df["all_labels"].iloc[0]) != set(dupl_df["all_labels"].iloc[1]):
            dupl_df = dupl_df.sort_values("rating", ascending=False)
            full_df.loc[dupl_df.index[0], "secondary_labels"] = repr(list(set(
                eval(full_df.loc[dupl_df.index[0], "secondary_labels"]) +
                [full_df.loc[dupl_df.index[1], "primary_label"]] +
                eval(full_df.loc[dupl_df.index[1], "secondary_labels"])
            )))
            full_df.loc[dupl_df.index[0], "all_labels"] = repr(list(set(
                eval(full_df.loc[dupl_df.index[0], "all_labels"]) + eval(full_df.loc[dupl_df.index[1], "all_labels"])
            )))

        full_df = full_df.drop(index=[dupl_df.index[-1]])
    else:
        full_df = full_df.drop(index=dupl_df.index[1:])
    return full_df

In [None]:
train_metadata["id"] = train_metadata["url"].apply(lambda x: x.split("/")[-1])

In [None]:
id_vc = train_metadata["id"].value_counts()

In [None]:
duplicated_id = id_vc[id_vc > 1].index

In [None]:
for dupl_id in duplicated_id:
    print("\n\n\nBefore\n\n\n")
    print(train_metadata.loc[train_metadata["id"] == dupl_id, ["primary_label", "secondary_labels", "latitude", "longitude", "duration_s", "all_labels"]])
    train_metadata = merge_duplicates(train_metadata, train_metadata[train_metadata["id"] == dupl_id].index)
    print("\n\n\nAfter\n\n\n")
    print(train_metadata.loc[train_metadata["id"] == dupl_id, ["primary_label", "secondary_labels", "latitude", "longitude", "duration_s", "all_labels"]])

In [None]:
train_metadata["id"].value_counts()

In [None]:
train_metadata = train_metadata.reset_index(drop=True)

In [None]:
my_duplicates = train_metadata.groupby(["duration_s", "author", "primary_label"])["filename"].apply(list).to_list()
my_duplicates = [el for el in my_duplicates if len(el) > 1]
len(my_duplicates)

In [None]:
my_duplicates[187]

In [None]:
for idx, dupl in enumerate(my_duplicates):
    print(f"Duplicate ID: {idx}")
    plot_n_spectograms_with_librosa(
        dupl,
        sr=None,
        n_mels=129,
        fmin=20,
        fmax=None,
        hop_length=512,
        n_fft=1024,
    )

In [None]:
my_vis_dupl_indices = [
    0, 4, 6, 11, 14, 25, 27, 30, 31, 34, 37, 39, 40, 46, 52, 54, 58, 62, 63, 72, 77, 79, 
    81, 82, 85, 87, 90, 91, 92, 94, 95, 96, 101, 102, 103, 104, 105, 108, 111, 112, 114, 
    115, 116, 119, 120, 121, 124, 125, 126, 127, 128, 130, 133, 135, 136, 144, 147, 149, 
    151, 152, 154, 156, 157, 158, 159, 161, 162, 163, 165, 166, 167, 168, 169, 170, 171, 
    173, 175, 176, 178, 179, 180, 181, 182, 183, 184, 185, 187
]
my_duplicates_vis = [my_duplicates[idx] for idx in my_vis_dupl_indices]
my_duplicates_vis = [el for el in my_duplicates_vis if len(el) ==2]

In [None]:
len(my_duplicates_vis)

In [None]:
for idx, dupl in enumerate(my_duplicates_vis):
    print(f"Duplicate ID: {idx}")
    plot_n_spectograms_with_librosa(
        dupl,
        sr=None,
        n_mels=129,
        fmin=20,
        fmax=None,
        hop_length=512,
        n_fft=1024,
    )

In [None]:
train_metadata.loc[train_metadata["filename"].isin(my_duplicates_vis[0]), ["primary_label", "secondary_labels", "latitude", "longitude", "duration_s", "all_labels"]]

In [None]:
for dupl_fnames in my_duplicates_vis:
    print("\n\n\nBefore\n\n\n")
    print(train_metadata.loc[train_metadata["filename"].isin(dupl_fnames), ["primary_label", "secondary_labels", "latitude", "longitude", "duration_s", "all_labels"]])
    train_metadata = merge_duplicates(train_metadata, train_metadata[train_metadata["filename"].isin(dupl_fnames)].index)
    print("\n\n\nAfter\n\n\n")
    print(train_metadata.loc[train_metadata["filename"].isin(dupl_fnames), ["primary_label", "secondary_labels", "latitude", "longitude", "duration_s", "all_labels"]])

In [None]:
train_metadata.head()

In [None]:
orig_train_metadata = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended.csv"
)

In [None]:
orig_train_metadata.shape[0] - train_metadata.shape[0]

In [None]:
train_metadata["primary_label"].value_counts()

In [None]:
train_metadata.to_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended_noduplv1.csv", index=False)

In [11]:
train_metadata = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended_noduplv1.csv")

add_data_prev_comp = pd.read_csv("/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_meta_prev_comps_extended.csv")

add_data_xc = pd.read_csv("/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_noduplV3_extended.csv")

# merged = pd.concat([train_metadata, add_data_prev_comp, add_data_xc]).reset_index(drop=True)
merged = pd.concat([train_metadata, add_data_xc, add_data_prev_comp]).reset_index(drop=True)

  add_data_prev_comp = pd.read_csv("/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_meta_prev_comps_extended.csv")


In [12]:
merged["url"].apply(lambda x: x.split("/")[-1]).value_counts()

url
590213    2
316684    2
134896    1
498913    1
481601    1
         ..
192549    1
192547    1
191394    1
191025    1
177143    1
Name: count, Length: 141094, dtype: int64

In [13]:
dupls = merged.groupby(["duration_s", "author", "primary_label", "latitude", "longitude"])["filename"].apply(list).to_list()
dupls = [el for el in dupls if len(el) > 1]

In [14]:
len(dupls)

539

In [15]:
train_metadat_fnames_set = set(train_metadata["filename"])
dupls = [el for el in dupls if not all(sub_el in train_metadat_fnames_set for sub_el in el)]
len(dupls)

481

In [16]:
merged[merged["filename"].isin(dupls[-1])]

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,...,id,gen,sp,file-name,time,also,foldername,dataset,sample_rate,au_len
51777,eubeat1,['cohmar1'],"['call', 'female', 'flight call', 'male']",37.3823,-6.1363,Merops apiaster,European Bee-eater,José Carlos Sires,Creative Commons Attribution-NonCommercial-Sha...,4.5,...,,,,,,,,comp_2023,32000.0,27943079.0
51779,eubeat1,['cohmar1'],"['call', 'female', 'flight call', 'male']",37.3823,-6.1363,Merops apiaster,European Bee-eater,José Carlos Sires,Creative Commons Attribution-NonCommercial-Sha...,0.0,...,,,,,,,,comp_2023,32000.0,27943079.0


In [17]:
for dupl_fnames in dupls:
    merged = merge_duplicates(merged, merged[merged["filename"].isin(dupl_fnames)].index, with_labels_merge=False)


In [18]:
merged[merged["filename"].isin(dupls[-1])]

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,...,id,gen,sp,file-name,time,also,foldername,dataset,sample_rate,au_len
51777,eubeat1,['cohmar1'],"['call', 'female', 'flight call', 'male']",37.3823,-6.1363,Merops apiaster,European Bee-eater,José Carlos Sires,Creative Commons Attribution-NonCommercial-Sha...,4.5,...,,,,,,,,comp_2023,32000.0,27943079.0


In [19]:
merged = merged.reset_index(drop=True)

In [20]:
set(add_data_xc["dataset"])

{'xc_2024_classes'}

In [21]:
new_train_metadata = merged[merged["dataset"].isna()].reset_index(drop=True)

new_add_data_prev_comp = merged[(merged["dataset"] != "xc_2024_classes") & (~merged["dataset"].isna())].reset_index(drop=True)

new_add_data_xc = merged[merged["dataset"] == "xc_2024_classes"].reset_index(drop=True)

In [22]:
new_train_metadata.shape[0] - train_metadata.shape[0], new_add_data_prev_comp.shape[0] - add_data_prev_comp.shape[0], new_add_data_xc.shape[0] - add_data_xc.shape[0]

(-2, -376, -137)

In [31]:
new_add_data_prev_comp = new_add_data_prev_comp[~new_add_data_prev_comp["url"].apply(lambda x: x.split("/")[-1]).isin(["590213", "316684"])].reset_index(drop=True)

In [32]:
new_add_data_prev_comp.to_csv("/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_meta_prev_comps_extended_pruned.csv", index=False)
# add_data_xc.to_csv("/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_noduplV3_extended.csv", index=False)

# Handling one sample classes

In [None]:
def add_postfix(input, postfix):
    base, ext = os.path.splitext(input)
    return base + postfix + ext

In [None]:
one_sample_classes = train_metadata_nodupl["primary_label"].value_counts().iloc[-7:].index

In [None]:
one_sample_df = train_metadata_nodupl[train_metadata_nodupl["primary_label"].isin(one_sample_classes)].reset_index(drop=True)

one_sample_df_1 = one_sample_df.copy()
one_sample_df_2 = one_sample_df.copy()

one_sample_df_1["filename"] = one_sample_df_1["filename"].apply(lambda x: add_postfix(x, "_1"))
one_sample_df_2["filename"] = one_sample_df_2["filename"].apply(lambda x: add_postfix(x, "_2"))

one_sample_df_ext = pd.concat([one_sample_df_1, one_sample_df_2])

In [None]:
one_sample_df_ext["filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/train_audio/", x)).to_list()

In [None]:
train_audio_samples = parallel_librosa_load(
    one_sample_df_ext["filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/train_audio/", x)).to_list(), 
    return_sr=False, 
    sr=None
)
sample_rates = parallel_librosa_load(
    one_sample_df_ext["filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/train_audio/", x)).to_list(), 
    return_audio=False, 
    sr=None
)

In [None]:
one_sample_df_ext["duration_s"] = [len(au_) / sr_ for au_, sr_ in zip(train_audio_samples, sample_rates)]

In [None]:
train_metadata_nodupl_split_1sample = train_metadata_nodupl.copy()
train_metadata_nodupl_split_1sample = train_metadata_nodupl_split_1sample[~train_metadata_nodupl_split_1sample["primary_label"].isin(one_sample_classes)].reset_index(drop=True)
train_metadata_nodupl_split_1sample = pd.concat([train_metadata_nodupl_split_1sample, one_sample_df_ext]).reset_index(drop=True)

In [None]:
add_data = pd.read_csv("/home/vova/data/exps/BirdCLEF_2023/train_metadata_extended_2020_2022_no2023_scored.csv", converters={"all_labels": eval})

In [None]:
train_metadata_nodupl.loc[
    train_metadata_nodupl["primary_label"].isin(set(list(chain(*add_data["all_labels"].to_list())))), 
    "primary_label"
].value_counts()

In [None]:
train_metadata_nodupl_split_1sample.to_csv("/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/train_metadata_extended_nodupl_v1_1SampleSplitV1.csv", index=False)

In [None]:
train_metadata_nodupl_split_1sample["primary_label"].value_counts()

# Check 2 Previous steps

In [None]:
new_df = pd.read_csv("/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/train_metadata_extended_nodupl_v1_1SampleSplitV1.csv")
old_df = pd.read_csv("/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/train_metadata_extended.csv")

In [None]:
duplicates = load_json("/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/duplicates_v1.json")

# Test Audio Data

In [None]:
ipd.Audio("/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/test_soundscapes/soundscape_29201.ogg")

In [None]:
test_au, test_sr = librosa.load("/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/test_soundscapes/soundscape_29201.ogg", sr=None)

In [None]:
len(test_au) / test_sr, test_sr

# Train Audio 

In [None]:
train_samples = glob("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_audio/*/*.ogg", recursive=True)

In [None]:
sample_rates = parallel_librosa_load(
    train_samples, 
    return_audio=False, 
    sr=None
)

In [None]:
train_audio_samples = parallel_librosa_load(
    train_samples, 
    return_sr=False, 
    sr=None
)

In [None]:
np.unique(sample_rates)

In [None]:
train_audio_samples_lens = [len(au_) / sr_ for au_, sr_ in zip(train_audio_samples, sample_rates)]
plt.hist(train_audio_samples_lens, bins=30);

In [None]:
def get_filename(path):
    return "/".join(path.split("/")[-2:])

audio_info = pd.DataFrame({
    "filename":[get_filename(el) for el in train_samples],
    "duration_s":train_audio_samples_lens
})

audio_info

In [None]:
(audio_info["duration_s"] < 30).sum() / audio_info.shape[0]

In [None]:
train_metadata = train_metadata.merge(audio_info, on="filename")
train_metadata

In [None]:
train_metadata.url.value_counts()

In [None]:
train_metadata[train_metadata.url == "https://www.xeno-canto.org/514027"]

In [None]:
train_metadata.loc[9245, "secondary_labels"] = ["laudov1"]
train_metadata = train_metadata.drop(index=[16259]).reset_index(drop=True)

In [None]:
train_metadata[train_metadata.url == "https://www.xeno-canto.org/514027"]

In [None]:
train_metadata['all_labels'] = train_metadata.apply(lambda x: [x["primary_label"]] + x["secondary_labels"], axis=1)

In [None]:
train_metadata.to_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended.csv", index=False)

# Secondary Labels

In [None]:
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].apply(eval)

In [None]:
secondary_labels_unique = set(list(chain(*train_metadata["secondary_labels"].to_list())))

In [None]:
secondary_labels_unique - set(train_metadata["primary_label"])

In [None]:
set(train_metadata["primary_label"]) - secondary_labels_unique

# Metric

In [None]:
!pip list | grep scikit

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
def adjust_classes(array):
    """ Adjust the array to ensure no column has only one class present.
    
    Args:
    array (np.ndarray): The input array of shape (N_rows, N_classes).
    
    Returns:
    np.ndarray: The adjusted array.
    """
    # Iterate over each column in the array
    for col_idx in range(array.shape[1]):
        if np.all(array[:, col_idx] == 1):
            # If all values in the column are 1, randomly select one row to change to 0
            row_to_change = np.random.choice(array.shape[0])
            array[row_to_change, col_idx] = 0
            
    return array

def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str = "row_id") -> float:
    '''
    Version of macro-averaged ROC-AUC score that ignores all classes that have no true positive labels.
    '''
    del solution[row_id_column_name]
    del submission[row_id_column_name]

    solution_sums = solution.sum(axis=0)
    scored_columns = list(solution_sums[solution_sums > 0].index.values)
    assert len(scored_columns) > 0

    return roc_auc_score(
        y_true=solution[scored_columns].values,
        y_score=submission[scored_columns].values,
        average="macro"
    )

def score_numpy(y_true: np.ndarray, y_pred: np.ndarray):
    scored_columns_mask = y_true.sum(axis=0) > 0

    y_true_filtered = y_true.T[scored_columns_mask].T
    y_pred_filtered = y_pred.T[scored_columns_mask].T

    return roc_auc_score(
        y_true=y_true_filtered,
        y_score=y_pred_filtered,
        average="macro"
    )

In [None]:
sample_solution = sample_submission.copy()
sample_solution[[col for col in sample_solution.columns if col != "row_id"]] = adjust_classes(np.random.randint(
    0,
    2, 
    size=sample_solution[[col for col in sample_solution.columns if col != "row_id"]].shape
))

In [None]:
score(
    solution=sample_solution.copy(),
    submission=sample_submission.copy(),
)

In [None]:
score(
    solution=sample_solution.copy(),
    submission=sample_solution.copy(),
)

In [None]:
score_numpy(
    y_true=sample_solution[[col for col in sample_submission.columns if col != "row_id"]].values,
    y_pred=sample_solution[[col for col in sample_solution.columns if col != "row_id"]].values
)

In [None]:
score_numpy(
    y_true=sample_solution[[col for col in sample_submission.columns if col != "row_id"]].values,
    y_pred=sample_submission[[col for col in sample_solution.columns if col != "row_id"]].values
)

# Sample Weights

In [None]:
# sample_weights = train_metadata["primary_label"].value_counts()

In [None]:
sample_weights = (train_metadata["primary_label"].value_counts() / train_metadata["primary_label"].value_counts().sum())  ** (-0.5)

In [None]:
write_json("/home/vova/data/exps/birdclef_2024/sample_weights/sw_2024_v1.json", sample_weights.to_dict())

# CV Split

In [None]:
cv_split = np.load("/home/vova/data/exps/birdclef_2024/cv_splits/birdclef_2024_5_folds_split.npy", allow_pickle=True)

In [None]:
len(set(train_metadata["primary_label"]))

## CV Split Precise Check

In [None]:
cv_split = np.load("/home/vova/data/exps/birdclef_2024/cv_splits/birdclef_2024_5_folds_split.npy", allow_pickle=True)
df = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata.csv")

In [None]:
all_classes = set(df["primary_label"])
len(all_classes)

In [None]:
for fold_id, (train_idx, val_idx) in enumerate(cv_split):
    assert len(set(df["filename"].iloc[val_idx]) & set(df["filename"].iloc[train_idx])) == 0, fold_id
    assert set(df["primary_label"].iloc[train_idx]) == all_classes, fold_id

# Create Fake Test Data

In [None]:
file_to_copy = "/home/vova/data/exps/BirdCLEF_2023/birdclef_2023/test_soundscapes/soundscape_29201.ogg"

In [None]:
for i in range(20):
    copyfile(
        file_to_copy,
        f"/home/vova/data/exps/BirdCLEF_2023/kaggle_datasets/bird_clef_2023_addones/fake_test_20/fake_test_file_{i}.ogg"
    )