# Importing libraries/Dependencies

In [None]:

import pandas as pd # To create/manipulate data frames
import IPython # To display audio files
import numpy as np # Data Wrangling 
import os # For file management
import glob # Used to search for files that match a specific file pattern or name
from scipy.io.wavfile import read # used for reading .wav files
import shutil # This helps in automating the process of copying and removal of files and directories


# Step 1: Understand the Problem

We are building a deepfake audio detector: an AI model that is trained to tell fake (generated) audio apart from real audio. To train the model, we have gathered datasets that contain both fake and real audio files.

# Step 2: Extract Data

We already have three datasets.

This one will be used to train the model:
1. https://deepfake-demo.aisec.fraunhofer.de/in_the_wild


These two will be used to improve the accuracy of the model by supplying additional generated (fake) audio:
1. https://github.com/RUB-SysSec/WaveFake
2. https://www.kaggle.com/datasets/birdy654/deep-voice-deepfake-voice-recognition/data

This is an example of a random forest model being used to detect deepfakes:
1. https://www.kaggle.com/code/birdy654/detecting-ai-generated-speech-with-random-forests

# Step 3: Clean Data


#### We will be cleaning this dataset
https://deepfake-demo.aisec.fraunhofer.de/in_the_wild


In [None]:
# Read the metadata once and derive the working frames from it, instead of
# parsing the same CSV file from disk four times.
META_CSV_PATH = 'data/release_in_the_wild/OG Data/meta.csv'

in_the_wild_dataset = pd.read_csv(META_CSV_PATH)

# Independent copies, so filtering one frame later on never affects the others.
in_the_wild_dataset2 = in_the_wild_dataset.copy()

fake_audio_dataset = in_the_wild_dataset.copy()

real_audio_dataset = in_the_wild_dataset.copy()

### Label meanings:
```
1. spoof = fake
2. bona-fide = real
```

In [None]:
#A function that inspects everything about the dataset
def inspect_dataframe(input_df):
    """Print a quick health report for a DataFrame.

    The report contains, in order: the null count per column, whether any
    fully duplicated rows exist, the output of ``describe()``, and the
    column names (one per line).

    Parameters
    ----------
    input_df : pandas.DataFrame
        The frame to inspect. It is not modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Null values per column
    null_counts = input_df.isnull().sum()
    print('The Null Values:\n', null_counts)
    print('\n')

    # Duplicated rows (whole-row duplicates only)
    has_duplicates = input_df.duplicated().sum() != 0
    print('There is Duplicate Values' if has_duplicates else 'No Duplicate Values')
    print('\n')

    # Summary statistics
    summary = input_df.describe()
    print('The Description:\n', summary)
    print('\n')

    # Column names, one per line
    print('Columns:')
    for column_name in input_df.columns:
        print(column_name)

In [None]:
# Print null counts, duplicate check, describe() and column names for the full metadata frame
inspect_dataframe(in_the_wild_dataset)

The Null Values:
 file       0
speaker    0
label      0
dtype: int64


No Duplicate Values


The Description:
          file       speaker      label
count   31779         31779      31779
unique  31779            54          2
top     0.wav  Barack Obama  bona-fide
freq        1          3636      19963


Columns:
file
speaker
label


In [None]:
# Checking how many real audios are there.
# bona_fide_amount is a boolean mask (True where the label is 'bona-fide').
# Summing the mask counts the True entries and, unlike value_counts()[True],
# does not raise a KeyError when no row matches.
bona_fide_amount = in_the_wild_dataset['label'] == 'bona-fide'
int(bona_fide_amount.sum())

19963

In [None]:
# Checking how many fake audios are there.
# spoof_amount is a boolean mask (True where the label is 'spoof').
# Summing the mask counts the True entries and, unlike value_counts()[True],
# does not raise a KeyError when no row matches.
spoof_amount = in_the_wild_dataset['label'] == 'spoof'
int(spoof_amount.sum())

11816

In [None]:
# Number of audio clips available for every speaker
group_of_speaker = in_the_wild_dataset.groupby(by='speaker')
clips_per_speaker = group_of_speaker['speaker'].agg(['count'])
clips_per_speaker

Unnamed: 0_level_0,count
speaker,Unnamed: 1_level_1
2Pac,160
Adam Driver,217
Alan Watts,378
Alec Guinness,3625
Alexandria Ocasio-Cortez,390
Arnold Schwarzenegger,351
Ayn Rand,2493
Barack Obama,3636
Bernie Sanders,2877
Bill Burr,201


In [None]:
# Count, for every speaker, how many clips are "spoof" and how many are "bona-fide"
groupby_speaker_label = ['speaker', 'label']
amount_fake_real_audio = in_the_wild_dataset.groupby(by=groupby_speaker_label)

# Tip: set pandas' 'display.max_rows' option to None to see every row of the table.
new_data = amount_fake_real_audio['file'].agg(['count'])
new_data

Unnamed: 0_level_0,Unnamed: 1_level_0,count
speaker,label,Unnamed: 2_level_1
2Pac,bona-fide,59
2Pac,spoof,101
Adam Driver,bona-fide,80
Adam Driver,spoof,137
Alan Watts,bona-fide,86
...,...,...
Tucker Carlson,spoof,67
William F. Buckley,bona-fide,4
William F. Buckley,spoof,22
Winston Churchill,bona-fide,625


In [None]:
# Path of the first sample clip that is played back below.
# NOTE(review): the spoof-only listing printed later in this notebook contains
# 0.wav (Alec Guinness, spoof), so this clip appears to be a fake rather than a
# real recording — confirm against meta.csv.
real_audio = 'data/release_in_the_wild/Audio/0.wav'

In [None]:
# Build the caption from meta.csv instead of hard-coding it: the previous
# hard-coded "Real Audio" caption contradicted the metadata, whose spoof-only
# listing includes 0.wav (Alec Guinness).
caption_by_label = {'bona-fide': 'Real', 'spoof': 'Fake'}
sample_meta = in_the_wild_dataset.loc[
    in_the_wild_dataset['file'] == os.path.basename(real_audio)
].iloc[0]
print(f"{caption_by_label[sample_meta['label']]} Audio (Speaker = {sample_meta['speaker']}):")
IPython.display.Audio(real_audio)

Real Audio (Speaker = Alec Guinness):


In [None]:
# Path of the second sample clip that is played back below.
# NOTE(review): 4.wav is absent from the spoof-only listing printed later in this
# notebook (it jumps from 3.wav to 6.wav), which suggests this clip is labelled
# bona-fide rather than spoof — confirm against meta.csv.
fake_audio = 'data/release_in_the_wild/Audio/4.wav'

In [None]:
# Build the caption from meta.csv instead of hard-coding it: 4.wav does not
# appear in the spoof-only listing, so the previous hard-coded "Fake Audio"
# caption did not match the metadata.
caption_by_label = {'bona-fide': 'Real', 'spoof': 'Fake'}
sample_meta = in_the_wild_dataset.loc[
    in_the_wild_dataset['file'] == os.path.basename(fake_audio)
].iloc[0]
print(f"{caption_by_label[sample_meta['label']]} Audio (Speaker = {sample_meta['speaker']}):")
IPython.display.Audio(fake_audio)

Fake Audio (Speaker = Christopher Hitchens):


In [None]:
# To separate the fake audio files with masking boolean:
# flag every bona-fide row, then keep only the rows that are NOT flagged.
mask_real_audio = fake_audio_dataset['label'] == 'bona-fide'

fake_audio_dataset = fake_audio_dataset[~mask_real_audio]

# to_csv does not create missing parent folders, so make sure the output
# folder exists before writing (otherwise a fresh checkout fails here).
os.makedirs('data/release_in_the_wild/Cleaned Data', exist_ok=True)
fake_audio_dataset.to_csv('data/release_in_the_wild/Cleaned Data/2.Fake_Audio_dataset.csv', index=False)

In [None]:
# To separate the real audio files with masking boolean:
# flag every spoof row, then keep only the rows that are NOT flagged.
# NOTE: despite its name, mask_real_audio is True for the *spoof* rows here;
# the name is kept unchanged in case other cells refer to it.
mask_real_audio = real_audio_dataset['label'] == 'spoof'

real_audio_dataset = real_audio_dataset[~mask_real_audio]

# to_csv does not create missing parent folders, so make sure the output
# folder exists before writing (otherwise a fresh checkout fails here).
os.makedirs('data/release_in_the_wild/Cleaned Data', exist_ok=True)
real_audio_dataset.to_csv('data/release_in_the_wild/Cleaned Data/1.Real_Audio_dataset.csv', index=False)

In [None]:
# Reload the cleaned spoof-only metadata (every column as a string). It is used to
# organize the audio folder so that spoof and bona-fide files end up in their own folders.
fake = pd.read_csv('data/release_in_the_wild/Cleaned Data/2.Fake_Audio_dataset.csv', dtype=str)

# Leave the frame as the last expression so the notebook renders it as a rich
# table instead of the plain-text dump produced by print().
fake

            file                speaker  label
0          0.wav          Alec Guinness  spoof
1          1.wav          Alec Guinness  spoof
2          2.wav           Barack Obama  spoof
3          3.wav          Alec Guinness  spoof
4          6.wav           Barack Obama  spoof
...          ...                    ...    ...
11811  31767.wav   Christopher Hitchens  spoof
11812  31769.wav           Bill Clinton  spoof
11813  31771.wav      Winston Churchill  spoof
11814  31773.wav             Alan Watts  spoof
11815  31778.wav  Arnold Schwarzenegger  spoof

[11816 rows x 3 columns]


#### Function Details
1. Load the cleaned/separated metadata (the CSV file) and locate the audio folder.
2. Build the paths to the audio folder and to the new output folder, creating the output folder if it does not exist yet.
3. Loop over the files in the audio folder and compare each file name with the 'file' column of the cleaned/separated CSV.
4. If they match, look up the file's 'label' (either spoof or bona-fide) and 'speaker' in the CSV and copy the audio file into the new folder under the name `<file>-<label>-<speaker>.wav`, for example `2.wav-spoof-Barack Obama.wav`.
5. Otherwise, do nothing with that file and go on to the next audio file.

In [None]:
# chatGPT response
def organize_audio_files(audio_folder, csv_file, new_folder):
    """Copy the audio files listed in a metadata CSV into a new folder, renamed with label and speaker.

    Every file in ``audio_folder`` whose name appears in the CSV's ``file``
    column is copied (the original is left in place) to ``new_folder`` as
    ``<file>-<label>-<speaker>.wav``, e.g. ``2.wav-spoof-Barack Obama.wav``.
    Files that are not listed in the CSV are ignored.

    Parameters
    ----------
    audio_folder : str
        Folder containing the source audio files. Relative paths are resolved
        against the current working directory.
    csv_file : str
        CSV file with the columns ``file``, ``label`` and ``speaker``.
    new_folder : str
        Destination folder; created if it does not exist. Relative paths are
        resolved against the current working directory.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the CSV lacks one of the ``file``, ``label`` or ``speaker`` columns.
    """
    # Load the cleaned/separated data
    data = pd.read_csv(csv_file)

    # Make the paths to the audio folder and to the destination folder
    audio_path = os.path.join(os.getcwd(), audio_folder)
    new_folder_path = os.path.join(os.getcwd(), new_folder)

    # Create the destination folder if it doesn't exist
    os.makedirs(new_folder_path, exist_ok=True)

    # Index the metadata by file name once, so each audio file needs a single
    # dictionary lookup instead of several scans over the whole CSV.
    # setdefault keeps the first row when a file name is listed more than once.
    metadata_by_file = {}
    for file_name, label, speaker in zip(data['file'], data['label'], data['speaker']):
        metadata_by_file.setdefault(file_name, (label, speaker))

    # Loop through the audio files in the folder
    for filename in os.listdir(audio_path):
        metadata = metadata_by_file.get(filename)
        if metadata is None:
            # Not listed in the CSV: leave the file alone
            continue

        label, speaker = metadata

        # NOTE: filename still carries its own extension, so the copy is named
        # like '2.wav-spoof-Barack Obama.wav'. This naming is kept as-is so
        # anything that relies on the existing file names keeps working.
        new_filename = f"{filename}-{label}-{speaker}.wav"
        shutil.copy(os.path.join(audio_path, filename), os.path.join(new_folder_path, new_filename))


In [None]:
# For Fake: copy every clip listed in the spoof-only CSV into 'Cleaned Spoof-Audio/'
organize_audio_files('data/release_in_the_wild/Audio/', 'data/release_in_the_wild/Cleaned Data/2.Fake_Audio_dataset.csv', 'data/release_in_the_wild/Cleaned Spoof-Audio/')

In [None]:
# For Real: copy every clip listed in the bona-fide-only CSV into 'Cleaned Bona-Fide-Audio/'
organize_audio_files('data/release_in_the_wild/Audio/', 'data/release_in_the_wild/Cleaned Data/1.Real_Audio_dataset.csv', 'data/release_in_the_wild/Cleaned Bona-Fide-Audio/')