# Feature Extraction

In [None]:
# For Google Colab imports
!pip install librosa

In [4]:
import pandas as pd # To create/edit/manipulate a data frame
import numpy as np # To perform a wide variety of mathematical operations on arrays
from glob import glob # a function that's used to search for files that match a specific file pattern or name
import csv # To converts into a readable csv file

# For feature extraction of audio files
import librosa
import librosa.display
from librosa import feature

In [5]:
# Make the Google Drive contents reachable from this Colab runtime;
# the audio folders and the CSV outputs used below all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Audio Data Feature Extraction
> 1. **```Zero Crossing Rate:``` The rate of sign-changes of the signal during the duration of a particular frame.**
> 2. **```Root Mean Square:```  Metering tool that measures the average loudness of an audio track within a window of roughly 300 milliseconds.**
> 3. ```Mel Frequency Cepstral Coefficients:``` Form a cepstral representation where the frequency bands are not linear but distributed according to the mel-scale.
> 4. ```Chromagram:``` Represents the 12 different pitch classes of an audio file in one place, so that we can understand how the pitches in the audio file are classified.
> 5. ```Melspectrogram:``` Scale of pitches that can be felt by the listener to be equal in distance from one another.
> 6. ```Spectral Centroid:``` The center of gravity of the spectrum.
> 7. ```Spectral Bandwidth:``` The difference between the upper and lower frequencies in a continuous band of frequencies.
> 8. ```Spectral Rolloff:``` The frequency below which a set percentage (commonly 85–90%; librosa's default is 85%) of the magnitude distribution of the spectrum is concentrated.
> 9. ```Spectral Entropy:``` Entropy of the normalized spectral energies for a set of sub-frames.
> 10. ```Spectral Flux:``` The squared difference between the normalized magnitudes of the spectra of the two successive frames.

**Our Selected Feature Extractions**
1. melspectrogram
2. zero_crossing_rate
3. root_mean_square
4. spectral_centroid
5. spectral_bandwidth
6. chroma_stft
7. chroma_cqt
8. chroma_cens
9. spectral_rolloff

In [9]:
# Feature extractors, kept in separate lists because they are not all
# invoked with the same arguments.

# Extractors invoked with both the signal and its sampling rate.
fn_list_i = [
    feature.melspectrogram,
    feature.spectral_centroid,
    feature.spectral_bandwidth,
    feature.chroma_stft,
    feature.chroma_cqt,
    feature.chroma_cens,
    feature.spectral_rolloff,
]

# Extractor invoked with the signal alone.
fn_list_ii = [feature.zero_crossing_rate]

# RMS energy extractor, kept in a list of its own.
fn_list_iii = [feature.rms]

# Extracting all the features and putting them into a variable
def get_feature_vector(y, sr):
    """Summarise one audio signal as a flat list of per-feature means.

    Every extractor's output is collapsed to a single scalar with ``np.mean``.

    Parameters
    ----------
    y : audio time series (e.g. as returned by ``librosa.load``).
    sr : sampling rate of ``y``; only passed to the extractors in ``fn_list_i``.

    Returns
    -------
    list
        One mean value per extractor, ordered as ``fn_list_i``, then
        ``fn_list_ii``, then ``fn_list_iii``.
    """
    # Extractors that take the signal and the sampling rate as keywords.
    feat_vect_i = [np.mean(funct(y=y, sr=sr)) for funct in fn_list_i]
    # Extractors that take the signal positionally.
    feat_vect_ii = [np.mean(funct(y)) for funct in fn_list_ii]
    # Must iterate fn_list_iii: looping over fn_list_ii here would repeat the
    # previous group's values and never compute the third group at all.
    feat_vect_iii = [np.mean(funct(y=y)) for funct in fn_list_iii]
    return feat_vect_i + feat_vect_ii + feat_vect_iii


**Result Extraction**

In [10]:
def _load_folder_features(folder, pattern='*.wav'):
    """Extract one feature vector per audio file found directly in ``folder``.

    Parameters
    ----------
    folder : directory path ending with a path separator; ``pattern`` is
        appended to it to build the glob expression.
    pattern : glob pattern selecting the audio files.

    Returns
    -------
    tuple
        ``(audio_files, features)`` where ``features[i]`` is the result of
        ``get_feature_vector`` for ``audio_files[i]``.
    """
    # glob gives no ordering guarantee; sorting keeps the row order of the
    # resulting feature table identical from one run to the next.
    audio_files = sorted(glob(folder + pattern))
    features = []
    for path in audio_files:
        # sr=None asks librosa to keep the file's own sampling rate.
        y, sr = librosa.load(path, sr=None)
        features.append(get_feature_vector(y, sr))
    return audio_files, features


# Reading all real audio and building the list of their feature vectors
realfolder = '/content/drive/MyDrive/data/Cleanrealtest/'
real_audio_files, real_audios_feat = _load_folder_features(realfolder)

# Reading all fake audio and building the list of their feature vectors
fakefolder = '/content/drive/MyDrive/data/Cleanfaketest/'
fake_audio_files, fake_audios_feat = _load_folder_features(fakefolder)


In [16]:
# Column names for the exported CSV files. There must be exactly one name per
# value in a feature vector, in the order the vector is assembled: the seven
# extractors of fn_list_i, then those of fn_list_ii, then those of fn_list_iii.
# A shorter header would leave the data columns shifted under the wrong names.
header = [
    'melspectrogram',
    'spectral_centroid',
    'spectral_bandwidth',
    'chroma_stft',
    'chroma_cqt',
    'chroma_cens',
    'spectral_rolloff',
    'zero_crossing_rate',
    'rms',
]


def _write_features_csv(path, rows, columns=header):
    """Write ``columns`` as the header row, then every row of ``rows``, to ``path``.

    Any existing file at ``path`` is overwritten.
    """
    # newline='' is what the csv module asks for, so that it controls the line
    # endings itself and no blank lines appear between rows on some platforms.
    with open(path, 'w', newline='') as f:
        csv_writer = csv.writer(f, delimiter=',')
        csv_writer.writerow(columns)
        csv_writer.writerows(rows)


# Moving all real audio features into a csv file for modeling
real_output = '/content/drive/MyDrive/data/realfeatures.csv'
_write_features_csv(real_output, real_audios_feat)

# Moving all fake audio features into a csv file for modeling
fake_output = '/content/drive/MyDrive/data/fakefeatures.csv'
_write_features_csv(fake_output, fake_audios_feat)