# Accent Localizer

See README.md for the problem description.

### Imports

In [14]:
import os
from datetime import datetime

import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import sklearn

### Sample some data

In [15]:
# df = pd.read_csv(os.path.join('__dataset', 'validated_regions.tsv'), sep='\t')
# df.head(10)

### Get the spectrogram of an audio file

In [16]:
def get_spectrogram(path, sampling_rate = 48000, display = True):
    """Compute the dB-scaled magnitude spectrogram of an audio file.

    Args:
        path: Location of the audio file handed to ``librosa.load``.
        sampling_rate: Value passed as ``sr`` when loading the audio and
            when labelling the plot axes.
        display: When true, also draw the spectrogram on a new matplotlib
            figure (log-frequency axis, with a colour bar).

    Returns:
        The array produced by ``librosa.amplitude_to_db`` for the magnitude
        of the short-time Fourier transform of the loaded signal.
    """
    # Decode the file into a floating point time series; the returned
    # sampling rate is ignored because it was requested explicitly.
    signal, _ = librosa.load(path, sr=sampling_rate)

    # Magnitude of the STFT, then amplitude -> decibels.
    magnitude = abs(librosa.stft(signal))
    spectrogram = librosa.amplitude_to_db(magnitude)

    if not display:
        return spectrogram

    plt.figure(figsize=(9, 3))
    librosa.display.specshow(spectrogram, sr=sampling_rate, x_axis='time', y_axis='log')
    plt.colorbar()
    return spectrogram


# _ = get_spectrogram(os.path.join('__dataset', 'clips', df.iloc[0]['path']))

### Extract features (spectral statistics and MFCCs)

In [17]:
def extract_feature(path, sampling_rate = 48000):
    """Summarise an audio clip as a flat vector of per-clip mean features.

    The vector holds, in order: the mean spectral centroid, the mean
    spectral bandwidth, the mean spectral rolloff, and then the mean of
    each row returned by ``librosa.feature.mfcc`` (called with librosa's
    default number of coefficients).

    Args:
        path: Location of the audio file handed to ``librosa.load``.
        sampling_rate: Value passed as ``sr`` to the loader and to every
            feature extractor.

    Returns:
        A one-dimensional ``numpy`` array of floats.
    """
    signal, _ = librosa.load(path, sr=sampling_rate)

    # Whole-clip averages of the three spectral descriptors; the tuple
    # order fixes their position in the output vector.
    spectral_extractors = (
        librosa.feature.spectral_centroid,
        librosa.feature.spectral_bandwidth,
        librosa.feature.spectral_rolloff,
    )
    values = [
        np.mean(extractor(y=signal, sr=sampling_rate))
        for extractor in spectral_extractors
    ]

    # One average per MFCC row, appended after the spectral descriptors.
    coefficients = librosa.feature.mfcc(y=signal, sr=sampling_rate)
    values.extend(np.mean(coefficient_row) for coefficient_row in coefficients)

    return np.asarray(values, dtype=float)

# features = extract_feature(os.path.join('__dataset', 'clips', df.iloc[0]['path']))
# print(features)
# print(features.shape)

### Create a new features file

In [18]:
# Create a new file named feature_regions.tsv

def create_header():
    """Return the column names of the features file as a list of strings.

    The list starts with the two identifying columns (``path``, ``region``),
    continues with the three spectral descriptors, and ends with
    ``mfcc1`` through ``mfcc20``.
    """
    fixed_columns = ['path', 'region', 'spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff']
    mfcc_columns = [f'mfcc{number}' for number in range(1, 21)]
    return fixed_columns + mfcc_columns

# On the first run the features file does not exist yet: create it with
# nothing but the tab-separated header row. An existing file is left alone.
_feature_file = os.path.join('__dataset', 'feature_regions.tsv')
if not os.path.exists(_feature_file):
    header = create_header()
    with open(_feature_file, 'w') as f:
        # Writes the column names joined by tabs, followed by a newline.
        print(*header, sep='\t', file=f)

### Add the new features to the dataset

In [30]:
# Read the validated_regions.tsv file (the clips to process)
df_val = pd.read_csv(os.path.join('__dataset', 'validated_regions.tsv'), sep='\t')

# Read the feature_regions.tsv file (features saved by earlier runs)
feature_file = os.path.join('__dataset', 'feature_regions.tsv')
df_feat = pd.read_csv(feature_file, sep='\t')

# Resume where the previous run stopped: find the last saved row in
# validated_regions.tsv (same path and region) and start from the row after it.
# If no row matches, processing starts from the beginning.
start = 0
if len(df_feat) > 0:
    last_row = df_feat.iloc[-1]
    is_last = (df_val['path'] == last_row['path']) & (df_val['region'] == last_row['region'])
    hits = np.flatnonzero(is_last.to_numpy())
    if hits.size:
        start = int(hits[0]) + 1

# Extract features for each remaining row in the validated_regions.tsv file.
# Rows are collected in a plain list and appended to the frame once at the
# end; concatenating inside the loop copies the whole frame on every clip.
now = datetime.now()
new_rows = []
try:
    for _, row in df_val.iloc[start:].iterrows():
        clip_path = os.path.join('__dataset', 'clips', row['path'])
        new_rows.append([row['path'], row['region'], *extract_feature(clip_path)])
except KeyboardInterrupt:
    # Stopping the cell by hand is the expected way to pause a long run.
    print('Interrupted; saving the features extracted so far')
finally:
    # Save on every exit path: normal completion, manual interrupt, or an
    # unexpected error (which still propagates after the save).
    if new_rows:
        df_new = pd.DataFrame(new_rows, columns=df_feat.columns)
        # Skip the concat when nothing was saved before, so an all-empty
        # frame does not influence the resulting column dtypes.
        df_feat = df_new if df_feat.empty else pd.concat([df_feat, df_new], ignore_index=True)
    if not df_feat.empty:
        df_feat.to_csv(feature_file, sep='\t', index=False)
    # Reports the number of clips handled in this run; the elapsed time is
    # a timedelta and prints as H:MM:SS.
    print(f'Processed {len(new_rows)} rows in {datetime.now() - now}')



  df_val = pd.read_csv(os.path.join('__dataset', 'validated_regions.tsv'), sep='\t')


Processed 42950 rows in 0:52:27.280885 seconds

