# Age Estimation task

## Imports

In [214]:
import numpy as np
import pandas as pd

import os

import scipy

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.lines import Line2D

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

import seaborn as sns
import librosa
import librosa.display
# %matplotlib widget

## Constants

In [152]:
# Locations of the tabular splits and of the matching audio folders.
# The audio folder strings keep their trailing slash because file names are
# appended to them with plain string concatenation.
FILE_DEVELOPMENT = "Dataset/development.csv"
FILE_EVALUATION = "Dataset/evaluation.csv"
AUDIOS_DEVELOPMENT = "Dataset/audios_development/"
AUDIOS_EVALUATION = "Dataset/audios_evaluation/"

# Number of decimals passed to np.round when a feature is binned in log10 scale.
# NOTE: the key order is load-bearing: other cells index these keys by position.
precisionLog = {
    'mean_pitch': 2,
    'max_pitch': 2,
    'min_pitch': 3,
    'jitter': 2,
    'shimmer': 2,
    'energy': 1,
    'zcr_mean': 2,
    'spectral_centroid_mean': 2,
    'tempo': 3,
    'hnr': 0,
}

# Number of decimals passed to np.round when a feature is binned in linear scale
# (negative values round to tens / hundreds).
precisionLinear = {
    'mean_pitch': -1,
    'max_pitch': -2,
    'min_pitch': 0,
    'jitter': 3,
    'shimmer': 3,
    'energy': 3,
    'zcr_mean': 2,
    'spectral_centroid_mean': -2,
    'tempo': 3,
    'hnr': 0,
}

## Data reading

In [188]:
# Tabular splits; the first CSV column becomes the row index.
dev_original_df = pd.read_csv(FILE_DEVELOPMENT, header=0, index_col=0)
eval_original_df = pd.read_csv(FILE_EVALUATION, header=0, index_col=0)

# os.listdir returns entries in arbitrary order; sorting keeps the listings
# (and anything sampled from them later) identical from run to run.
audio_dev = sorted(os.listdir(AUDIOS_DEVELOPMENT))
audio_eval = sorted(os.listdir(AUDIOS_EVALUATION))

## Tabular Data Exploration

In [None]:
# Missing-value totals for both splits.
display(f"Total number of nan in development: {dev_original_df.isna().sum().sum()}")
display(f"Total number of nan in evaluation: {eval_original_df.isna().sum().sum()}")

# Summary statistics of the untouched frames.
desc_dev_df, desc_eval_df = dev_original_df.describe(), eval_original_df.describe()

# The rate of the first development row is used for every recording.
# NOTE(review): this presumes the column is constant — confirm (e.g. its std is 0).
sampling_rate = dev_original_df['sampling_rate'].iloc[0]

# Side tables: the regression target and the audio file paths.
ages_df = dev_original_df[['age']]
path_dev_df = dev_original_df[['path']]
path_eval_df = eval_original_df[['path']]

# Working frames without the sampling-rate and path columns.
dev_df = dev_original_df.drop(columns=['sampling_rate', 'path'])
eval_df = eval_original_df.drop(columns=['sampling_rate', 'path'])

display(dev_df.head())
display(eval_df.head())
display(dev_df.describe())

In [None]:
# Frequency of each `num_words` value in the evaluation set.
# value_counts has to be *called*; without the parentheses the cell only
# shows the bound method instead of the counts.
temp = eval_df['num_words'].value_counts()

temp

In [None]:
# Number of development rows per ethnicity, most frequent first.
enticity_df = dev_df['ethnicity'].value_counts().sort_values(ascending=False)
median_et = enticity_df.median()

# How many ethnicities occur exactly once, then the head and tail of the ranking.
singleton_mask = enticity_df == 1
display(enticity_df[singleton_mask].shape)
display(enticity_df.head(20))
display(enticity_df.tail(10))

# Keep only the ethnicities whose count is strictly above the median count.
above_median = enticity_df[enticity_df > median_et]
etnie_chosen = sorted(set(above_median.index))
display(etnie_chosen)
display(len(etnie_chosen))


## Preprocess tabular data

### Encoding

#### Ethnicity

In [333]:
# display(dev_df)
def encode_ethnicity(X_df, ethnie):
    """One-hot encode the ``ethnicity`` column for a fixed list of categories.

    Parameters
    ----------
    X_df : pd.DataFrame
        Frame containing an ``ethnicity`` column. It is not modified.
    ethnie : iterable of str
        Ethnicities that get their own indicator column. Rows whose ethnicity
        is not listed receive 0 in every indicator column.

    Returns
    -------
    pd.DataFrame
        Copy of ``X_df`` without ``ethnicity``, followed by one float column
        (1.0 / 0.0) per entry of ``ethnie``, in the given order.

    Raises
    ------
    KeyError
        If ``X_df`` has no ``ethnicity`` column.
    """
    labels = X_df['ethnicity']

    # Build every indicator in one go instead of inserting columns one at a
    # time through boolean .loc assignments.
    indicators = pd.DataFrame(
        {etnia: (labels == etnia).astype(float) for etnia in ethnie},
        index=X_df.index,
    )

    return pd.concat([X_df.drop(columns=['ethnicity']), indicators], axis=1)


#### Gender

In [334]:
# Numeric code assigned to each gender label.
GenderMapper = {
    'male': 1,
    'female': -1
}

def encode_gender(X_df:pd.DataFrame, mapper):
    """Replace the textual ``gender`` column with the numeric codes of ``mapper``.

    The misspelled label ``'famale'`` is treated as ``'female'``: it receives
    the code ``mapper`` gives to ``'female'`` (or -1 when ``mapper`` has no
    such key) instead of a hard-coded -1 that ignores ``mapper``.

    Parameters
    ----------
    X_df : pd.DataFrame
        Frame containing a ``gender`` column. It is not modified.
    mapper : dict
        Maps gender labels to numeric codes.

    Returns
    -------
    pd.DataFrame
        Copy of ``X_df`` whose ``gender`` column is float.

    Raises
    ------
    ValueError
        If a label that is not covered by ``mapper`` cannot be cast to float.
    """
    codes = dict(mapper)
    # Known typo in the data; keep it in sync with whatever 'female' maps to.
    codes.setdefault('famale', codes.get('female', -1))

    encoded_df = X_df.copy()
    # Labels missing from `codes` are passed through so the float cast below
    # still reports them instead of silently turning them into NaN.
    encoded_df['gender'] = (
        encoded_df['gender']
        .map(lambda label: codes.get(label, label))
        .astype(float)
    )
    return encoded_df

#### Tempo

In [335]:
# display(encoded_gender_etnicity_df['tempo'])

def encode_tempo(X_df):
    """Return a copy of ``X_df`` whose ``tempo`` strings are parsed to floats.

    Each value has its leading/trailing ``[`` and then ``]`` characters
    stripped before being converted with ``float``.
    """
    def _parse(raw):
        without_open = raw.strip('[')
        return float(without_open.strip(']'))

    return X_df.copy().assign(tempo=X_df['tempo'].map(_parse))


#### Results

In [None]:
# Evaluation rows per retained ethnicity.
display(eval_df.loc[eval_df['ethnicity'].isin(etnie_chosen), 'ethnicity'].value_counts())

# Step 1: one indicator column per retained ethnicity.
step1_dev_df, step1_eval_df = (
    encode_ethnicity(frame, etnie_chosen) for frame in (dev_df, eval_df)
)
display(dev_df.head(5))

# Step 2: numeric gender codes.
step2_dev_df, step2_eval_df = (
    encode_gender(frame, GenderMapper) for frame in (step1_dev_df, step1_eval_df)
)
display(step2_dev_df.head(5))

# Step 3: tempo strings parsed to floats.
step3_dev_df, step3_eval_df = (
    encode_tempo(frame) for frame in (step2_dev_df, step2_eval_df)
)
display(step3_dev_df.head(5))

# Summary of the fully encoded development frame and its column list.
display(step3_dev_df.describe())
step3_dev_df.columns


### Behavior analysis and $\log_{10}$ scaling

#### Insight

In [None]:
def create_colors(values:pd.Series, label:str, use_continous:bool=False, 
                  cmap_continuous = cm.viridis, cmap_discrete = cm.rainbow):
    """Build the colour mapping used by the distribution scatter plots.

    Parameters
    ----------
    values : pd.Series
        Values to colour; their min and max define the normalisation range.
    label : str
        Accepted for interface compatibility; the legend entries are labelled
        with the keys of ``GenderMapper`` instead.
    use_continous : bool
        True selects ``cmap_continuous`` and returns a ScalarMappable (for a
        colour bar); False selects ``cmap_discrete`` and returns legend handles.

    Returns
    -------
    tuple
        ``(cmap, norm, mappable, handles)``; ``handles`` is None in continuous
        mode and ``mappable`` is None in discrete mode.
    """
    # Plain min/max normalisation, shared by both modes.
    norm = plt.Normalize(vmin=np.min(values), vmax=np.max(values))

    if use_continous:
        mappable = cm.ScalarMappable(norm=norm, cmap=cmap_continuous)
        return cmap_continuous, norm, mappable, None

    # One legend marker per gender, coloured like the matching points.
    handles = []
    for gender_name, gender_code in GenderMapper.items():
        handles.append(
            Line2D(
                [0], [0], marker='o', color='none', linestyle='None',
                markeredgewidth=0, markerfacecolor=cmap_discrete(norm(gender_code)),
                markersize=10, label=gender_name
            )
        )

    return cmap_discrete, norm, None, handles

def round_column(data: pd.DataFrame, col: str, isLog:bool = True):
    """Round one feature column so that nearby values fall into the same bin.

    With ``isLog`` set, and for every column except ``'hnr'``, the log10 of
    the column is rounded to ``precisionLog[col]`` decimals. Otherwise the raw
    column is rounded to ``precisionLinear[col]`` decimals.
    """
    # 'hnr' is always handled in linear scale, whatever isLog says.
    if isLog and col != 'hnr':
        return np.round(np.log10(data[col]), precisionLog[col])

    return np.round(data[col], precisionLinear[col])


def perform_aggregation(partition_df, target, gender, descrete, col=None):
    """Collapse one gender partition to a single row per distinct feature value.

    Parameters
    ----------
    partition_df : pd.DataFrame
        Rows of one gender, holding the (rounded) feature column and a
        ``frequency`` column.
    target : pd.DataFrame or None
        Frame whose first column is averaged per feature value; only read when
        ``descrete`` is False.
    gender : str
        Key of ``GenderMapper``; only read when ``descrete`` is True.
    descrete : bool
        True stores the gender code as ``target``; False stores the mean of
        ``target``'s first column over the rows sharing the feature value.
    col : str, optional
        Name of the feature column. When omitted it is inferred as the only
        column of ``partition_df`` other than ``gender`` and ``frequency``, so
        the function no longer depends on a module-level ``col`` variable.

    Returns
    -------
    pd.DataFrame
        Columns ``col``, ``frequency`` (first value of each group) and ``target``.

    Raises
    ------
    ValueError
        If ``col`` is omitted and cannot be inferred unambiguously.
    """
    if col is None:
        candidates = [name for name in partition_df.columns
                      if name not in ('gender', 'frequency')]
        if len(candidates) != 1:
            raise ValueError(
                f"cannot infer the feature column from {list(partition_df.columns)}; pass col explicitly"
            )
        col = candidates[0]

    grouped_temp = partition_df.groupby(col).agg({
        'frequency': 'first',
    })

    if not descrete:
        # One grouped mean instead of re-filtering the partition for every
        # distinct value; the result is aligned on the feature values.
        row_targets = target.loc[partition_df.index].iloc[:, 0]
        grouped_temp['target'] = row_targets.groupby(partition_df[col].to_numpy()).mean()
    else:
        grouped_temp['target'] = GenderMapper[gender]

    return grouped_temp.reset_index()
    
def plot_distribution(fig:plt.Figure, ax:plt.Axes, X_df:pd.DataFrame, col:str, target:pd.Series, isLog:bool=True, descrete:bool=False):    
    """Scatter the count of each (rounded) value of ``col`` on ``ax``.

    The column is binned with ``round_column``, split by gender code, and each
    distinct value becomes one point whose height is its count inside that
    gender.

    Parameters
    ----------
    fig, ax : matplotlib figure and axes to draw on.
    X_df : pd.DataFrame
        Frame with an encoded ``gender`` column and the feature ``col``.
    col : str
        Feature to plot.
    target : pd.DataFrame or None
        Forwarded to ``perform_aggregation``; only read when ``descrete`` is False.
    isLog : bool
        Forwarded to ``round_column``.
    descrete : bool
        True colours points by gender and adds a legend; False colours them by
        the aggregated target and adds a colour bar.
    """
    temp = X_df[['gender', col]].copy()
    temp[col] = round_column(temp, col, isLog=isLog)

    grouped_parts = []
    for gender in ('male', 'female'):
        gender_df = temp[temp['gender'] == GenderMapper[gender]].copy()
        gender_df['frequency'] = gender_df[col].map(gender_df[col].value_counts())
        grouped_parts.append(perform_aggregation(gender_df, target, gender, descrete))

    # ignore_index renumbers the rows directly; the previous manual offset
    # used male_grouped_df.index[-1], which fails on an empty male partition.
    combined_df = pd.concat(grouped_parts, ignore_index=True)

    cmap_, norm, mappable, handles = create_colors(combined_df.loc[:, 'target'], label='gender', use_continous=not descrete)

    ax.scatter(combined_df[col],
               combined_df['frequency'],
               c=cmap_(norm(combined_df['target'])),
               alpha=0.7)

    if mappable is not None:
        fig.colorbar(mappable=mappable, ax=ax)
    else:
        ax.legend(handles=handles)

def _plot_feature_overview(feature, isLog):
    """Show one figure with the gender-coloured and age-coloured count plots of ``feature``.

    Both panels receive an x label and a ``Count`` y label. The previous
    copy-pasted version labelled the wrong axes, so some panels had no y label.
    """
    scale_name = 'log10' if isLog else 'linear'
    x_label = f'log_10({feature.capitalize()})' if isLog else f'{feature.capitalize()}'

    fig, (ax_gender, ax_age) = plt.subplots(1, 2, figsize=(15, 5))
    fig.suptitle(f'Distribution of {feature.capitalize()} in {scale_name} scale')

    # Left: points coloured by gender. Right: points coloured by mean age.
    plot_distribution(fig, ax_gender, step3_dev_df, feature, None, isLog=isLog, descrete=True)
    ax_age.set_title("Points colored with average age of grouped points")
    plot_distribution(fig, ax_age, step3_dev_df, feature, ages_df, isLog=isLog, descrete=False)

    for axis in (ax_gender, ax_age):
        axis.set_xlabel(x_label)
        axis.set_ylabel('Count')
    plt.show()


# The loop variable deliberately stays the module-level name `col`, as in the
# original cell, in case a helper looks the current feature up globally.
for col in precisionLog:
    # 'hnr' is only shown in linear scale.
    if col != 'hnr':
        _plot_feature_overview(col, isLog=True)
    _plot_feature_overview(col, isLog=False)

#### In practice

In [158]:
# 1 marks the features, in precisionLog key order, that are replaced by their
# log10; the others stay in linear scale.
log_flags = [1, 0, 0, 1, 1, 1, 1, 1, 0, 0]
log_features = [name for name, flag in zip(precisionLog.keys(), log_flags) if flag == 1]

step3_log_dev_df = step3_dev_df.copy()
step3_log_eval_df = step3_eval_df.copy()

for feature in log_features:
    step3_log_dev_df.loc[:, feature] = np.log10(step3_dev_df.loc[:, feature])
    step3_log_eval_df.loc[:, feature] = np.log10(step3_eval_df.loc[:, feature])

### Standardization

In [110]:
# Standardise with statistics learned on the development split only. The
# evaluation split must be transformed with the *same* means and variances as
# the data the model is trained on; fitting a second scaler on it would shift
# its features relative to the training features.

# Development frame: every column is standardised.
dev_scaler = StandardScaler().fit(step3_log_dev_df)
step3_log_norm_dev_df = pd.DataFrame(
    dev_scaler.transform(step3_log_dev_df),
    index=step3_log_dev_df.index,
    columns=step3_log_dev_df.columns,
)

# Evaluation frame: a scaler fitted on the development values of exactly the
# columns the evaluation frame has.
# NOTE(review): assumes every evaluation column also exists in the development frame.
eval_scaler = StandardScaler().fit(step3_log_dev_df[step3_log_eval_df.columns])
step3_log_norm_eval_df = pd.DataFrame(
    eval_scaler.transform(step3_log_eval_df),
    index=step3_log_eval_df.index,
    columns=step3_log_eval_df.columns,
)

## Correlation

In [None]:
# Pairwise correlations of the standardised development frame, computed once.
dev_corr = step3_log_norm_dev_df.corr()

# Features ranked by their correlation with age: strongest positive first.
temp:pd.Series = dev_corr.loc['age', :].sort_values(ascending=False)
display(temp.head(10))
display(temp.tail(10))

# Heat map of the absolute correlations.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(dev_corr.abs(), ax=ax)
fig.tight_layout()


In [None]:
def plot_error_distripution(y_pred, y_val, precision):
    """Scatter how often each rounded prediction error occurs.

    Parameters
    ----------
    y_pred : np.ndarray
        Predicted values.
    y_val : pandas object
        True values; read through ``.values``.
    precision : int
        Number of decimals the errors (``y_pred - y_val``) are rounded to
        before counting.
    """
    residuals = np.round(y_pred.flatten() - y_val.values.flatten(), precision)

    error_counts = pd.Series(residuals, name='error').value_counts().reset_index()
    error_counts.columns = ['error', 'count']

    fig, ax = plt.subplots()
    ax.scatter(error_counts['error'], error_counts['count'])
    ax.set_xlabel('Error')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Prediction Errors')
    plt.show()

## First regressor

In [None]:
# Feature matrix: the standardised development frame without the target.
X_train_val_df = step3_log_norm_dev_df.drop(columns=['age'])

# 70 / 30 hold-out split; the target is the unscaled age.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val_df, ages_df, test_size=0.3, shuffle=True, random_state=341967
)

# Baseline random forest with default hyper-parameters.
forest = RandomForestRegressor(random_state=341967).fit(X_train.values, y_train.values.ravel())
y_pred = forest.predict(X_val.values)

display(root_mean_squared_error(y_val.values, y_pred))

plot_error_distripution(y_pred, y_val, 0)
# Value noted in the original cell (presumably the RMSE of an earlier run): 10.402057038409454

In [None]:
# Predict the *evaluation* split. The previous version scored the development
# features, so `y_eval_pred` did not describe the evaluation rows at all.
# Columns are selected by name so they arrive in the order used for training.
y_eval_pred = forest.predict(step3_log_norm_eval_df[X_train_val_df.columns].values)

# Sanity check: range of the predictions against the range of the known ages.
display(np.max(y_eval_pred), np.min(y_eval_pred))
display(ages_df['age'].max(), ages_df['age'].min())

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated RMSE of the default forest on the whole development
# set. The target is flattened to 1-D, as in the hold-out fit, so the
# estimator is not handed a column vector.
cv_scores = cross_val_score(
    RandomForestRegressor(n_jobs=-1, random_state=341967),
    X_train_val_df,
    ages_df.values.ravel(),
    cv=10,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

# The scorer returns negated RMSE values; report the positive mean.
abs(cv_scores.mean())

## Fine tuned

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

def grid(builder, configs, cv, X=None, y=None):
    """Run a grid search (negated-RMSE scoring) and return the fitted search.

    Parameters
    ----------
    builder : callable
        Called without arguments to create the estimator to tune.
    configs : dict
        Parameter grid handed to ``GridSearchCV``.
    cv : int or splitter
        Cross-validation setting handed to ``GridSearchCV``.
    X, y : array-like, optional
        Training data. When omitted, the module-level ``X_train`` /
        ``y_train`` are used, as before.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if X is None:
        X = X_train.values
    if y is None:
        y = y_train.values.reshape((-1,))

    gs = GridSearchCV(builder(), configs, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv)
    gs.fit(X, y)

    return gs

# Same hold-out split as the baseline (identical seed and test size).
X_train, X_val, y_train, y_val = train_test_split(X_train_val_df, ages_df, test_size=0.3, shuffle=True, random_state=341967)

# The builder seeds the forest; a bare RandomForestRegressor() would make the
# search and the reported score change from run to run.
forest = grid(
    lambda: RandomForestRegressor(random_state=341967),
    {'n_estimators': [300, 500]},
    5,
)

y_pred = forest.predict(X_val.values)

display(root_mean_squared_error(y_val, y_pred))
plot_error_distripution(y_pred, y_val, 0)

# NOTE(review): the original cell carried the value 10.402057038409454, identical
# to the one noted for the baseline — presumably a stale copy; re-record after a run.

In [None]:
# Score the evaluation features with the estimator currently bound to `forest`
# right before writing, so the file can never contain stale predictions made
# by an earlier model or on another split.
y_eval_pred = forest.predict(step3_log_norm_eval_df[X_train_val_df.columns].values)

# Ids are the row positions 0..n-1, as in the previous hand-written loop.
# NOTE(review): confirm that positional ids match the ids expected for the
# evaluation rows (the frame's own index may be the intended id).
submission_df = pd.DataFrame({
    "Id": range(len(y_eval_pred)),
    "Predicted": y_eval_pred,
})

# to_csv writes "Id,Predicted" rows without the stray space after the comma.
submission_df.to_csv("results.csv", index=False)

## Preprocess audio

### Data exploration

In [327]:
import warnings
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Seeded draw of up to three *distinct* recordings. The listing is sorted first
# so the same files are picked on every run, whatever order the OS returned.
audio_rng = np.random.default_rng(341967)
test_audios = audio_rng.choice(sorted(audio_dev), size=min(3, len(audio_dev)), replace=False)

#### Time domain

In [None]:
# Waveform of every sampled recording.
for audio_name in test_audios:
    y, sr = librosa.load(AUDIOS_DEVELOPMENT + audio_name, sr=sampling_rate)

    fig, ax = plt.subplots(figsize=(12, 3))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    ax.set_title(f"Audio {audio_name} as waveform")
    plt.show()

#### Frequency domain

In [None]:
import scipy.fft

# Magnitude spectrum of every sampled recording.
for audio_name in test_audios:
    y, sr = librosa.load(AUDIOS_DEVELOPMENT + audio_name, sr=sampling_rate)

    # The signal is real, so the one-sided transform carries all the
    # information. rfftfreq gives the exact bin frequencies (k * sr / N);
    # linspace(0, sr, N) includes the endpoint and is slightly off.
    y_freq = np.abs(scipy.fft.rfft(y))
    f = scipy.fft.rfftfreq(len(y), d=1 / sr)

    plt.figure(figsize=(12, 3))
    plt.title(f"Spectrum of audio {audio_name}")
    # The 0 Hz bin cannot be placed on a logarithmic axis, so it is skipped.
    plt.semilogx(f[1:], y_freq[1:])
    plt.xlabel("Frequency (Hz)")
    plt.show()

#### Spectrogram

In [None]:
# Log-frequency spectrogram (dB relative to the peak) of every sampled recording.
for audio_name in test_audios:
    y, sr = librosa.load(AUDIOS_DEVELOPMENT + audio_name, sr=sampling_rate)

    magnitude = np.abs(librosa.stft(y))
    x_stft = librosa.amplitude_to_db(magnitude, ref=np.max)

    fig, ax = plt.subplots(figsize=(12, 4))
    mesh = librosa.display.specshow(x_stft, sr=sr, x_axis="time", y_axis="log", ax=ax)
    ax.set_title(f"Spectrogram of audio {audio_name}")
    fig.colorbar(mesh, ax=ax, format="%.1f dB")
    plt.show()

#### Mel-spectrogram

In [None]:
# Mel-scaled spectrogram (dB relative to the peak) of every sampled recording.
for audio_name in test_audios:
    y, sr = librosa.load(AUDIOS_DEVELOPMENT + audio_name, sr=sampling_rate)

    mel_power = librosa.feature.melspectrogram(y=y, sr=sr)
    x_mel = librosa.power_to_db(mel_power, ref=np.max)

    fig, ax = plt.subplots(figsize=(12, 4))
    mesh = librosa.display.specshow(x_mel, sr=sr, x_axis="time", y_axis="mel", ax=ax)
    ax.set_title(f"Mel-spectrogram of audio {audio_name}")
    fig.colorbar(mesh, ax=ax, format="%.2f dB")
    plt.show()

#### Mel-frequency cepstral coefficients (MFCC)

In [None]:
# MFCC matrix of every sampled recording.
for audio_name in test_audios:
    y, sr = librosa.load(AUDIOS_DEVELOPMENT + audio_name, sr=sampling_rate)

    # 20 cepstral coefficients per frame.
    x_mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)

    fig, ax = plt.subplots(figsize=(12, 4))
    mesh = librosa.display.specshow(x_mfccs, sr=sr, x_axis="time", ax=ax)
    ax.set_title(f"MFCC of audio {audio_name}")
    fig.colorbar(mesh, ax=ax)
    plt.show()