# Age Estimation task

## Imports

In [311]:
import numpy as np
import pandas as pd

import seaborn as sns
import scipy
import matplotlib.pyplot as plt

# %matplotlib widget

## Constants

In [312]:
# CSV tables for the development and evaluation splits.
FILE_DEVELOPMENT, FILE_EVALUATION = (
    "Dataset/development.csv",
    "Dataset/evaluation.csv",
)
# Audio folders for the same two splits (the trailing slash is part of the path).
AUDIOS_DEVELOPMENT, AUDIOS_EVALUATION = (
    "Dataset/audios_development/",
    "Dataset/audios_evaluation/",
)

## File reading

In [313]:
# Both tables are read the same way: first row is the header, first column is the index.
dev_original_df, eval_original_df = (
    pd.read_csv(csv_path, header=0, index_col=0)
    for csv_path in (FILE_DEVELOPMENT, FILE_EVALUATION)
)

## Data Exploration

In [None]:
# display(dev_original_df.head())

# Report how many missing cells each table contains.
display(f"Total number of nan in development: {dev_original_df.isna().sum().sum()}")
display(f"Total number of nan in evaluation: {eval_original_df.isna().sum().sum()}")

# Summary statistics of both tables, kept for ad-hoc inspection.
desc_dev_df, desc_eval_df = dev_original_df.describe(), eval_original_df.describe()

# Remember the sampling rate of the first development row before the column is removed.
sampling_rate = dev_original_df['sampling_rate'].iloc[0]

# Set the target and the file paths aside as one-column frames.
ages_df = dev_original_df[['age']]
path_dev_df = dev_original_df[['path']]
path_eval_df = eval_original_df[['path']]

# Working tables: everything except the sampling rate and the path.
dev_df = dev_original_df.drop(columns=['sampling_rate', 'path'])
eval_df = eval_original_df.drop(columns=['sampling_rate', 'path'])

display(dev_df.head())
display(eval_df.head())

# display(path_dev_df)
# display(path_eval_df)

In [None]:
# Number of development rows per ethnicity (value_counts sorts by count, descending).
enticity_df = dev_df['ethnicity'].value_counts()

# Mean number of rows per ethnicity; used below as the frequency cut-off.
avg_et = enticity_df.mean()

# How many ethnicities occur exactly once.
display(enticity_df[enticity_df == 1].shape)
# enticity_df[enticity_df > avg_et].plot()

# Keep the ethnicities that occur more often than the mean.
# The value_counts index is already unique, so it is used directly: going
# through set() would make the order (and therefore the order of the encoded
# columns) depend on string hashing, which can change between kernel runs.
etnie_chosen = enticity_df[enticity_df > avg_et].index.tolist()
display(etnie_chosen)


## Correlation

In [None]:
# Absolute pairwise correlation between the numeric columns, target included.
# The matrix is built from `dev_original_df` (minus `sampling_rate`, which the
# exploration cell also removes) rather than from `dev_df`, so the plot is the
# same on every run of this cell, including after `age` has been dropped below.
sns.heatmap(
    np.abs(dev_original_df.drop(columns='sampling_rate').corr(numeric_only=True))
).set_title("Absolute correlation between numeric columns")

# Remove the target from the feature table. errors='ignore' keeps the cell
# re-runnable: a second execution finds `age` already gone instead of raising.
dev_df = dev_df.drop(columns='age', errors='ignore')

## Encoding

### Ethnicity

In [345]:
# display(dev_df)
def encode_ethnicity(X_df, ethnie):
    """One-hot encode the selected values of the ``ethnicity`` column.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Table containing an ``ethnicity`` column. It is not modified.
    ethnie : iterable
        Ethnicity values that each get their own indicator column; the
        columns are appended in this order and named after the values.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``ethnicity`` removed and one integer 0/1 column per
        entry of ``ethnie`` appended. Rows whose ethnicity is not listed have
        0 in every indicator column.

    Raises
    ------
    KeyError
        If ``X_df`` has no ``ethnicity`` column.
    """
    ethnicity = X_df['ethnicity']
    encoded_df = X_df.drop(columns=['ethnicity'])

    for etnia in ethnie:
        # Assign positionally (plain array) so no index alignment is involved.
        encoded_df[etnia] = (ethnicity == etnia).to_numpy().astype(int)

    return encoded_df


# TODO: verify whether the information carried by ethnicity is negligible

### Gender

In [None]:
# Numeric codes for the gender labels (male first, then female).
mapper = dict(male=1, female=-1)

# display(step1_dev_df)

def encode_gender(X_df, mapper):
    """Replace the labels of the ``gender`` column with the codes in ``mapper``.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Table containing a ``gender`` column. It is not modified.
    mapper : dict
        Maps gender labels to their codes.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X_df`` whose ``gender`` values are replaced by their codes.
        Values that have no entry in the lookup are left unchanged.

    Notes
    -----
    The misspelled label ``'famale'`` is encoded as well. Unless ``mapper``
    lists it explicitly, it receives the code of ``'female'``, or -1 when
    ``mapper`` has no ``'female'`` entry.
    """
    lookup = dict(mapper)
    # Keep the misspelling consistent with whatever code 'female' is given.
    lookup.setdefault('famale', mapper.get('female', -1))

    encoded_df = X_df.copy()
    # One pass over the column: each label is translated exactly once, and the
    # result gets a numeric dtype when every label was found in the lookup.
    encoded_df['gender'] = encoded_df['gender'].map(
        lambda label: lookup.get(label, label)
    )

    return encoded_df


### Tempo

In [None]:
# display(encoded_gender_etnicity_df['tempo'])

def encode_tempo(X_df):
    """Convert the ``tempo`` column from bracketed text such as ``'[120.5]'`` to floats.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Table containing a ``tempo`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X_df`` with ``tempo`` as floats.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a float once the brackets are removed.
    """
    encoded_df = X_df.copy()
    encoded_df['tempo'] = (
        encoded_df['tempo']
        .astype(str)        # also accepts values that are already numeric
        .str.strip('[] ')   # drop the surrounding brackets and stray blanks
        .astype(float)
    )
    return encoded_df


In [None]:
# display(eval_df.loc[eval_df['ethnicity'].isin(etnie_chosen), 'ethnicity'].value_counts())

# Raw inputs of the pipeline (first rows only).
display(dev_df.head())
display(eval_df.head())

# Step 1: indicator columns for the frequent ethnicities.
step1_dev_df = encode_ethnicity(dev_df, etnie_chosen)
step1_eval_df = encode_ethnicity(eval_df, etnie_chosen)

# Step 2: numeric gender codes.
step2_dev_df = encode_gender(step1_dev_df, mapper)
step2_eval_df = encode_gender(step1_eval_df, mapper)

display(step2_dev_df.head())
display(step2_eval_df.head())

# Step 3: tempo as a float.
step3_dev_df = encode_tempo(step2_dev_df)
step3_eval_df = encode_tempo(step2_eval_df)

# The regressor is fitted on the development columns and then applied to the
# evaluation ones, so both tables must expose the same columns in the same order.
assert list(step3_dev_df.columns) == list(step3_eval_df.columns), (
    "development and evaluation features differ after encoding"
)

display(step3_dev_df.head())
display(step3_eval_df.head())

## First regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the development rows for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    step3_dev_df, ages_df, test_size=0.2, shuffle=True, random_state=341967
)

# Seed the forest too, so the score below is the same after a kernel restart.
forest = RandomForestRegressor(random_state=341967)

# The target comes from a one-column DataFrame; flatten it to the 1-D shape
# that fit expects for a single target.
forest = forest.fit(X_train, y_train.values.ravel())
y_pred = forest.predict(X_val)

# Validation RMSE.
display(root_mean_squared_error(y_val.values, y_pred))

# Value noted from a previous run: 10.402057038409454

In [None]:
from collections import Counter

# Run the fitted forest on the encoded evaluation features.
y_eval_pred = forest.predict(step3_eval_df)

# Extremes of the predictions, followed by the extremes of the known ages.
display(y_eval_pred.max(), y_eval_pred.min())
display(np.max(ages_df), np.min(ages_df))

In [None]:
# Echo the evaluation predictions as the cell output.
y_eval_pred

In [368]:
# Write the predictions to results.csv: a header line, then one row per prediction.
with open("results.csv", "w") as fout:
    fout.write("Id,Predicted\n")

    # Rows use a bare comma, matching the header's delimiter. The generator
    # keeps its loop variables local, so the builtin `id` is not shadowed for
    # the rest of the notebook.
    # NOTE(review): ids are the 0-based positions of the predictions; confirm
    # this is the numbering the consumer of the file expects.
    fout.writelines(
        f"{row_id},{prediction}\n"
        for row_id, prediction in enumerate(y_eval_pred)
    )

In [None]:
from sklearn.model_selection import cross_val_score

# Mean RMSE over 10 cross-validation folds. The scorer returns negated RMSE
# values, hence the abs(). The forest is seeded so the figure is reproducible,
# and the one-column target frame is flattened to the 1-D shape fit expects.
abs(
    cross_val_score(
        RandomForestRegressor(n_jobs=-1, random_state=341967),
        step3_dev_df,
        ages_df.values.ravel(),
        cv=10,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    ).mean()
)