# **PROGETTO DSL**

In [1]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the development dataset into a pandas DataFrame and drop the 'Id' column
data_dev = pd.read_csv('datasets/development.csv')
data_dev = data_dev.drop(columns=['Id'])

# Names of all remaining columns except the two categorical ones
labels = list(data_dev.columns.values)
labels.remove('gender')
labels.remove('ethnicity')

# 'tempo' is handled as a string column containing square brackets:
# remove them and cast to float.
# regex=False makes '[' and ']' plain literal characters regardless of the
# pandas version's default for `regex` (the default changed across versions).
data_dev['tempo'] = (
    data_dev['tempo']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .astype(float)
)

# Displaying the first 5 rows of the dataset
print(data_dev.head())

   sampling_rate   age  gender   ethnicity  mean_pitch  max_pitch  min_pitch  \
0          22050  24.0  female      arabic   1821.6906  3999.7170  145.43066   
1          22050  22.5  female   hungarian   1297.8187  3998.8590  145.37268   
2          22050  22.0  female  portuguese   1332.8524  3998.8025  145.42395   
3          22050  22.0  female     english   1430.3499  3998.4510  147.98083   
4          22050  22.0    male       dutch   1688.7234  3998.6113  145.44772   

     jitter   shimmer    energy  zcr_mean  spectral_centroid_mean       tempo  \
0  0.013795  0.082725  0.002254  0.210093             3112.257251  151.999081   
1  0.025349  0.096242  0.007819  0.078849             1688.016389  129.199219   
2  0.019067  0.119456  0.002974  0.105365             2576.901706  117.453835   
3  0.017004  0.102389  0.022371  0.173701             3269.751413  117.453835   
4  0.028027  0.124831  0.005369  0.107279             1930.897375  112.347147   

          hnr  num_words  num_ch

In [3]:
# One-hot encoding of the categorical attributes 'ethnicity' and 'gender'
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

# Occurrences of each ethnicity value; the dict length is the number of distinct values
counts = dict(Counter(data_dev.ethnicity))
print('Numero di etnie', len(counts))

# Encoder for "ethnicity": categories are passed explicitly in sorted order,
# and the output columns are named after that same list
all_categories_etn = sorted(set(data_dev['ethnicity']))
encoder_etn = OneHotEncoder(categories=[all_categories_etn], handle_unknown='ignore')
etn_encoded = encoder_etn.fit_transform(data_dev[['ethnicity']]).toarray()
etn_columns = ['ethnicity_' + cat for cat in all_categories_etn]
etn_encoded_df = pd.DataFrame(etn_encoded, columns=etn_columns)

# Encoder for "gender", built the same way
all_categories_gender = sorted(set(data_dev['gender']))
encoder_gender = OneHotEncoder(categories=[all_categories_gender], handle_unknown='ignore')
gender_encoded = encoder_gender.fit_transform(data_dev[['gender']]).toarray()
gender_columns = ['gender_' + cat for cat in all_categories_gender]
gender_encoded_df = pd.DataFrame(gender_encoded, columns=gender_columns)

# Put the dataset back together: drop the raw categorical columns and append
# the ethnicity indicators followed by the gender indicators
data_dev = pd.concat(
    [
        data_dev.reset_index(drop=True).drop(columns=['ethnicity', 'gender']),
        etn_encoded_df,
        gender_encoded_df,
    ],
    axis=1,
)

data_dev

Numero di etnie 165


Unnamed: 0,sampling_rate,age,mean_pitch,max_pitch,min_pitch,jitter,shimmer,energy,zcr_mean,spectral_centroid_mean,...,ethnicity_sa'a,ethnicity_sardinian,ethnicity_sarua,ethnicity_satawalese,ethnicity_tiv,ethnicity_ukwani,ethnicity_urhobo,ethnicity_yoruba,gender_female,gender_male
0,22050,24.0,1821.69060,3999.7170,145.43066,0.013795,0.082725,0.002254,0.210093,3112.257251,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,22050,22.5,1297.81870,3998.8590,145.37268,0.025349,0.096242,0.007819,0.078849,1688.016389,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,22050,22.0,1332.85240,3998.8025,145.42395,0.019067,0.119456,0.002974,0.105365,2576.901706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,22050,22.0,1430.34990,3998.4510,147.98083,0.017004,0.102389,0.022371,0.173701,3269.751413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,22050,22.0,1688.72340,3998.6113,145.44772,0.028027,0.124831,0.005369,0.107279,1930.897375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928,22050,24.0,1641.14930,3999.1616,145.39359,0.023647,0.115361,0.001879,0.111799,2188.853478,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2929,22050,15.0,1089.60050,3984.6550,145.58409,0.015317,0.126740,0.000339,0.070508,2712.362323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2930,22050,17.0,994.46484,3989.1785,148.97475,0.009677,0.103535,0.001464,0.058442,2248.698477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2931,22050,18.0,1600.00820,3999.7559,145.36101,0.019571,0.100946,0.004451,0.115139,1834.596924,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# some useful libraries for feature extraction
import os
import librosa
import numpy as np

In [None]:
# Function which reads an audio file and extracts some useful spectral features

def audio_feature_extraction(file_path, num_mfcc_coeff):
    """Return the per-coefficient mean of the MFCCs of one audio file.

    Parameters
    ----------
    file_path : path of the audio file, passed as-is to ``librosa.load``.
    num_mfcc_coeff : number of MFCC coefficients to compute (``n_mfcc``).

    Returns
    -------
    The MFCC matrix returned by ``librosa.feature.mfcc`` averaged along
    axis 1, i.e. one value per coefficient.
    """
    # Reading the audio file (librosa's default loading parameters are used)
    audio_time_series, sampling_rate = librosa.load(path=file_path)

    # Extracting the requested number of MFCC coefficients.
    # The mel spectrogram / dB conversion previously computed here was never
    # used, so it has been dropped to avoid the wasted work on every file.
    mfcc = librosa.feature.mfcc(y=audio_time_series, sr=sampling_rate, n_mfcc=num_mfcc_coeff)

    return mfcc.mean(axis=1)


num_coeff = 10

# For each audio file listed in data_dev['path'], extract the mean MFCC coefficients
mfcc_list = [
    audio_feature_extraction('datasets/' + file_path, num_coeff)
    for file_path in data_dev['path']
]

# Convert the MFCC list to a DataFrame and concatenate it to data_dev.
# pd.concat(axis=1) aligns rows by index label, so the new frame reuses
# data_dev's index: each MFCC row stays attached to the file it came from
# even if data_dev does not carry a default 0..n-1 index.
mfcc_df = pd.DataFrame(
    mfcc_list,
    columns=[f'mfcc_{i+1}' for i in range(num_coeff)],
    index=data_dev.index,
)
data_dev = pd.concat([data_dev, mfcc_df], axis=1)

# Show the final DataFrame
print(data_dev)


# **MODELLI**

In [4]:
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline

# **PREVISIONE**

In [None]:
# Loading the evaluation dataset
data_eval = pd.read_csv('datasets/evaluation.csv')

# Same 'tempo' clean-up applied to the development set: remove the square
# brackets and cast to float. regex=False keeps '[' and ']' literal characters
# regardless of the pandas version's default for `regex`.
data_eval['tempo'] = (
    data_eval['tempo']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .astype(float)
)

# Encode gender and ethnicity with the encoders fitted on the development set.
# Both encoders were created with handle_unknown='ignore', so transform() does
# not raise on categories that were absent from the development data.
# Encoding for "ethnicity"
etn_encoded = encoder_etn.transform(data_eval[['ethnicity']]).toarray()
etn_encoded_df = pd.DataFrame(etn_encoded, columns=['ethnicity_' + cat for cat in all_categories_etn])

# Encoding for "gender"
gender_encoded = encoder_gender.transform(data_eval[['gender']]).toarray()
gender_encoded_df = pd.DataFrame(gender_encoded, columns=['gender_' + cat for cat in all_categories_gender])

# Put the dataset back together
data_eval = pd.concat([data_eval.reset_index(drop=True).drop(columns=['ethnicity']), etn_encoded_df], axis=1)
data_eval = pd.concat([data_eval.reset_index(drop=True).drop(columns=['gender']), gender_encoded_df], axis=1)

# NOTE(review): `best_model` is not defined in the cells visible here; it must
# be fitted before this cell is run. Also confirm that the columns passed to
# predict() match the ones the model was trained on (e.g. any extra features
# added to the development frame) -- TODO verify.
data_eval_provv = data_eval.drop(columns=['Id', 'path'])
y_pred = best_model.predict(data_eval_provv)

# Writing the csv file.
# newline='' is required by the csv module; without it extra blank rows
# appear on platforms that translate '\n' (e.g. Windows).
with open('results.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Id', 'Predicted'])
    # One row per evaluation sample: its Id and the predicted value
    writer.writerows(zip(data_eval['Id'], y_pred))