## Evaluating regression techniques for speaker characterization
### Laura Fernández Gallardo

Multioutput regression: for each instance, the goal is to predict the 5 dimensions of perceived speaker characteristics.

Modifications with respect to regression with 1-dimensional output:

* targets: 5-dimensional scores derived from factor analysis on the 34-dimensional ratings of speaker characteristics in the [subjective analysis](https://github.com/laufergall/Subjective_Speaker_Characteristics).


In [3]:
import io
import requests
import time # for timestamps

import numpy as np
import pandas as pd

from reg_tuning import * # my helper functions

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Reproducibility: seed NumPy's global random generator once, up front.
# `seed` is kept as a notebook-level variable.
seed = 2302
np.random.seed(seed=seed)

In [11]:
# features and ratings from the regression task with 1d output
# Paths are written with forward slashes so the relative location resolves on
# Windows, Linux and macOS alike (a backslash is an ordinary filename character
# outside Windows, so r'.\dir\file.csv' would not be found there).

feats_ratings_train = pd.read_csv('./data_while_tuning/feats_ratings_train.csv')
feats_ratings_test = pd.read_csv('./data_while_tuning/feats_ratings_test.csv')

# names of the 34 rated speaker characteristics; each one is a column of the
# loaded frames (they are dropped by name below)
sc_names = [
    'non_likable', 'secure', 'attractive', 'unsympathetic', 'indecisive',
    'unobtrusive', 'distant', 'bored', 'emotional', 'not_irritated',
    'active', 'pleasant', 'characterless', 'sociable', 'relaxed',
    'affectionate', 'dominant', 'unaffected', 'hearty', 'old',
    'personal', 'calm', 'incompetent', 'ugly', 'friendly',
    'masculine', 'submissive', 'indifferent', 'interesting', 'cynical',
    'artificial', 'intelligent', 'childish', 'modest',
]

# every column that is neither speaker metadata nor a rating is kept as a feature name
dropcolumns = ['name','spkID','speaker_gender'] + sc_names
feats_names = list(feats_ratings_train.drop(dropcolumns, axis=1))

In [13]:
# speaker scores
# Download the male and female factor-score tables, give the five factor columns
# readable names, stack both tables and derive `gender` / `spkID` from the
# stimulus file name.

path = "https://raw.githubusercontent.com/laufergall/Subjective_Speaker_Characteristics/master/data/generated_data/"

url = path + "factorscores_malespk.csv"
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page as CSV
s = response.content
scores_m = pd.read_csv(io.StringIO(s.decode('utf-8')))

url = path + "factorscores_femalespk.csv"
response = requests.get(url, timeout=30)
response.raise_for_status()
s = response.content
scores_f = pd.read_csv(io.StringIO(s.decode('utf-8')))

# rename dimensions
# NOTE: 'compliance' and 'confidence' are listed in a different order for the
# male and the female table.
scores_m.columns = ['sample_heard', 'warmth', 'attractiveness', 'confidence', 'compliance', 'maturity']
scores_f.columns = ['sample_heard', 'warmth', 'attractiveness', 'compliance', 'confidence', 'maturity']

# join male and female scores
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` aligns columns by
# name in the same way. `sort=True` keeps the alphabetical column order shown in
# the recorded output of this cell.
scores = pd.concat([scores_m, scores_f], sort=True)

# file names look like 'm004_linden_stimulus.wav': 1st character -> gender,
# next three characters -> numeric speaker ID
scores['gender'] = scores['sample_heard'].str.slice(0,1)
scores['spkID'] = scores['sample_heard'].str.slice(1,4).astype('int')

scores.head()

Unnamed: 0,attractiveness,compliance,confidence,maturity,sample_heard,warmth,gender,spkID
0,-0.579301,-0.921918,0.608503,0.27658,m004_linden_stimulus.wav,-0.284638,m,4
1,0.442865,-0.950212,0.588889,0.630295,m005_nicosia_stimulus.wav,-0.494019,m,5
2,-0.507534,0.139302,-0.151077,-0.669449,m006_rabat_stimulus.wav,1.533478,m,6
3,1.180748,-0.108982,0.962166,1.026359,m007_klaksvik_stimulus.wav,0.478983,m,7
4,1.070247,-0.284278,-0.875589,-1.291311,m016_beirut_stimulus.wav,1.861551,m,16


In [18]:
# Attach the factor scores to the feature/rating tables.
# No `on=` is given, so pandas joins (inner join) on every column name that the
# two frames have in common.
# The trailing shapes are the sizes noted by the original author.

feats_ratings_scores_train = pd.merge(feats_ratings_train, scores) # (2700, 132)
feats_ratings_scores_test = pd.merge(feats_ratings_test, scores) # (891, 132)

In [None]:
# NOTE(review): this whole cell is a disabled (commented-out) draft of a
# k-nearest-neighbours regression baseline and its evaluation.
# X, y, Xt and yt are not defined in the cells visible above, so it cannot run
# as-is; `mape` is not defined here either - presumably it comes from
# reg_tuning (TODO confirm).

# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.metrics import mean_squared_error, r2_score

# neigh = KNeighborsRegressor(n_neighbors=11, weights = 'distance')
# neigh.fit(X, y) 

 
# y_pred = neigh.predict(Xt) 


# # root mean squared error
# myrmse = np.sqrt(mean_squared_error(yt, y_pred))
# print(myrmse)

# # median absolute percentage error 
# mymape = mape(yt, y_pred)
# print(mymape)

# # coefficient of determination R^2
# R2 = r2_score(yt, y_pred)
# print(R2)

# score = neigh.score(Xt, yt)
# print(score) # same as R2