In [34]:
import boto3
from sagemaker import get_execution_role
import pandas as pd
import numpy as np
import os
role = get_execution_role()

In [47]:
def get_data(file_name):
    s3 = boto3.resource('s3')
    s3.Bucket('oosv-sagemaker-bucket1').download_file('data/' + file_name, file_name)
    df = pd.read_csv(file_name)
    os.remove(file_name)
    return df.as_matrix()

In [51]:
features_es = get_data('spanish_data.csv')
features_en = get_data('english_data.csv')

sample_length = 75 #75 utterances * 10ms = 750ms samples
print(np.shape(features_es))
print(np.shape(features_en))

#the first line was saved as a header, so we get rid of the first sample
features_es = features_es[sample_length - 1:]
features_en = features_en[sample_length - 1:]

print(np.shape(features_es))
print(np.shape(features_en))
print(features_es[0])


(1760325, 39)
(4939049, 39)
(1760325, 39)
(4938975, 39)
[ 1.07925819e+01 -3.72304979e+00  1.71936253e+01  1.73390476e+01
  4.44450394e+00  1.16848528e+01  2.32587857e+00  9.84613189e+00
  1.34391571e+01 -6.41916462e+00  1.15366788e+01  5.04457432e+00
 -6.38155963e+00 -3.32922300e-02  1.02552683e+00 -1.58195645e+00
 -7.12041933e-02  2.19532491e+00 -3.63804157e+00  1.00460387e+00
  1.91507799e+00 -4.59325777e+00  4.45664821e-01  3.10244348e+00
 -1.43183091e+00 -1.44078525e+00 -3.11959309e-03  3.06593247e-01
  1.84430276e-01 -6.60064651e-01  5.42640886e-01 -4.99881947e-01
  9.48443647e-01 -1.86166601e-01 -9.65795141e-01  1.66644619e+00
 -7.89592904e-01 -2.85218212e-01  6.67445509e-01]


In [52]:
from extractor import extractor
import imp
from models import gmm
imp.reload(gmm)
single_model = gmm.GMM()

In [53]:
features_es = features_es.reshape((int(np.shape(features_es)[0] / sample_length), sample_length, -1))
features_en = features_en.reshape((int(np.shape(features_en)[0] / sample_length), sample_length, -1))
print(int(np.shape(features_es)[0]/sample_length))
print(np.shape(features_es))
print(np.shape(features_en))

312
(23471, 75, 39)
(65853, 75, 39)


In [76]:
print(f"total samples_en:\t{len(features_en)}")
print(f"total hours english:\t{len(features_en) * 0.75 / 3600}")
print(f"total samples_es:\t{len(features_es)}")
print(f"total hours spanish:\t{len(features_es) * 0.75 / 3600}")
print(f"total hours combined:\t{(len(features_es) + len(features_en)) * 0.75/3600}")


total samples_en:	65853
total hours english:	13.719375
total samples_es:	23471
total hours spanish:	4.8897916666666665
total hours combined:	18.609166666666667


In [57]:
train_en = features_en[:int(len(features_en) * 3 / 4)]
train_es = features_es[:int(len(features_es) * 3 / 4)]
test_en = features_en[int(len(features_en) * 3 / 4) + 1:]
test_es = features_es[int(len(features_es) * 3 / 4) + 1:]


In [58]:
print(f"total training samples_en:\t{len(train_en)}")
print(f"total training hours english:\t{len(train_en) * 0.75 / 3600}")
print(f"total training samples_es:\t{len(train_es)}")
print(f"total training hours spanish:\t{len(train_es) * 0.75 / 3600}")
print(f"total training hours combined:\t{(len(train_es) + len(train_en)) * 0.75 /3600}")


total training samples_en:	49389
total training hours english:	10.289375
total training samples_es:	17603
total training hours spanish:	3.667291666666667
total training hours combined:	13.956666666666667


In [59]:
print(f"shape train english: {np.shape(train_en)}")
print(f"shape train spanish: {np.shape(train_es)}")
print(f"shape test english: {np.shape(test_en)}")
print(f"shape test spanish: {np.shape(test_es)}")
print(f"total testing samples_en:\t{len(test_en)}")
print(f"total testing hours english:\t{len(test_en) * 0.75 / 3600}")
print(f"total testing samples_es:\t{len(test_es)}")
print(f"total testing hours spanish:\t{len(test_es) * 0.75 / 3600}")
print(f"total testing hours combined:\t{(len(test_en) + len(test_es)) * 0.75/3600}")


shape train english: (49389, 75, 39)
shape train spanish: (17603, 75, 39)
shape test english: (16463, 75, 39)
shape test spanish: (5867, 75, 39)
total testing samples_en:	16463
total testing hours english:	3.4297916666666666
total testing samples_es:	5867
total testing hours spanish:	1.2222916666666668
total testing hours combined:	4.652083333333334


In [182]:
#initiate gmm model 
#single model example
model = gmm.GMM(n_clusters=[30,30]) #There are other parameters but for simplicity, an array of clusters will do



In [60]:
gmm_models = [gmm.GMM(n_clusters=[50,50]), \
              gmm.GMM(n_clusters=[50,30]), \
              gmm.GMM(n_clusters=[30,30]), \
              gmm.GMM(n_clusters=[40,50]), \
              gmm.GMM(n_clusters=[40,40]), \
              gmm.GMM(n_clusters=[50,40]), \
              gmm.GMM(n_clusters=[30,40]), \
              gmm.GMM(n_clusters=[40,30])]


In [None]:
'''
This is the part SageMaker is for
to train below
'''
import pprint
lang_to_id = {'english' : 0, 'spanish': 1}
predictions = {n : [] for n in range(len(gmm_models))}
for idx, gmm_model in enumerate(gmm_models):
    gmm_model.train(train_en, lang_to_id['english'])
    gmm_model.train(train_es, lang_to_id['spanish'])
    predictions[idx] = {'english' : \
                       np.average(np.array(gmm_model.predict_all(test_en)) == lang_to_id['english']) , \
                       'spanish' : \
                       np.average(np.array(gmm_model.predict_all(test_es)) == lang_to_id['spanish'])}
pprint.pprint(predictions)


50
{0: {'english': 0.9, 'spanish': 0.0},
 1: {'english': 0.34, 'spanish': 0.74},
 2: {'english': 0.9, 'spanish': 0.18},
 3: {'english': 0.94, 'spanish': 0.0},
 4: {'english': 1.0, 'spanish': 0.0},
 5: {'english': 0.96, 'spanish': 0.02},
 6: {'english': 1.0, 'spanish': 0.0},
 7: {'english': 0.62, 'spanish': 0.38}}
50
{0: {'english': 0.9, 'spanish': 0.0},
 1: {'english': 0.34, 'spanish': 0.74},
 2: {'english': 0.9, 'spanish': 0.18},
 3: {'english': 0.94, 'spanish': 0.0},
 4: {'english': 1.0, 'spanish': 0.0},
 5: {'english': 0.96, 'spanish': 0.02},
 6: {'english': 1.0, 'spanish': 0.0},
 7: {'english': 0.62, 'spanish': 0.38}}
50
{0: {'english': 0.9, 'spanish': 0.0},
 1: {'english': 0.34, 'spanish': 0.74},
 2: {'english': 0.9, 'spanish': 0.18},
 3: {'english': 0.94, 'spanish': 0.0},
 4: {'english': 1.0, 'spanish': 0.0},
 5: {'english': 0.96, 'spanish': 0.02},
 6: {'english': 1.0, 'spanish': 0.0},
 7: {'english': 0.62, 'spanish': 0.38}}
50
{0: {'english': 0.9, 'spanish': 0.0},
 1: {'english'

In [72]:
print(np.shape(test_en))
print(np.shape(test_es))

(16463, 75, 39)
(5867, 75, 39)


In [75]:
for model in gmm_models[:1]:
    print(f"english: {np.average(np.array(model.predict_all(test_en)) == 0)}")
    print(f"spanish: {np.average(np.array(model.predict_all(test_es)) == 1)}")
    

english: 0.8562230456174452
spanish: 0.5629793761718084


In [None]:
print(model.cluster_distributions)
                                