In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt 
import numpy as np
from pathlib import Path
import random
import pandas as pd
from statistics import mean

In [3]:
# Read the JSON file at the relative path below (one directory above the
# working directory) into a pandas DataFrame named `data`.
data = pd.read_json('../5_accent_features_8k.json')

In [4]:
# Show the loaded frame as this cell's output.
data

Unnamed: 0,lable,rms,zrc,sb,sc,mfcc
0,american,"[0.10995900630000001, 0.123431541, 0.057550724...","[0.1611328125, 0.2001953125, 0.2119140625, 0.3...","[982.3833614816, 998.6809827277, 1227.21971388...","[1026.7194059383, 921.3751421883, 1687.1791594...","[[-132.202331543, -106.4561920166, -144.163803..."
1,american,"[0.0449587628, 0.0892027393, 0.0972395092, 0.1...","[0.208984375, 0.310546875, 0.1513671875, 0.100...","[963.9106217367, 897.3924172554, 900.666981819...","[1576.1782294645, 1341.2154906595, 1251.823100...","[[-120.1662216187, -64.2591247559, -84.6273498..."
2,american,"[0.0351665355, 0.10262352970000001, 0.10095142...","[0.078125, 0.12109375, 0.064453125, 0.07128906...","[923.2569985211, 954.1517268996, 936.182168068...","[729.6191839967, 906.2135315307, 772.504023950...","[[-204.3710479736, -111.2505950928, -130.19665..."
3,indian,"[0.058094892700000005, 0.0720619857, 0.0426379...","[0.1396484375, 0.171875, 0.0625, 0.0673828125,...","[1126.3458385918, 1075.1604486286, 916.9369396...","[1119.0037267103, 1001.3589439764, 686.3549935...","[[-204.9865570068, -181.3363800049, -285.47653..."
4,american,"[0.0127645116, 0.055047065000000006, 0.0553400...","[0.1171875, 0.158203125, 0.0517578125, 0.04492...","[945.6208621592, 965.5379844122, 939.509883464...","[576.6528108766, 952.5287575697, 825.655844384...","[[-238.695098877, -131.825668335, -140.2941741..."
...,...,...,...,...,...,...
11125,indian,"[0.005203237800000001, 0.0583481714, 0.0883544...","[0.1484375, 0.1748046875, 0.0546875, 0.0595703...","[939.0655020602, 735.4849804102, 842.864382448...","[1085.5789562353, 633.6020721972, 746.87798516...","[[-392.308380127, -253.5061950684, -204.861923..."
11126,british,"[0.0670182556, 0.11963738500000001, 0.13444279...","[0.0966796875, 0.140625, 0.072265625, 0.056640...","[279.7990742638, 691.1704207535, 1057.99920323...","[334.4932575184, 602.9638842564, 910.891398897...","[[-296.3085021973, -209.2213897705, -214.50019..."
11127,australian,"[0.0458770357, 0.06701447070000001, 0.05166622...","[0.0791015625, 0.11328125, 0.2255859375, 0.267...","[889.8696890516, 1050.0291055209, 1192.2407333...","[831.469767543, 1120.86074567, 963.4700761504,...","[[-158.1745147705, -105.2735824585, -136.48347..."
11128,american,"[0.1035422683, 0.1399547011, 0.0942250416, 0.0...","[0.11523437500000001, 0.1923828125, 0.23339843...","[578.4063870442, 687.3923714443, 871.801319026...","[740.7449395003, 995.2480438369, 1025.91691399...","[[-106.186164856, -56.3407363892, -119.2368698..."


In [5]:
def get_min_avg_max_std(column, data=None):
    """Collapse a column of per-row sequences into four summary columns.

    Every entry of ``data[column]`` is treated as a sequence of numbers.
    The column is removed from ``data`` **in place** and replaced by
    ``min_<column>``, ``avg_<column>``, ``max_<column>`` and
    ``std_<column>`` (appended in that order).

    Parameters
    ----------
    column : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to modify in place. When omitted, the module-level ``data``
        frame is looked up at call time, so re-running the loading cell is
        picked up without having to re-run this definition.

    Returns
    -------
    tuple of list
        ``(minimums, averages, maximums, standard_deviations)`` with one
        value per row of ``data``.

    Raises
    ------
    NameError
        If ``data`` is omitted and no module-level ``data`` exists.
    KeyError
        If ``column`` is not present in ``data`` (for example because it
        has already been summarised by an earlier call).
    """
    if data is None:
        # Resolve the default lazily instead of freezing whatever object
        # `data` happened to be when this cell was executed.
        try:
            data = globals()['data']
        except KeyError:
            raise NameError(
                "no module-level 'data' frame is defined; pass data= explicitly"
            ) from None

    if column not in data.columns:
        raise KeyError(
            f"column {column!r} not found in data; it may already have been "
            f"replaced by its min/avg/max/std summary columns"
        )

    # Compute every statistic before touching the frame so that a failure on
    # any row leaves `data` unmodified.
    rows = list(data[column])
    row_data_min = [min(row) for row in rows]
    row_data_avg = [mean(row) for row in rows]
    row_data_max = [max(row) for row in rows]
    row_data_std = [np.std(row) for row in rows]

    data.drop(column, axis=1, inplace=True)
    data[f'min_{column}'] = row_data_min
    data[f'avg_{column}'] = row_data_avg
    data[f'max_{column}'] = row_data_max
    data[f'std_{column}'] = row_data_std

    return row_data_min, row_data_avg, row_data_max, row_data_std

In [6]:
# Replace each list-valued feature column with its min/avg/max/std summary
# columns. Binding the result to `_` keeps the returned lists from being
# echoed as the cell output.
for feature_column in ('rms', 'zrc', 'sb', 'sc'):
    _ = get_min_avg_max_std(feature_column)

In [7]:
# Preview the frame without the `mfcc` column (returns a new frame; `data`
# itself is left unchanged).
data.drop(columns=['mfcc'])

Unnamed: 0,lable,min_rms,avg_rms,max_rms,std_rms,min_zrc,avg_zrc,max_zrc,std_zrc,min_sb,avg_sb,max_sb,std_sb,min_sc,avg_sc,max_sc,std_sc
0,american,0.001167,0.103407,0.189941,0.047770,0.067383,0.189209,0.440430,0.094806,394.431290,880.691752,1307.266225,245.430824,434.735910,1009.641142,2161.378419,416.130162
1,american,0.000801,0.058658,0.103927,0.031081,0.079102,0.212576,0.539062,0.115285,480.439854,870.267340,1122.283493,160.383675,422.148634,1097.386582,2123.551926,451.090306
2,american,0.000754,0.067203,0.109837,0.031252,0.047852,0.204605,0.706055,0.200368,448.441120,828.700336,1435.614128,246.024383,291.909819,991.248812,2980.423978,688.983759
3,indian,0.001090,0.056282,0.122838,0.031935,0.056641,0.161513,0.534180,0.120783,436.448730,899.991063,1271.609599,213.625065,369.349090,1044.968954,2045.723166,404.911227
4,american,0.005269,0.055942,0.114724,0.029470,0.044922,0.132704,0.386719,0.085825,547.596459,1014.481168,1391.435266,217.824753,395.814580,1026.078903,2352.828746,413.301996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11125,indian,0.000496,0.051814,0.105858,0.028519,0.054688,0.196100,0.597656,0.134253,407.614581,888.767232,1286.562931,199.557633,327.117005,1075.957907,2119.978221,489.302061
11126,british,0.010991,0.076711,0.134443,0.036779,0.046875,0.142241,0.453125,0.105300,279.799074,837.773872,1161.597266,204.123944,334.493258,889.486936,2626.316454,469.690909
11127,australian,0.006336,0.063076,0.114340,0.028879,0.035156,0.144206,0.616211,0.134536,715.505224,988.036607,1311.033228,145.070914,566.711898,1066.883145,2340.979582,460.148163
11128,american,0.005160,0.094933,0.188103,0.061273,0.073242,0.219238,0.410156,0.117426,471.818934,809.624811,1178.005456,214.518913,651.189324,1095.364001,1907.227408,411.637243


In [8]:
# Model inputs: everything except the target column and the `mfcc` column.
features = data.drop(columns=['lable', 'mfcc'])
features

Unnamed: 0,min_rms,avg_rms,max_rms,std_rms,min_zrc,avg_zrc,max_zrc,std_zrc,min_sb,avg_sb,max_sb,std_sb,min_sc,avg_sc,max_sc,std_sc
0,0.001167,0.103407,0.189941,0.047770,0.067383,0.189209,0.440430,0.094806,394.431290,880.691752,1307.266225,245.430824,434.735910,1009.641142,2161.378419,416.130162
1,0.000801,0.058658,0.103927,0.031081,0.079102,0.212576,0.539062,0.115285,480.439854,870.267340,1122.283493,160.383675,422.148634,1097.386582,2123.551926,451.090306
2,0.000754,0.067203,0.109837,0.031252,0.047852,0.204605,0.706055,0.200368,448.441120,828.700336,1435.614128,246.024383,291.909819,991.248812,2980.423978,688.983759
3,0.001090,0.056282,0.122838,0.031935,0.056641,0.161513,0.534180,0.120783,436.448730,899.991063,1271.609599,213.625065,369.349090,1044.968954,2045.723166,404.911227
4,0.005269,0.055942,0.114724,0.029470,0.044922,0.132704,0.386719,0.085825,547.596459,1014.481168,1391.435266,217.824753,395.814580,1026.078903,2352.828746,413.301996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11125,0.000496,0.051814,0.105858,0.028519,0.054688,0.196100,0.597656,0.134253,407.614581,888.767232,1286.562931,199.557633,327.117005,1075.957907,2119.978221,489.302061
11126,0.010991,0.076711,0.134443,0.036779,0.046875,0.142241,0.453125,0.105300,279.799074,837.773872,1161.597266,204.123944,334.493258,889.486936,2626.316454,469.690909
11127,0.006336,0.063076,0.114340,0.028879,0.035156,0.144206,0.616211,0.134536,715.505224,988.036607,1311.033228,145.070914,566.711898,1066.883145,2340.979582,460.148163
11128,0.005160,0.094933,0.188103,0.061273,0.073242,0.219238,0.410156,0.117426,471.818934,809.624811,1178.005456,214.518913,651.189324,1095.364001,1907.227408,411.637243


In [9]:
# Target vector: a standalone NumPy copy of the `lable` column.
lables = data['lable'].to_numpy(copy=True)

lables

array(['american', 'american', 'american', ..., 'australian', 'american',
       'indian'], dtype=object)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split
# repeatable across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    lables,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Shapes of the four pieces produced by the split, in the order
# (train_features, test_features, train_labels, test_labels).
tuple(
    part.shape
    for part in (train_features, test_features, train_labels, test_labels)
)

((8347, 16), (2783, 16), (8347,), (2783,))

In [12]:
# Learning rates to compare; every other hyper-parameter is held fixed.
lr_list = [0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5]
for lr in lr_list:
    # The model is stochastic here: subsample < 1.0 draws a random subset of
    # rows per boosting stage, max_features='log2' samples candidate features
    # per split, and n_iter_no_change carves out a random validation set for
    # early stopping. random_state pins all of that so each learning rate is
    # evaluated reproducibly (same seed as the train/test split).
    model_gbm = GradientBoostingClassifier(n_estimators=1000,
                                           learning_rate=lr,
                                           max_depth=4,
                                           subsample=0.3,
                                           validation_fraction=0.1,
                                           n_iter_no_change=20,
                                           max_features='log2',
                                           random_state=42,
                                           verbose=1)
    model_gbm.fit(train_features, train_labels)
    print('---------------------------------------------------------------------------------------------------------')
    print('lr =', lr)
    # Mean accuracy on the held-out test split.
    print(model_gbm.score(test_features, test_labels))
    print('---------------------------------------------------------------------------------------------------------')
    print(classification_report(test_labels,model_gbm.predict(test_features)))
    print('---------------------------------------------------------------------------------------------------------')

# NOTE: after the loop `model_gbm` refers to the model fitted with the LAST
# learning rate in `lr_list`, not necessarily the best-scoring one.
    

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.2866           0.0281           34.89s
         2           1.2738           0.0289           34.85s
         3           1.1909           0.0244           33.70s
         4           1.1876           0.0227           32.53s
         5           1.1689           0.0206           32.21s
         6           1.1233           0.0185           31.57s
         7           1.1385           0.0171           31.19s
         8           1.1006           0.0160           31.49s
         9           1.1084           0.0175           31.12s
        10           1.0665           0.0172           30.76s
        20           0.9401           0.0091           29.58s
        30           0.8516           0.0050           28.88s
        40           0.8048           0.0033           28.73s
        50           0.7512           0.0026           28.30s
        60           0.7249           0.0018           28.05s
       

         3           0.9075           0.0403           30.09s
         4           0.8379           0.0351           30.61s
         5           0.8133          -0.9064           29.90s
         6           1.0714          -0.9617           29.77s
         7           1.7035           0.0085           29.87s
         8           2.1656           0.0073           30.20s
         9           1.9420           0.0129           30.08s
        10           1.3032          -0.0001           30.24s
        20   436925596.7891 -34688018388.2120           30.09s
---------------------------------------------------------------------------------------------------------
lr = 0.4
0.6521739130434783
---------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

    american       0.68      0.81      0.74      1439
  australian       0.70      0.66      0.68       368
     british       0.60      0.46     

In [13]:
# Confusion matrix of the current `model_gbm` on the test split.
test_predictions = model_gbm.predict(test_features)
conf_mat = confusion_matrix(test_labels, test_predictions)
print(conf_mat)

[[1155   61   97   84   42]
 [ 110  235   13    3    7]
 [ 142   21  191   15   17]
 [ 195    3   25  154   23]
 [  79    2   14   14   81]]


In [14]:
# Requires the third-party `plotly` package to be installed in the kernel's
# environment.
import plotly.figure_factory as ff
acc = []  # not used in this cell; left in place in case other cells expect it

# Build the heatmap inputs explicitly so the cell runs on a fresh kernel
# instead of relying on `z`, `x`, `y` and `z_text` left over from elsewhere.
predicted_labels = model_gbm.predict(test_features)
# Class names in sorted order; passed as `labels=` so the matrix rows/columns
# are guaranteed to line up with the axis tick labels.
x = sorted(set(test_labels) | set(predicted_labels))
y = list(x)
# Rows are the real classes, columns the predicted classes.
z = confusion_matrix(test_labels, predicted_labels, labels=x).tolist()
# Annotation text must be strings with the same shape as `z`.
z_text = [[str(count) for count in row] for row in z]

# set up figure 
fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='Viridis')

# add title
fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
                  #xaxis = dict(title='x'),
                  #yaxis = dict(title='x')
                 )

# add custom xaxis title
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=0.5,
                        y=-0.15,
                        showarrow=False,
                        text="Predicted value",
                        xref="paper",
                        yref="paper"))

# add custom yaxis title
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=-0.35,
                        y=0.5,
                        showarrow=False,
                        text="Real value",
                        textangle=-90,
                        xref="paper",
                        yref="paper"))

# adjust margins to make room for yaxis title
fig.update_layout(margin=dict(t=50, l=200))

# add colorbar
fig['data'][0]['showscale'] = True
fig.show()

ModuleNotFoundError: No module named 'plotly'