# Visualizations

Visualize the features on a per-level basis in order to better understand which features may be useful for training.

Bar plots: each feature is binned into quantiles and coloured per label. This way we can see whether, for a given level, a feature tends to belong to a certain quantile, and whether there are relationships between features.

Radar plots: the average of each feature, with one colour per level. Different shapes, with a feature's points falling at different values for different levels, indicate that the feature may be useful for classification.

### Install Packages

In [None]:
import sys
# !{sys.executable} -m pip install numpy
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install scikit-learn
# !{sys.executable} -m pip install seaborn
# !{sys.executable} -m pip install matplotlib
# !{sys.executable} -m pip install plotly
# !{sys.executable} -m pip install --upgrade nbformat

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

## Import data

In [None]:
## DETERMINES IF PLOTS WILL BE RENDERED
# NOTE(review): showPlots is not read by any plotting code visible in this notebook - confirm it is still needed
showPlots = True
# Load the extracted training features (path is relative to the working directory)
df_merged = pd.read_csv('./extracted/full-train-features.csv')
# Bare expression: echoes the dataframe as the cell output
df_merged

Save this for modeling

In [None]:
#df_merged.to_csv(f"extracted/full-train.csv", index=False)

In [None]:
# Subplot grid used by the bar plots: nrows * ncols cells, one per feature
nrows, ncols = 5, 2

# Feature columns that get visualized, in plotting order
features = [
  'get_max_intensity', 'analyse_intensity',
  'analyse_pitch', 'analyse_pitch_range',
  'analyse_shimmer', 'analyse_jitter',
  'spectral_slope', 'mean_spectral_rolloff',
  'get_energy', 'analyse_harmonics',
]

In [None]:
def barPlotDataframes(dataframe, dataframeName,type):
  '''
    Draw one grouped bar chart per feature showing how each label is
    distributed over the quantile bins of that feature.

    For every entry of the module-level `features` list, rows where the
    feature equals 0 are discarded, the remaining values are cut into (up to)
    four quantile bins with `pd.qcut`, and for each label 0-3 the fraction of
    that label's rows falling in each bin is plotted.
    The bars are separated and colour coded by label.
    @TODO increase bins with more data

    Parameters:
      dataframe     : DataFrame holding the feature columns and a
                      `{type}_label_train` column.
      dataframeName : text appended to every subplot title.
      type          : either 'segment' or 'parent'; selects the label column.

    Returns None. The charts are drawn on a new matplotlib figure laid out as
    an `nrows` x `ncols` grid (module-level settings).
  '''
  labelNames = ('none', 'annoyance', 'threatening', 'aggressive')
  labelColours = ('cornflowerblue', 'salmon', 'aquamarine', 'mediumorchid')
  nBins = 4
  width = 0.20     # the width of the bars
  labelColumn = f'{type}_label_train'

  # squeeze=False keeps `plots` two-dimensional even for a single row/column grid
  fig, plots = plt.subplots(nrows, ncols, figsize = (25,50), squeeze=False)

  # plots.flat walks the grid row by row, i.e. in the same order as the features
  for featureIdx, ax in enumerate(plots.flat):
    if featureIdx >= len(features):
      # more grid cells than features: hide the unused axes instead of failing
      ax.set_visible(False)
      continue

    feature = features[featureIdx]
    temp_df = dataframe[dataframe[feature] != 0]

    # Keep the bins in their own Series so the filtered slice is never written
    # to (avoids pandas' SettingWithCopyWarning). duplicates='drop' merges
    # identical quantile edges instead of raising on heavily tied data.
    binned = pd.qcut(temp_df[feature], q=nBins, duplicates='drop')
    cats = binned.cat.categories
    ind = np.arange(len(cats))  # the x locations for the groups

    rects = []
    for labelIdx in range(len(labelNames)):
      inLabel = temp_df[labelColumn] == labelIdx
      # value_counts on a categorical reports every bin, including empty ones
      counts = binned[inLabel].value_counts().sort_index(ascending=True)
      groupSize = int(inLabel.sum())
      if groupSize:
        relative = (counts / groupSize).to_numpy()
      else:
        # a label with no rows gets zero-height bars instead of a division by zero
        relative = np.zeros(len(cats))
      rects.append(ax.bar(ind + labelIdx * width, relative,
                          width,
                          color = labelColours[labelIdx]))

    # add some text for labels, title and axes ticks
    ax.set_ylabel('relative counts (fraction of label)')
    ax.set_xticks(ind + width+(width/2))
    ax.set_title(f'Counts of binned {feature} for {dataframeName}')
    ax.set_xticklabels([str(cat) for cat in cats])

    ax.legend(tuple(r[0] for r in rects), labelNames, fontsize=10)

## Binned bar plots

## Group 1 - Segment Analysis

In [None]:
# Binned bar plots for the full dataset, grouped by segment label
barPlotDataframes(
  dataframe=df_merged,
  dataframeName='all reddit data segment',
  type='segment',
)

## Group 1 - Parent Analysis

In [None]:
# Binned bar plots for the full dataset, grouped by parent label
barPlotDataframes(
  dataframe=df_merged,
  dataframeName='all reddit data parent',
  type='parent',
)

### No noise

In [None]:
# Rows whose noisy_train flag is anything other than 1
is_noisy = df_merged['noisy_train'] == 1
no_noise = df_merged[~is_noisy]

segment

In [None]:
# Binned bar plots for the non-noisy subset, grouped by segment label
barPlotDataframes(
  dataframe=no_noise,
  dataframeName='non-noisy data segment',
  type='segment',
)

parent

In [None]:
# Binned bar plots for the non-noisy subset, grouped by parent label
barPlotDataframes(
  dataframe=no_noise,
  dataframeName='non-noisy data parent',
  type='parent',
)

### Noisy

segment

In [None]:
# Rows whose noisy_train flag equals 1
is_noisy = df_merged['noisy_train'] == 1
noise = df_merged[is_noisy]

In [None]:
# Bare expression: echoes the noisy subset as the cell output
noise

segment

In [None]:
# Binned bar plots for the noisy subset, grouped by segment label
barPlotDataframes(
  dataframe=noise,
  dataframeName='noisy data segment',
  type='segment',
)

parent

In [None]:
# Binned bar plots for the noisy subset, grouped by parent label
barPlotDataframes(
  dataframe=noise,
  dataframeName='noisy data parent',
  type='parent',
)

## Radar plots

If there are issues with running this code please try `pip install --upgrade nbformat`

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler

In [None]:
def radarPlotDataframes(dataframe, dfname, type, min, max):
  '''
    Draw a radar (polar) chart with one trace per label, comparing the
    standardised values of every feature in the module-level `features` list.

    Each feature column is standardised with `StandardScaler` across the rows
    of `dataframe`. For each label 0-3 the first row carrying that label is
    plotted; labels that do not occur in `dataframe` are skipped. The callers
    in this notebook pass per-label means, i.e. one row per label.

    Parameters:
      dataframe : DataFrame holding the feature columns and a
                  `{type}_label_train` column. It is not modified.
      dfname    : title of the chart.
      type      : either 'segment' or 'parent'; selects the label column.
      min, max  : lower and upper limit of the radial axis.

    Returns None. The figure is displayed with `fig.show()`.
  '''
  # axis names, in the same order as the entries of `features`
  categories = [
    'max intensity',
    'avg intensity',
    'avg pitch',
    'pitch range',
    'shimmer',
    'jitter',
    'spectral slope',
    'spectral rolloff',
    'energy',
    'harmonics'
  ]
  labelNames = ('none', 'annoyance', 'threatening', 'aggressive')
  labelColumn = f'{type}_label_train'

  ##scale the data
  # Scale a copy and assign the result explicitly: relying on an in-place
  # transform of `to_numpy()` output only works when pandas hands out a
  # writable view, and it would also alter the caller's dataframe.
  scaled = dataframe.copy()
  scaled[features] = StandardScaler().fit_transform(
    scaled[features].to_numpy(dtype=float)
  )

  #create subplot layout
  fig = make_subplots(rows=1, cols=1,
                      specs=[[{"type": "polar"}]],
                      subplot_titles=(f'{dfname}',))

  for labelIdx, labelName in enumerate(labelNames):
    # select the feature columns by name so the values always line up with
    # `categories`, whatever other columns the dataframe carries
    labelRows = scaled.loc[scaled[labelColumn] == labelIdx, features]
    if labelRows.empty:
      # this label does not occur in the dataframe: nothing to draw
      continue
    fig.add_trace(go.Scatterpolar(
          r=labelRows.iloc[0].tolist(),
          theta=categories,
          fill='toself',
          name=labelName),
          row = 1,
          col = 1
    )

  #change the size and layout, make margins smaller
  fig.update_layout(
    polar=dict(
      radialaxis=dict(
        visible=True,
        range=[min, max]
      ),
      angularaxis = dict(tickfont = dict(size = 20))
    ),
    margin=dict(l=0, r=0, t=100, b=50),
    showlegend=True,
    autosize=False,
    width=1200, height = 1000,
    legend=dict(
      x=0,
      y=1,
      traceorder="reversed",
      font=dict(
          size=20,
          color="black"
      )
    )
  )

  #add titles
  fig.update_annotations(y=1.05,
                         selector={'text':f'{dfname}'},
                         font={'size': 20})

  fig.show()

aggregate averages

In [None]:
# Per-segment-label mean of every numeric column; numeric_only is explicit so
# a non-numeric column cannot make the aggregation raise on pandas >= 2.0.
means_all_segment = df_merged.groupby('segment_label_train').mean(numeric_only=True).reset_index()
# Drop the columns that are not plotted as features
means_all_segment = means_all_segment.drop(['duration','number_of_segments_train','number_of_voices_parent_train','parent_label_train','noisy_train'], axis=1)

In [None]:
# Per-parent-label mean of every numeric column; numeric_only is explicit so
# a non-numeric column cannot make the aggregation raise on pandas >= 2.0.
means_all_parent = df_merged.groupby('parent_label_train').mean(numeric_only=True).reset_index()
# Drop the columns that are not plotted as features
means_all_parent = means_all_parent.drop(['duration','number_of_segments_train','number_of_voices_parent_train','segment_label_train','noisy_train'], axis=1)

In [None]:
# Per-segment-label mean of every numeric column of the non-noisy subset;
# numeric_only is explicit so a non-numeric column cannot make the
# aggregation raise on pandas >= 2.0.
means_non_segment = no_noise.groupby('segment_label_train').mean(numeric_only=True).reset_index()
# Drop the columns that are not plotted as features
means_non_segment = means_non_segment.drop(['duration','number_of_segments_train','number_of_voices_parent_train','parent_label_train','noisy_train'], axis=1)

In [None]:
# Per-parent-label mean of every numeric column of the non-noisy subset;
# numeric_only is explicit so a non-numeric column cannot make the
# aggregation raise on pandas >= 2.0.
means_non_parent = no_noise.groupby('parent_label_train').mean(numeric_only=True).reset_index()
# Drop the columns that are not plotted as features
means_non_parent = means_non_parent.drop(['duration','number_of_segments_train','number_of_voices_parent_train','segment_label_train','noisy_train'], axis=1)

In [None]:
# Per-segment-label mean of every numeric column of the noisy subset;
# numeric_only is explicit so a non-numeric column cannot make the
# aggregation raise on pandas >= 2.0.
means_noise_segment = noise.groupby('segment_label_train').mean(numeric_only=True).reset_index()
# Drop the columns that are not plotted as features
means_noise_segment = means_noise_segment.drop(['duration','number_of_segments_train','number_of_voices_parent_train','parent_label_train','noisy_train'], axis=1)

In [None]:
# Per-parent-label mean of every numeric column of the noisy subset;
# numeric_only is explicit so a non-numeric column cannot make the
# aggregation raise on pandas >= 2.0.
means_noise_parent = noise.groupby('parent_label_train').mean(numeric_only=True).reset_index()
# Drop the columns that are not plotted as features
means_noise_parent = means_noise_parent.drop(['duration','number_of_segments_train','number_of_voices_parent_train','segment_label_train','noisy_train'], axis=1)

In [None]:
# Radar chart of the per-segment-label means over the full dataset
radarPlotDataframes(
  dataframe=means_all_segment,
  dfname='All video voices',
  type='segment',
  min=-1.5,
  max=2.5,
)

In [None]:
# Radar chart of the per-parent-label means over the full dataset
radarPlotDataframes(
  dataframe=means_all_parent,
  dfname='All video voices',
  type='parent',
  min=-2.0,
  max=2.5,
)

In [None]:
# Radar chart of the per-segment-label means over the non-noisy subset
radarPlotDataframes(
  dataframe=means_non_segment,
  dfname='Non-Noisy voices',
  type='segment',
  min=-1.3,
  max=2.5,
)

In [None]:
# Radar chart of the per-parent-label means over the non-noisy subset.
# The title now names the subset that is plotted (it was a copy of the
# full-dataset title).
radarPlotDataframes(means_non_parent, 'Non-Noisy voices', 'parent', -3.1, 3 )

In [None]:
# Radar chart of the per-segment-label means over the noisy subset
radarPlotDataframes(
  dataframe=means_noise_segment,
  dfname='Noisy voices',
  type='segment',
  min=-1.7,
  max=1.8,
)

In [None]:
# Radar chart of the per-parent-label means over the noisy subset.
# The title now names the subset that is plotted (it was a copy of the
# full-dataset title).
radarPlotDataframes(means_noise_parent, 'Noisy voices', 'parent', -1.8, 2.5)