# Installing Packages

In [None]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install yellowbrick
!{sys.executable} -m pip install matplot
!{sys.executable} -m pip install seaborn

# Preprocessing of Data

In [109]:
import os
import numpy as np
import pandas as pd

# Helper function: loads a csv file into a dataframe
def read_data(data_file):
  """Read the CSV given by ``data_file`` with pandas and return the dataframe."""
  return pd.read_csv(data_file)

def drop_features(data_df, features_to_drop):
  """Drop the given columns from ``data_df`` in place and return the frame.

  Args:
    data_df: dataframe to modify; it is mutated in place.
    features_to_drop: column label(s) to remove.

  Returns:
    The same ``data_df`` object, now without the dropped columns.

  Raises:
    KeyError: if any label in ``features_to_drop`` is not a column.
  """
  # DataFrame.drop(..., inplace=True) returns None, so returning its result
  # directly would always hand None back to the caller.
  data_df.drop(columns=features_to_drop, inplace=True)
  return data_df

def segregate_noisy_data(data_df):
  """Split ``data_df`` by its 'noisy_train' column.

  Returns a ``(non_noisy, noisy)`` tuple: rows where 'noisy_train' equals 0,
  then rows where it equals 1. Rows holding any other value appear in neither.
  """
  noisy_flag = data_df['noisy_train']
  clean_rows = data_df.loc[noisy_flag == 0]
  flagged_rows = data_df.loc[noisy_flag == 1]
  return clean_rows, flagged_rows

def remove_noisy_data(data_df, noisy_column='noisy'):
  """Drop, in place, every row flagged as noisy and return the frame.

  Args:
    data_df: dataframe to modify; it is mutated in place.
    noisy_column: name of the column whose value 1 marks a noisy row.
      Defaults to 'noisy' for backward compatibility.
      NOTE(review): other helpers in this notebook read 'noisy_train' --
      confirm which column the data actually carries.

  Returns:
    The same ``data_df`` object with the noisy rows removed.

  Raises:
    KeyError: if ``noisy_column`` is not a column of ``data_df``.
  """
  noisy_index = data_df[data_df[noisy_column] == 1].index
  print(f'Rows dropped because they are flagged as noisy: {len(noisy_index)}\n')

  # drop(..., inplace=True) returns None, so return the mutated frame explicitly.
  data_df.drop(noisy_index, inplace=True)
  return data_df

# Helper function: drops rows whose duration is too short and collects the
# file names of the rows that remain
def preprocessing_data(data_df, min_duration=2):
  """Remove short-duration rows in place and list the remaining file names.

  Args:
    data_df: dataframe with 'duration' and 'file_name' columns; it is
      mutated in place.
    min_duration: rows whose 'duration' is strictly below this value are
      dropped. Defaults to 2, the threshold previously hard-coded.

  Returns:
    A ``(data_df, filenames)`` tuple: the same (now filtered) dataframe
    object and a list of its 'file_name' values.
  """
  # Build the mask once; it drives both the report and the drop.
  too_short = data_df['duration'] < min_duration

  # Clean data based on duration value
  print(f'''Rows dropped because of low duration: \
        {int(too_short.sum())}\n''')

  data_df.drop(data_df[too_short].index, inplace=True)

  # Extract filenames
  filenames = data_df['file_name'].values.tolist()

  return (data_df, filenames)


Read Data

In [112]:
# Load '../extracted/full-train-features.csv' into a dataframe via read_data.
# The path is relative, so it resolves against the kernel's working directory.
aggression_data_df = read_data('../extracted/full-train-features.csv')

Extract the cleaned data and the filenames after preprocessing the data

In [113]:
# Show the shape and last rows of the raw frame before any rows are removed.
print(f'Shape of the data before cleaning it: {aggression_data_df.shape}\n')
print(aggression_data_df.tail())

# preprocessing_data drops rows from the frame it is given (in place) and
# returns that frame together with the list of remaining file names.
aggression_data_df, filenames = preprocessing_data(aggression_data_df)

# Shape after the short-duration rows have been removed.
print(f'Shape of the data after cleaning it: {aggression_data_df.shape} \n')

Shape of the data before cleaning it: (2147, 23)

                      file_name  duration  analyse_harmonics  \
2142  xdz15thrnem51_processed_5      10.0           5.465491   
2143  xdz15thrnem51_processed_6      10.0           6.879443   
2144  xdz15thrnem51_processed_7      10.0           4.468770   
2145  xdz15thrnem51_processed_8      10.0           8.581945   
2146  xdz15thrnem51_processed_9      10.0           2.829222   

      get_number_sylls  get_number_words  pauses get_max_intensity  \
2142               1.3               1.0     0.1       104.1138965   
2143               0.2               0.3     0.1        108.028173   
2144               1.8               1.1     0.1       104.6782678   
2145               0.3               0.1     0.1        109.335715   
2146               0.6               0.6     0.1       105.4555978   

      analyse_intensity  analyse_pitch  analyse_pitch_range  ...  \
2142          69.550541     199.893943           443.057705  ...   
2143    

Segregate data based on noisy vs not noisy

In [114]:
# Split the cleaned frame on its 'noisy_train' column: rows with value 0 go
# to non_noisy_df, rows with value 1 go to noisy_df.
non_noisy_df, noisy_df = segregate_noisy_data(aggression_data_df)

# Report how many rows ended up in each subset.
print(f"Shape of noisy data: {noisy_df.shape} \n")
print(f"Shape of non noisy data: {non_noisy_df.shape}\n")

Shape of noisy data: (262, 23) 

Shape of non noisy data: (1849, 23)



Drop features

In [115]:
# Columns to remove before the remaining columns are used further.
features_to_drop = ['file_name', 'duration','get_number_sylls',
                    'get_number_words', 'pauses', 'parent_file_train',
                    'number_of_segments_train', 'number_of_voices_parent_train',
                    'extra_sounds_train', 'noisy_train', 'Notes_train']

# Remove the features. drop_features mutates aggression_data_df in place and
# its return value is not used here. Because the columns are gone afterwards,
# re-running this cell without reloading the data raises a KeyError.
print(f'Shape of the data before dropping features: {aggression_data_df.shape} \n')
drop_features(aggression_data_df, features_to_drop)

# Confirm the new column count and inspect the last rows.
print(f'Shape of the data after dropping features: {aggression_data_df.shape} \n')
print(aggression_data_df.tail())

Shape of the data before dropping features: (2111, 23) 

Shape of the data after dropping features: (2111, 12) 

      analyse_harmonics get_max_intensity  analyse_intensity  analyse_pitch  \
2142           5.465491       104.1138965          69.550541     199.893943   
2143           6.879443        108.028173          75.501002     258.536830   
2144           4.468770       104.6782678          71.100226     194.904220   
2145           8.581945        109.335715          76.784989     285.988051   
2146           2.829222       105.4555978          74.654532     186.190085   

      analyse_pitch_range  analyse_shimmer  analyse_jitter  spectral_slope  \
2142           443.057705         0.196757        0.033226       -0.020890   
2143           379.915372         0.149580        0.024206       -0.081411   
2144           493.340814         0.162628        0.034465       -0.028155   
2145           520.259229         0.149758        0.022509       -0.108299   
2146           486.287

### Analyze missing values per feature to decide what to drop

In [116]:
# Count the missing values in each column.
aggression_data_df.isna().sum(axis=0)

analyse_harmonics        0
get_max_intensity        0
analyse_intensity        0
analyse_pitch            1
analyse_pitch_range      1
analyse_shimmer          1
analyse_jitter           1
spectral_slope           0
mean_spectral_rolloff    0
get_energy               0
parent_label_train       0
segment_label_train      0
dtype: int64

Remove rows that have `NaN` in any of the feature values

In [117]:
# Drop, in place, every row that contains at least one NaN; the original row
# index labels of the surviving rows are kept (no reindexing happens here).
aggression_data_df.dropna(inplace=True)
# Display the resulting shape to confirm how many rows remain.
aggression_data_df.shape


(2110, 12)

Convert values in 'get_max_intensity' from string to float 

In [118]:
# Convert the 'get_max_intensity' column to a numeric dtype. With its default
# settings pd.to_numeric raises if a value cannot be parsed as a number.
aggression_data_df['get_max_intensity'] = pd.to_numeric(aggression_data_df['get_max_intensity'])

# Inspect the last rows to check the converted column.
aggression_data_df.tail()

Unnamed: 0,analyse_harmonics,get_max_intensity,analyse_intensity,analyse_pitch,analyse_pitch_range,analyse_shimmer,analyse_jitter,spectral_slope,mean_spectral_rolloff,get_energy,parent_label_train,segment_label_train
2142,5.465491,104.113896,69.550541,199.893943,443.057705,0.196757,0.033226,-0.02089,1002.676098,0.036482,2,1
2143,6.879443,108.028173,75.501002,258.53683,379.915372,0.14958,0.024206,-0.081411,1023.670363,0.14129,2,2
2144,4.46877,104.678268,71.100226,194.90422,493.340814,0.162628,0.034465,-0.028155,980.207236,0.051732,2,1
2145,8.581945,109.335715,76.784989,285.988051,520.259229,0.149758,0.022509,-0.108299,1146.211888,0.190454,2,2
2146,2.829222,105.455598,74.654532,186.190085,486.287854,0.195173,0.034362,-0.081877,698.359229,0.116369,2,1


Segregate data df and label df

In [119]:
# Names of the label columns to separate from the feature columns.
feature_labels = ['parent_label_train', 'segment_label_train']

# label_df keeps only the label columns; aggression_df keeps everything else.
# Both retain the row index of aggression_data_df at this point.
label_df = aggression_data_df.filter(items=feature_labels)
aggression_df = aggression_data_df.drop(columns=feature_labels)

# The two frames should have the same number of rows.
print(label_df.shape)
print(aggression_df.shape)

(2110, 2)
(2110, 10)


# Standardization

In [120]:
from sklearn.preprocessing import StandardScaler

# Helper function: returns a standardized dataframe
def standardize(df):
    """Scale every column of ``df`` with StandardScaler and return a new frame.

    Prints the mean and standard deviation taken over all values of the
    array at once (not per column), the shape, and the last five rows,
    both before and after scaling. The returned dataframe keeps the
    column names of ``df`` but is built without an explicit index, so it
    gets a fresh default index rather than the index of ``df``.
    """
    column_names = df.columns
    raw_matrix = df.loc[:, column_names].values

    # Report the raw values so they can be compared with the scaled ones
    print("Before Standardization \n")
    print("Mean and Standard Deviation")
    print(f"Mean = {raw_matrix.mean()} ; Standard Deviation = {raw_matrix.std()}")
    print(f"Shape to verify: {raw_matrix.shape} \n")
    print(f"Tail of the data pre standardization: \n{raw_matrix[-5:]} \n\n")

    # Fit the scaler on the data and transform it in one step
    scaler = StandardScaler()
    scaled_matrix = scaler.fit_transform(raw_matrix)

    # Report the scaled values
    print("After Standardization \n")
    print("Mean and Standard Deviation")
    print(f"Mean = {scaled_matrix.mean()} ; Standard Deviation = {scaled_matrix.std()}")
    print(f"Shape to verify: {scaled_matrix.shape}")
    print(f"Tail of the data post standardization: \n{scaled_matrix[-5:]}")

    standardized_df = pd.DataFrame(scaled_matrix, columns=column_names)
    return standardized_df

Run standardization

In [121]:
# Standardize the feature columns; standardize() also prints before/after
# statistics. The result carries a fresh default index, not aggression_df's.
standardized_aggression_df = standardize(aggression_df)
# Inspect the last rows of the standardized frame.
standardized_aggression_df.tail()

Before Standardization 

Mean and Standard Deviation
Mean = 207.03214462485482 ; Standard Deviation = 405.89809500083527
Shape to verify: (2110, 10) 

Tail of the data pre standardization: 
[[ 5.46549109e+00  1.04113896e+02  6.95505405e+01  1.99893943e+02
   4.43057705e+02  1.96756913e-01  3.32257430e-02 -2.08904800e-02
   1.00267610e+03  3.64816420e-02]
 [ 6.87944281e+00  1.08028173e+02  7.55010015e+01  2.58536830e+02
   3.79915372e+02  1.49580347e-01  2.42058080e-02 -8.14106130e-02
   1.02367036e+03  1.41290270e-01]
 [ 4.46876994e+00  1.04678268e+02  7.11002256e+01  1.94904220e+02
   4.93340814e+02  1.62627554e-01  3.44649850e-02 -2.81549870e-02
   9.80207236e+02  5.17316800e-02]
 [ 8.58194493e+00  1.09335715e+02  7.67849889e+01  2.85988051e+02
   5.20259229e+02  1.49757852e-01  2.25093080e-02 -1.08298994e-01
   1.14621189e+03  1.90454480e-01]
 [ 2.82922212e+00  1.05455598e+02  7.46545323e+01  1.86190085e+02
   4.86287854e+02  1.95173250e-01  3.43624990e-02 -8.18768770e-02
   6.98359

Unnamed: 0,analyse_harmonics,get_max_intensity,analyse_intensity,analyse_pitch,analyse_pitch_range,analyse_shimmer,analyse_jitter,spectral_slope,mean_spectral_rolloff,get_energy
2105,-0.323659,-0.014225,-0.292555,-0.274583,0.202622,0.987592,-0.006539,0.479082,-0.465383,-0.480872
2106,0.053725,0.711694,0.719834,0.365542,-0.350018,-0.25594,-0.565074,-0.215352,-0.424726,0.260425
2107,-0.589685,0.09044,-0.028897,-0.329049,0.642715,0.087973,0.070198,0.395726,-0.508895,-0.37301
2108,0.508124,0.954183,0.938286,0.665189,0.878313,-0.251261,-0.670125,-0.523881,-0.187419,0.608156
2109,-1.027281,0.234599,0.575819,-0.424169,0.580985,0.945848,0.063851,-0.220702,-1.054706,0.084163


Reset label_df index

In [122]:
# Replace label_df's index with a fresh default one (the old index is
# discarded, not kept as a column) so that its rows line up by position with
# standardized_aggression_df, which standardize() builds with a default index.
label_df.reset_index(drop=True, inplace=True)
# Display the re-indexed labels.
label_df

Unnamed: 0,parent_label_train,segment_label_train
0,0,0
1,0,0
2,0,0
3,3,3
4,3,2
...,...,...
2105,2,1
2106,2,2
2107,2,1
2108,2,2
