# Installing Packages

In [None]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install yellowbrick
!{sys.executable} -m pip install matplot
!{sys.executable} -m pip install seaborn

# Preprocessing of Data

In [1]:
import os
import numpy as np
import pandas as pd

# Helper function: load a csv file and hand it back as a dataframe
def read_data(data_file):
  """Parse the CSV at `data_file` with pandas and return the DataFrame."""
  return pd.read_csv(data_file)

def drop_features(data_df, features_to_drop):
  """Remove the given columns from `data_df` in place and return the frame.

  Args:
    data_df: DataFrame to modify; it is mutated in place.
    features_to_drop: column label, or list of column labels, to remove.

  Returns:
    The same `data_df` object, now without the dropped columns.

  Raises:
    KeyError: if a label in `features_to_drop` is not a column of `data_df`.
  """
  # DataFrame.drop(..., inplace=True) always returns None, so returning its
  # result handed callers None. Mutate first, then return the frame itself;
  # callers that ignore the return value still see the in-place change.
  data_df.drop(columns=features_to_drop, inplace=True)
  return data_df

def segregate_noisy_data(data_df):
  """Split `data_df` by its `noisy_train` flag.

  Returns:
    A `(non_noisy_df, noisy_df)` tuple: the rows where `noisy_train == 0`
    and the rows where `noisy_train == 1`, in that order.
  """
  noisy_flag = data_df['noisy_train']
  is_clean = noisy_flag == 0
  is_noisy = noisy_flag == 1
  return data_df.loc[is_clean], data_df.loc[is_noisy]

def remove_noisy_data(data_df, noisy_column='noisy'):
  """Drop the rows flagged as noisy from `data_df` in place.

  Args:
    data_df: DataFrame to modify; it is mutated in place.
    noisy_column: name of the flag column; rows where it equals 1 are
      dropped. Defaults to 'noisy' for backward compatibility. Note that
      `segregate_noisy_data` in this notebook reads the flag from
      'noisy_train', so pass that name for frames with that column.

  Returns:
    The same `data_df` object with the noisy rows removed.

  Raises:
    KeyError: if `noisy_column` is not a column of `data_df`.
  """
  # Compute the index labels of the noisy rows once and reuse them for both
  # the report and the drop.
  noisy_index = data_df.index[data_df[noisy_column] == 1]

  # The message used to say "low duration" (copied from the duration filter);
  # this function removes rows by the noisy flag.
  print(f'Rows dropped because they are flagged as noisy: {len(noisy_index)}\n')

  # drop(..., inplace=True) returns None, so return the frame explicitly.
  data_df.drop(noisy_index, inplace=True)
  return data_df

# Helper function: drops too-short rows and extracts file names and labels
def preprocessing_data(data_df, labels, min_duration=2):
  """Filter `data_df` by duration and pull out file names and label columns.

  Rows whose 'duration' value is below `min_duration` are removed from
  `data_df` in place, and the number of removed rows is printed.

  Args:
    data_df: DataFrame with at least 'duration' and 'file_name' columns; it
      is mutated in place.
    labels: list of column names to extract as labels. Names that are not
      columns of `data_df` are silently skipped by `DataFrame.filter`.
    min_duration: rows with 'duration' strictly below this value are
      dropped. Defaults to 2, the threshold previously hard-coded here.

  Returns:
    A tuple `(data_df, filenames, data_label)`: the same (now filtered)
    frame, the remaining 'file_name' values as a list, and a DataFrame
    holding only the requested label columns.
  """
  # Index labels of the rows that are too short; computed once and reused for
  # both the report and the drop.
  short_rows = data_df.index[data_df['duration'] < min_duration]

  # Clean data based on duration value
  print(f'''Rows dropped because of low duration: \
        {len(short_rows)}\n''')

  data_df.drop(short_rows, inplace=True)

  # Extract filenames and labels (from the already-filtered frame)
  filenames = data_df['file_name'].values.tolist()
  data_label = data_df.filter(items=labels)

  return (data_df, filenames, data_label)


Read Data

In [2]:
# Load the extracted training features from disk into a dataframe
TRAIN_FEATURES_CSV = './extracted/full-train-features.csv'
aggression_data_df = read_data(TRAIN_FEATURES_CSV)

Set up the features to drop and the columns to use as labels

In [3]:
# Columns that are extracted as labels by the preprocessing step
feature_labels = [
    'parent_label_train',
    'segment_label_train',
]

# Columns removed from the dataframe before the features are analysed
features_to_drop = [
    'file_name',
    'duration',
    'get_number_sylls',
    'get_number_words',
    'pauses',
    'parent_file_train',
    'number_of_segments_train',
    'number_of_voices_parent_train',
    'extra_sounds_train',
    'parent_label_train',
    'segment_label_train',
    'noisy_train',
    'Notes_train',
]

Run preprocessing of data

In [4]:
# Report the size and the last few rows of the raw data
print(f'Shape of the data before cleaning it: {aggression_data_df.shape}\n')
print(aggression_data_df.tail())

# Filter by duration, then unpack the frame, file names and label columns
preprocessed = preprocessing_data(aggression_data_df, feature_labels)
aggression_data_df, filenames, labels = preprocessed

# Report the size again so the effect of the filter is visible
print(f'Shape of the data after cleaning it: {aggression_data_df.shape} \n')

Shape of the data before cleaning it: (2147, 23)

                      file_name  duration  analyse_harmonics  \
2142  xdz15thrnem51_processed_5      10.0           5.465491   
2143  xdz15thrnem51_processed_6      10.0           6.879443   
2144  xdz15thrnem51_processed_7      10.0           4.468770   
2145  xdz15thrnem51_processed_8      10.0           8.581945   
2146  xdz15thrnem51_processed_9      10.0           2.829222   

      get_number_sylls  get_number_words  pauses get_max_intensity  \
2142               1.3               1.0     0.1       104.1138965   
2143               0.2               0.3     0.1        108.028173   
2144               1.8               1.1     0.1       104.6782678   
2145               0.3               0.1     0.1        109.335715   
2146               0.6               0.6     0.1       105.4555978   

      analyse_intensity  analyse_pitch  analyse_pitch_range  ...  \
2142          69.550541     199.893943           443.057705  ...   
2143    

Segregate data based on noisy vs not noisy

In [5]:
# Partition the rows by their `noisy_train` flag (0 -> non noisy, 1 -> noisy)
segregated = segregate_noisy_data(aggression_data_df)
non_noisy_df, noisy_df = segregated

# Show how many rows ended up in each partition
print(f"Shape of noisy data: {noisy_df.shape} \n")
print(f"Shape of non noisy data: {non_noisy_df.shape}\n")

Shape of noisy data: (262, 23) 

Shape of non noisy data: (1849, 23)



Drop features

In [6]:
# Strip the columns listed in `features_to_drop`; the frame is modified in
# place, so the shape printed afterwards reflects the removal.
print(f'Shape of the data before dropping features: {aggression_data_df.shape} \n')
drop_features(data_df=aggression_data_df, features_to_drop=features_to_drop)

print(f'Shape of the data after dropping features: {aggression_data_df.shape} \n')
print(aggression_data_df.tail())

Shape of the data before dropping features: (2111, 23) 

Shape of the data after dropping features: (2111, 10) 

      analyse_harmonics get_max_intensity  analyse_intensity  analyse_pitch  \
2142           5.465491       104.1138965          69.550541     199.893943   
2143           6.879443        108.028173          75.501002     258.536830   
2144           4.468770       104.6782678          71.100226     194.904220   
2145           8.581945        109.335715          76.784989     285.988051   
2146           2.829222       105.4555978          74.654532     186.190085   

      analyse_pitch_range  analyse_shimmer  analyse_jitter  spectral_slope  \
2142           443.057705         0.196757        0.033226       -0.020890   
2143           379.915372         0.149580        0.024206       -0.081411   
2144           493.340814         0.162628        0.034465       -0.028155   
2145           520.259229         0.149758        0.022509       -0.108299   
2146           486.287

### Analyze which features to drop

In [7]:
# Count the missing values in each remaining column
missing_per_column = aggression_data_df.isnull().sum(axis=0)
print(missing_per_column)

analyse_harmonics        0
get_max_intensity        0
analyse_intensity        0
analyse_pitch            1
analyse_pitch_range      1
analyse_shimmer          1
analyse_jitter           1
spectral_slope           0
mean_spectral_rolloff    0
get_energy               0
dtype: int64
