# Remove all files with low validation scores from the analysis

Remove all rows in the dataset with low agreement between annotators. Agreement is low when the two annotators' labels differ by 2 or more (|difference| ≥ 2), which is the threshold the code below applies.

## Install Packages

In [None]:
import sys
# !{sys.executable} -m pip install numpy
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install scikit-learn
# !{sys.executable} -m pip install matplotlib
# !{sys.executable} -m pip install seaborn

import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
file_path='CleanData/'

Helper functions below

In [None]:
def join_dfs(train, val):
  '''
    Join the training and validation annotations so their ratings can be compared.

    Both frames need 'file_name', 'segment_label' and 'parent_label' columns.
    The inputs are left untouched: the merge runs on copies whose 'file_name'
    has been stripped of surrounding whitespace.

    Returns the left join of train with val on 'file_name' (every train row is
    kept; columns present in both frames get the '_train' / '_val' suffixes)
    with four extra columns:
      match_s_diff, match_s_diff_abs -- signed / absolute train-minus-val
                                        difference of the segment labels
      match_p_diff, match_p_diff_abs -- the same for the parent labels
    Train rows without a validation partner hold NaN in all four.
  '''
  # assign() returns a new frame, so the caller's data is not modified
  train = train.assign(file_name=train['file_name'].str.strip())
  val = val.assign(file_name=val['file_name'].str.strip())

  df_merged = pd.merge(train, val, on='file_name', how='left', suffixes=('_train', '_val'))

  # Same computation for both label kinds; the order fixes the output column order
  for prefix, label in (('match_s', 'segment_label'), ('match_p', 'parent_label')):
    diff = df_merged[f'{label}_train'] - df_merged[f'{label}_val']
    df_merged[f'{prefix}_diff'] = diff
    df_merged[f'{prefix}_diff_abs'] = diff.abs()

  return df_merged

In [None]:
def remove_files(df, dataset_name):
  '''
    Delete the audio clips the annotators disagreed on, then delete the
    intermediate audio files of the dataset.

    A clip is low-agreement when its segment-label or parent-label difference
    is not below 2. Only clips that have a validation label (segment_label_val
    is not NaN) are considered, so clips without one stay on disk.

    df           -- merged frame; needs the columns file_name, parent_file_train,
                    segment_label_val, match_s_diff_abs and match_p_diff_abs
    dataset_name -- root folder of the dataset; each clip is looked up at
                    <dataset_name>/<parent_file_train>/<file_name>.wav

    After that, every *_processed.wav, *_raw.wav and *_raw.aac file anywhere
    below dataset_name is deleted as well, whatever its validation status.

    Returns None. WARNING: the deletions are permanent.
  '''
  agreed = (df.match_s_diff_abs < 2) & (df.match_p_diff_abs < 2)
  has_validation = df['segment_label_val'].notna()
  df_to_remove = df.loc[~agreed & has_validation, ['file_name', 'parent_file_train']]

  for file_name, parent_file in zip(df_to_remove['file_name'], df_to_remove['parent_file_train']):
    try:
      # remove file with poor validation
      Path(f"{dataset_name}/{parent_file}/{file_name}.wav").unlink()
    except FileNotFoundError as e:
      # Clip is already gone (e.g. on a re-run): report it and carry on
      print(e)

  print("All low validation files removed")

  for pattern in ('**/*_processed.wav', '**/*_raw.wav', '**/*_raw.aac'):
    # Collect the matches first so deleting does not disturb the directory walk
    for path in list(Path(dataset_name).glob(pattern)):
      path.unlink()

  print("All extra files removed")

In [None]:
def remove_rows(df, filename, type):
  '''
    Write one annotator's label CSV with the low-agreement rows removed, so the
    labels reflect the audio clips that were deleted.

    A row is kept only when match_s_diff_abs is not NaN and neither
    match_s_diff_abs nor match_p_diff_abs is 2 or more.

    df       -- merged frame with the match_* columns and the '_train' / '_val'
                suffixed label columns
    filename -- name of the CSV written to CleanData/ValDrop/ (the folder is
                created when missing)
    type     -- which file is being cleaned: 'train' drops every column whose
                name contains '_val', 'validate' drops every column whose name
                contains '_train'. (The name shadows the builtin but is kept
                for existing callers.)

    All columns whose name contains 'match_' are dropped as well.
    Raises ValueError when type is neither 'train' nor 'validate'.
  '''
  # The columns to drop belong to the opposite set of the one being generated
  suffix_to_drop = {'train': 'val', 'validate': 'train'}
  if type not in suffix_to_drop:
    raise ValueError(f"type must be 'train' or 'validate', got {type!r}")
  dropped_suffix = suffix_to_drop[type]

  disagree = (df.match_s_diff_abs >= 2) | (df.match_p_diff_abs >= 2)
  df_to_keep = df[~disagree & df['match_s_diff_abs'].notna()]

  df_to_keep = df_to_keep.drop(columns=list(df_to_keep.filter(regex=f'_{dropped_suffix}')))
  df_to_keep = df_to_keep.drop(columns=list(df_to_keep.filter(regex='match_')))

  # to_csv does not create missing folders
  Path("CleanData/ValDrop").mkdir(parents=True, exist_ok=True)
  df_to_keep.to_csv(f"CleanData/ValDrop/{filename}", index=False)

## `Part A` of the data

Read Data

In [None]:
# Part A annotations: anno_train becomes the left (training) side of the merge, anno_val the validation side.
# The trailing initials in the file names (PK, CW) presumably identify the annotators — confirm.
anno_train = pd.read_csv(file_path+'labelled-reddit-2021-08-12-A-PK.csv')
anno_val = pd.read_csv(file_path+'labelled-reddit-2021-08-12-A-CW.csv')

Merge the training and validation data

In [None]:
# Left join on file_name: every training row is kept and the match_* difference columns are added.
df_merged = join_dfs(anno_train, anno_val)

In [None]:
# Show the merged frame in the notebook output.
df_merged

Remove files with low validation scores

In [None]:
# Destructive: deletes the low-agreement clips under A/<parent_file>/ and every
# *_processed.wav, *_raw.wav and *_raw.aac file below A/.
remove_files(df_merged, "A")

Remove low validation rows from the file

In [None]:
# Writes CleanData/ValDrop/A-cleaned.csv: agreed rows only, with the *_val and match_* columns dropped.
remove_rows(df_merged, "A-cleaned.csv", "train")

In [None]:
# Writes CleanData/ValDrop/A-cleaned-val.csv: agreed rows only, with the *_train and match_* columns dropped.
remove_rows(df_merged, "A-cleaned-val.csv", "validate")

## `Part E` of the data

Read Data

In [None]:
# Part E annotations: anno_train becomes the left (training) side of the merge, anno_val the validation side.
# The trailing initials in the file names (VP, PK) presumably identify the annotators — confirm.
anno_train = pd.read_csv(file_path+'labelled-reddit-2021-08-12-E-VP.csv')
anno_val = pd.read_csv(file_path+'labelled-reddit-2021-08-12-E-PK.csv')

Merge the training and validation data

In [None]:
# Left join on file_name: every training row is kept and the match_* difference columns are added.
df_merged = join_dfs(anno_train, anno_val)

Remove files with low validation scores

In [None]:
# Destructive: deletes the low-agreement clips under E/<parent_file>/ and every
# *_processed.wav, *_raw.wav and *_raw.aac file below E/.
remove_files(df_merged, "E")

Remove low validation rows from the file

In [None]:
# Writes CleanData/ValDrop/E-cleaned.csv: agreed rows only, with the *_val and match_* columns dropped.
remove_rows(df_merged, "E-cleaned.csv", "train")

In [None]:
# Writes CleanData/ValDrop/E-cleaned-val.csv: agreed rows only, with the *_train and match_* columns dropped.
remove_rows(df_merged, "E-cleaned-val.csv", "validate")

In [None]:
# Reload the cleaned part E training labels written to CleanData/ValDrop and load the extracted features.
# Note that anno_train is rebound here from the raw annotations to the cleaned ones.
anno_train = pd.read_csv('./CleanData/ValDrop/E-cleaned.csv')
features = pd.read_csv('./extracted/E-features.csv')

Some of the files were not labeled at all; keep only the features whose clip is still present in the cleaned labels.

In [None]:
# Keep only the feature rows whose 'filepath' value appears in the cleaned labels' file_name column.
# NOTE(review): isin() is an exact match — assumes 'filepath' holds the same clip identifiers
# as file_name (no folder prefix or extension); confirm against E-features.csv.
cleaned_features = features[features['filepath'].isin(anno_train['file_name'])]

In [None]:
# Save the filtered features alongside the original extraction output (plain string: nothing to interpolate).
cleaned_features.to_csv("extracted/E-features-clean.csv", index=False)

## `Part H` of the data

Read Data

In [None]:
# Part H annotations: anno_train becomes the left (training) side of the merge, anno_val the validation side.
# The trailing initials in the file names (KK_SA, TL) presumably identify the annotators — confirm.
anno_train = pd.read_csv(file_path+'labelled-reddit-2021-08-31-H-KK_SA.csv')
anno_val = pd.read_csv(file_path+'labelled-reddit-2021-08-31-H-TL.csv')

Merge the training and validation data

In [None]:
# Left join on file_name: every training row is kept and the match_* difference columns are added.
df_merged = join_dfs(anno_train, anno_val)

Inspect the merged data, then remove files with low validation scores

In [None]:
# Show the merged frame in the notebook output.
df_merged

In [None]:
# Destructive: deletes the low-agreement clips under H/<parent_file>/ and every
# *_processed.wav, *_raw.wav and *_raw.aac file below H/.
# remove_files has no return value, so the result is not assigned.
remove_files(df_merged, "H")

Remove low validation rows from the file

In [None]:
# Writes CleanData/ValDrop/H-cleaned.csv: agreed rows only, with the *_val and match_* columns dropped.
remove_rows(df_merged, "H-cleaned.csv", "train")

In [None]:
# Writes CleanData/ValDrop/H-cleaned-val.csv: agreed rows only, with the *_train and match_* columns dropped.
remove_rows(df_merged, "H-cleaned-val.csv", "validate")

Some of the files were not labeled at all; keep only the features whose clip is still present in the cleaned labels.

In [None]:
# Reload the cleaned part H training labels written to CleanData/ValDrop and load the extracted features.
# Note that anno_train is rebound here from the raw annotations to the cleaned ones.
anno_train = pd.read_csv('./CleanData/ValDrop/H-cleaned.csv')
features = pd.read_csv('./extracted/H-features.csv')

In [None]:
# Keep only the feature rows whose 'filepath' value appears in the cleaned labels' file_name column.
# NOTE(review): isin() is an exact match — assumes 'filepath' holds the same clip identifiers
# as file_name (no folder prefix or extension); confirm against H-features.csv.
cleaned_features = features[features['filepath'].isin(anno_train['file_name'])]

In [None]:
# Save the filtered features alongside the original extraction output (plain string: nothing to interpolate).
cleaned_features.to_csv("extracted/H-features-clean.csv", index=False)

## `Part F` of the data

Read Data

In [None]:
# Part F annotations: anno_train becomes the left (training) side of the merge, anno_val the validation side.
# The trailing initials in the file names (SA, CW) presumably identify the annotators — confirm.
anno_train = pd.read_csv(file_path+'labelled-reddit-2021-08-12-F-SA.csv')
anno_val = pd.read_csv(file_path+'labelled-reddit-2021-08-12-F-CW.csv')

Merge the training and validation data

In [None]:
# Left join on file_name: every training row is kept and the match_* difference columns are added.
df_merged = join_dfs(anno_train, anno_val)

Remove files with low validation scores

In [None]:
# Destructive: deletes the low-agreement clips under F/<parent_file>/ and every
# *_processed.wav, *_raw.wav and *_raw.aac file below F/.
remove_files(df_merged, "F")

Remove low validation rows from the file

In [None]:
# Writes CleanData/ValDrop/F-cleaned.csv: agreed rows only, with the *_val and match_* columns dropped.
remove_rows(df_merged, "F-cleaned.csv", "train")

In [None]:
# Writes CleanData/ValDrop/F-cleaned-val.csv: agreed rows only, with the *_train and match_* columns dropped.
remove_rows(df_merged, "F-cleaned-val.csv", "validate")

Some of the files were not labeled at all; keep only the features whose clip is still present in the cleaned labels.

In [None]:
# Reload the cleaned part F training labels written to CleanData/ValDrop and load the extracted features.
# Note that anno_train is rebound here from the raw annotations to the cleaned ones.
anno_train = pd.read_csv('./CleanData/ValDrop/F-cleaned.csv')
features = pd.read_csv('./extracted/F-features.csv')

In [None]:
# Keep only the feature rows whose 'filepath' value appears in the cleaned labels' file_name column.
# NOTE(review): isin() is an exact match — assumes 'filepath' holds the same clip identifiers
# as file_name (no folder prefix or extension); confirm against F-features.csv.
cleaned_features = features[features['filepath'].isin(anno_train['file_name'])]

In [None]:
# Save the filtered features alongside the original extraction output (plain string: nothing to interpolate).
cleaned_features.to_csv("extracted/F-features-clean.csv", index=False)