In [1]:
import pandas as pd

## Goal

In [2]:
df_orig = pd.read_csv('/work/com-304/SAGA/vggsound_valid.csv')
df_orig.head()

Unnamed: 0,video_clip_name,timestamp,class,group_name
0,xEqmJZ-ET9k,520,"child speech, kid speaking",train
1,qx1Ld5-3Nv8,61,fox barking,train
2,SBlN_GSNLQk,15,car engine knocking,train
3,Al186fgrAzc,8,fireworks banging,train
4,H1_5M9mQ79w,80,playing harmonica,train


In [3]:
df = pd.read_csv("vggss.csv")
df = df.drop(columns=["Xmin", "Xmax", "Ymin", "Ymax"])
df = df.rename(columns={"video_clip_name": "video_id"})
df[['video_clip_name', 'timestamp']] = df['video_id'].str.extract(r'(.+)_(\d+)')
# Convert timestamp from string to integer and remove leading zeros
df['timestamp'] = df['timestamp'].astype(int)
# Optionally drop the original video_id column
df = df.drop(columns=['video_id'])
df = df.sort_values(by=["class"])
df = df[['video_clip_name', 'timestamp', 'class']]
df.head()

Unnamed: 0,video_clip_name,timestamp,class
2297,WLEZDT_vGuM,16,air conditioning noise
3073,-wi8kPVJLcw,205,air conditioning noise
2049,6uB3h15H90k,102,air conditioning noise
86,Mqlg9NYMidw,11,air conditioning noise
3444,yfcMvoqT0iM,73,air conditioning noise


In [4]:
# Perform a merge (join) of the two DataFrames
merged_df = df.merge(df_orig, 
                    on='video_clip_name',
                    how='inner',  # You can use 'left', 'right', or 'outer' instead
                    suffixes=('', '_orig'))

# Display the result
print("First few rows of merged DataFrame:")

# Check the number of rows to ensure the merge worked as expected
print(f"\nNumber of rows in original df: {len(df)}")
print(f"Number of rows in df_orig: {len(df_orig)}")
print(f"Number of rows in merged df: {len(merged_df)}")

# Optional: Save the merged DataFrame
# merged_df.to_csv("merged_dataset.csv", index=False)
merged_df = merged_df.drop(columns=["timestamp_orig", "class_orig"])
merged_df.head()


First few rows of merged DataFrame:

Number of rows in original df: 5158
Number of rows in df_orig: 168158
Number of rows in merged df: 4765


Unnamed: 0,video_clip_name,timestamp,class,group_name
0,WLEZDT_vGuM,16,air conditioning noise,train
1,-wi8kPVJLcw,205,air conditioning noise,train
2,Mqlg9NYMidw,11,air conditioning noise,train
3,Mqlg9NYMidw,11,air conditioning noise,test
4,yfcMvoqT0iM,73,air conditioning noise,train


In [5]:
# Get unique class count
num_classes = merged_df['class'].nunique()
print(f"Total number of unique classes: {num_classes}")

# Show class distribution
class_distribution = merged_df['class'].value_counts()
print("\nNumber of samples per class:")
class_distribution.head(20)

Total number of unique classes: 221

Number of samples per class:


class
bull bellowing                        53
playing congas                        50
playing djembe                        49
driving snowmobile                    44
playing cornet                        44
pheasant crowing                      44
cat hissing                           42
tractor digging                       41
train wheels squealing                41
alligators, crocodiles hissing        39
gibbon howling                        39
coyote howling                        39
telephone bell ringing                38
playing steel guitar, slide guitar    37
tap dancing                           37
playing oboe                          36
reversing beeps                       36
dog growling                          35
baltimore oriole calling              34
playing violin, fiddle                34
Name: count, dtype: int64

In [6]:
# Define the classes we want to keep
selected_classes = ["cat hissing", "tap dancing"]

# Filter the DataFrame
filtered_df = merged_df[merged_df['class'].isin(selected_classes)]

# Display basic information about the filtered dataset
print("Filtered Dataset Summary:")
print(f"Total samples: {len(filtered_df)}\n")

# Show distribution of classes in filtered dataset
class_counts = filtered_df['class'].value_counts()
print("Samples per class:")
print(class_counts)

# Display first few rows of filtered dataset
print("\nFirst few rows of filtered dataset:")
filtered_df.head()

# Optional: Save filtered dataset to CSV
# filtered_df.to_csv("filtered_dataset.csv", index=False)

Filtered Dataset Summary:
Total samples: 79

Samples per class:
class
cat hissing    42
tap dancing    37
Name: count, dtype: int64

First few rows of filtered dataset:


Unnamed: 0,video_clip_name,timestamp,class,group_name
556,HhvCh6gOZS8,372,cat hissing,train
557,2KEg1a42Wx0,288,cat hissing,test
558,vEvvRfuVk30,108,cat hissing,train
559,FfwAkXR6JMc,25,cat hissing,test
560,VaqR7OOMHSM,33,cat hissing,test


In [7]:
# Create group distribution summary
group_summary = filtered_df.groupby(['class', 'group_name']).size().unstack(fill_value=0)

# Add total row and column
group_summary.loc['Total'] = group_summary.sum()
group_summary['Total'] = group_summary.sum(axis=1)

# Display the summary
print("Distribution of samples by class and group:")
print("==========================================")
print(group_summary)

# Calculate percentages
print("\nPercentage distribution:")
print("=======================")
percentages = group_summary.div(group_summary['Total'], axis=0) * 100
print(percentages.round(1))

# Optional: Detailed counts per class
print("\nDetailed counts:")
print("===============")
for cls in selected_classes:
    class_data = filtered_df[filtered_df['class'] == cls]
    print(f"\n{cls}:")
    for group in sorted(class_data['group_name'].unique()):
        count = len(class_data[class_data['group_name'] == group])
        print(f"  {group}: {count} samples")

Distribution of samples by class and group:
group_name   eval  test  train  Total
class                                
cat hissing     3     8     31     42
tap dancing     3    10     24     37
Total           6    18     55     79

Percentage distribution:
group_name   eval  test  train  Total
class                                
cat hissing   7.1  19.0   73.8  100.0
tap dancing   8.1  27.0   64.9  100.0
Total         7.6  22.8   69.6  100.0

Detailed counts:

cat hissing:
  eval: 3 samples
  test: 8 samples
  train: 31 samples

tap dancing:
  eval: 3 samples
  test: 10 samples
  train: 24 samples


In [12]:
# Save the filtered DataFrame to a CSV file
filtered_df = filtered_df.sort_values(by=["group_name"], ascending=False)
filtered_df.to_csv("small_vgg_250522_14_22.csv", index=False)