In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [15]:
# Load the CSV
csv_file = '../data/dataset/train_labels.csv'
df = pd.read_csv(csv_file)

In [16]:
# How many images
num_images = df.shape
print("Number of images:", num_images)

Number of images: (101442, 2)


In [17]:
#  How many classes are in the labels
unique_classes = df['class'].nunique()
print("Number of unique classes:", unique_classes)

Number of unique classes: 2


In [18]:
class_counts = df['class'].value_counts()
print(class_counts)

class
NRG    98172
RG      3270
Name: count, dtype: int64


In [19]:
df.head()

Unnamed: 0,challenge_id,class
0,TRAIN000000,NRG
1,TRAIN000001,NRG
2,TRAIN000002,NRG
3,TRAIN000003,NRG
4,TRAIN000004,NRG


In [21]:
# encode dataset classes

label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['class'])
df = df.drop(columns=['class'])

# Save the dataframe
df.to_csv('../data/dataset/encoded_dataset.csv', index=False)

In [22]:
# Balance the dataset
data_path = '../data/dataset/encoded_dataset.csv'
data = pd.read_csv(data_path)

# Separate the data based on the label
label_0_data = data[data['labels'] == 0]
label_1_data = data[data['labels'] == 1]

# The number of samples to keep
num_samples = min(len(label_0_data), len(label_1_data))

# Sample an equal number of samples from each class
sampled_label_0 = label_0_data.sample(n=num_samples, random_state=42)
sampled_label_1 = label_1_data.sample(n=num_samples, random_state=42)

# Concatenate the dataset
balanced_data = pd.concat([sampled_label_0, sampled_label_1])

# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save to new .csv file
balanced_data.to_csv('../data/dataset/reduced_balanced_data.csv', index=False)

In [23]:
# create test data and remove it from the .csv file

data_path = '../data/dataset/reduced_balanced_data.csv'
data = pd.read_csv(data_path)


# Split the data into the 10% sample and the remaining data
sampled_data, remaining_data = train_test_split(data, test_size=0.10, stratify=data['labels'], random_state=42)

# Save the sampled data to a new CSV file
sampled_data.to_csv('../data/dataset/reduced_encoded_train_data.csv', index=False)

# Save the remaining data back to the original CSV file
remaining_data.to_csv('../data/dataset/test_data.csv', index=False)