In [None]:
! pip install pandas numpy matplotlib seaborn

In [19]:
import glob
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [20]:
# Glob pattern selecting the raw text files that are loaded and concatenated below.
RAW_DATASET_PATH = 'dataset_020/color_dataset*.txt'
# Destination CSV for the concatenated, outlier-filtered dataset.
PROCESSED_DATASET_PATH = 'dataset_020/color_dataset_concat.csv'

In [21]:
# glob.glob() returns matches in arbitrary (filesystem-dependent) order; sort them so
# the row order of the concatenated dataset is reproducible across machines and runs.
file_paths = sorted(glob.glob(RAW_DATASET_PATH))
# Accumulator for the per-file DataFrames (re-initialised in the loading cell further down).
dfs = []

In [38]:
def remove_outliers(df, column_name, batch_size = 100, hop_length_ratio = 0.5, threshold = 0.05):
    """
    Find and remove outliers in one column of a DataFrame using overlapping batches.

    The column is scanned in windows of ``batch_size`` rows that advance by
    ``int(batch_size * hop_length_ratio)`` rows. Within each window, a value is flagged when its
    relative deviation from the window mean, ``abs((value - mean) / mean)``, exceeds ``threshold``.
    A row is removed if it is flagged in at least one of the windows that contain it.

    :param df: A pandas DataFrame containing the dataset. Its index may be arbitrary; rows are
        addressed by position, not by index label.
    :param column_name: The name of the (numeric) column to check for outliers.
    :param batch_size: The number of rows in each batch.
    :param hop_length_ratio: Fraction of the batch size used as the step between consecutive batches.
    :param threshold: Maximum allowed relative deviation from the batch mean, expressed as a
        fraction (0.05 means 5%).
    :return: A tuple ``(clean_df, outliers)``: the DataFrame with the outlier rows removed and its
        index reset, and the sorted list of row positions (in the input ``df``) that were removed.
    :raises ValueError: If ``batch_size * hop_length_ratio`` truncates to less than 1, because the
        window could then never advance.
    """
    hop_length = int(batch_size * hop_length_ratio)  # Step between the starts of consecutive batches
    if hop_length < 1:
        raise ValueError(
            "batch_size * hop_length_ratio must be at least 1 "
            f"(got batch_size={batch_size}, hop_length_ratio={hop_length_ratio})"
        )

    data_column = df[column_name].to_numpy()  # Extract the relevant column
    # One flag per row; OR-ing into it de-duplicates rows flagged by several overlapping batches.
    outlier_mask = np.zeros(len(data_column), dtype=bool)

    # Loop over the dataset in batches with the defined hop length
    for start in range(0, len(data_column), hop_length):
        batch = data_column[start:start + batch_size]  # Slicing clips the last batches to the data length
        mean = np.mean(batch)

        # A zero mean makes the relative deviation inf (or nan for zero values); keep that
        # outcome (non-zero values are flagged, zeros are not) but silence the NumPy warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            percentage_diff = np.abs((batch - mean) / mean)

        outlier_mask[start:start + len(batch)] |= percentage_diff > threshold

    # Positions of the flagged rows, ascending, as plain Python ints
    outliers = np.flatnonzero(outlier_mask).tolist()

    # Select by position: `outliers` are row positions, which only coincide with index labels
    # when the DataFrame carries a default RangeIndex.
    clean_df = df.iloc[~outlier_mask].reset_index(drop=True)

    return clean_df, outliers

# Example call of the function (assuming 'df' is your DataFrame and "R" is the column to check).
# This will find and remove outliers in the specified column with a batch size of 5, a hop length ratio of 0.25, and a threshold of 1%:
# clean_df, outlier_rows = remove_outliers(df, column_name="R", batch_size=5, hop_length_ratio=0.25, threshold=0.01)


In [23]:
# Load a single raw file as a sample frame for trying out the outlier filter.
# Only file columns 1-5 are read; column 0 is skipped.
test = pd.read_csv(
    'dataset_020/color_dataset01.txt',
    sep=',',
    header=None,
    usecols=[1, 2, 3, 4, 5],
    names=["control_value", "R", "G", "B", "W"],
)

In [39]:
# Run the outlier filter independently on each channel of the sample frame and keep only
# the list of flagged row positions (element [1] of the result). B uses a looser threshold.
outlier_rowsR, outlier_rowsG, outlier_rowsB, outlier_rowsW = (
    remove_outliers(df=test, column_name=channel, batch_size=100, hop_length_ratio=0.5, threshold=limit)[1]
    for channel, limit in (("R", 0.05), ("G", 0.05), ("B", 0.1), ("W", 0.05))
)

# One list per line, in R, G, B, W order.
print(outlier_rowsR, outlier_rowsG, outlier_rowsB, outlier_rowsW, sep="\n")

[0, 997, 998, 999]
[0, 997, 998, 999]
[0, 997, 998, 999]
[0, 997, 998, 999]


In [25]:
# Load every raw file, filter outliers channel by channel, and stack the results.
dfs = []
for file_path in file_paths:
    df = pd.read_csv(
        file_path,
        sep=',',
        header=None,
        usecols=[1, 2, 3, 4, 5],
        names=["control_value", "R", "G", "B", "W"],
    )
    # The passes are sequential: each channel is checked on the rows that survived
    # the previous channel's filter. B uses a looser threshold than the others.
    for channel, limit in (("R", 0.05), ("G", 0.05), ("B", 0.1), ("W", 0.05)):
        df = remove_outliers(df=df, column_name=channel, batch_size=100, hop_length_ratio=0.5, threshold=limit)[0]
    dfs.append(df)

# Concatenate all cleaned files into one frame with a fresh 0..n-1 index.
final_df = pd.concat(dfs, ignore_index=True)

In [26]:
# Display the concatenated, outlier-filtered dataset as the cell output.
final_df

Unnamed: 0,control_value,R,G,B,W
0,0.002,1332,896,672,2765
1,0.003,1330,894,670,2759
2,0.004,1329,893,670,2756
3,0.005,1329,892,669,2755
4,0.006,1329,892,669,2755
...,...,...,...,...,...
19955,0.995,8175,13192,13746,35797
19956,0.996,8168,13196,13758,35809
19957,0.997,8161,13201,13769,35819
19958,0.998,8160,13201,13769,35820


In [None]:
# Persist the cleaned dataset; index=False leaves the row index out of the CSV.
final_df.to_csv(PROCESSED_DATASET_PATH, index=False)

## Plot

In [None]:
# Move the row index into a regular "index" column so the plots below can use it as an x-axis.
dfi = final_df.reset_index()
dfi

In [None]:
sns.set_theme(style="darkgrid")

# Figure 1: W channel against the control value. With its default settings, lineplot
# aggregates the rows that share a control_value, which is what the title describes.
plt.figure(figsize=(20,6))
plt.title("Aggregated full spectrum with mean and 95% confidence over the control value", fontsize=20)

sns.lineplot(data=dfi, x="control_value", y="W", color="black", label="W", linewidth=2, alpha=0.6)
plt.xlim(0, 1)
plt.legend(loc='upper left', fontsize=12)
plt.show()

# Figure 2: R, G and B against the row position in the concatenated dataset.
plt.figure(figsize=(20,6))
plt.title("RGB spectrums plots", fontsize=20)

sns.lineplot(data=dfi, x="index", y="R", color="red", label="R", linewidth=2, alpha=0.6)
sns.lineplot(data=dfi, x="index", y="G", color="green", label="G",  linewidth=2, alpha=0.6)
sns.lineplot(data=dfi, x="index", y="B", color="blue", label="B",  linewidth=2, alpha=0.6)

# Three channels share this axis; without an explicit label the axis would keep the
# name of the first plotted series ("R").
plt.ylabel("R / G / B")
# Zoom into rows 7000-8000 and the 1000-2500 value range.
# NOTE(review): this window depends on the order in which the files were concatenated.
plt.xlim(7000, 8000)
plt.ylim(1000, 2500)
plt.legend(loc='upper left', fontsize=12)
plt.show()

In [None]:
# Scatter of the raw (non-aggregated) B readings against the control value.
plt.figure(figsize=(20,6))
plt.title("Single color entries over control value", fontsize=20)

# The legend label and colour now match the plotted column: the cell plots "B", but was
# labelled "W" and drawn in the black used for W in the spectrum plot.
sns.scatterplot(data=dfi, x="control_value", y="B", color="blue", label="B", linewidth=2, alpha=0.6)
plt.xlim(0, 1)
plt.legend(loc='upper left', fontsize=12)
plt.show()