In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt

Matplotlib is building the font cache; this may take a moment.


## Load Data

In [14]:
N = 5250  # Upper bound on the number of lines taken from each file

# Features: every line holds one sample as comma-separated numbers.
with open("traindata.txt", "r") as data_file:
    training_data = [
        [float(field) for field in row.split(",")]
        for row in data_file.readlines()[:N]
    ]

# Labels: one number per line; parsed as float here and cast to int64 below.
with open("trainlabels.txt", "r") as labels_file:
    training_labels = [
        float(row.rstrip()) for row in labels_file.readlines()[:N]
    ]

X = np.array(training_data)
y = np.array(training_labels, dtype=np.int64)

In [15]:
# Sanity check: show the shape of the feature matrix and of the label vector.
print(X.shape, y.shape)

(5250, 1041) (5250,)


In [16]:
def calculate_std(data, mean):
    """Return the root-mean-square deviation of ``data`` around ``mean``.

    ``mean`` is whatever centre the caller supplies; when it is the arithmetic
    mean of ``data`` the result is the population standard deviation.
    """
    deviation = data - mean
    mean_squared_deviation = np.mean(np.square(deviation))
    return np.sqrt(mean_squared_deviation)

In [17]:
def generate_feature_cover(X, std_tol = 500):
    """Flag the features whose spread across samples exceeds ``std_tol``.

    Parameters
    ----------
    X : 2-D array-like
        Samples along axis 0, features along axis 1.
    std_tol : float, default 500
        A feature is flagged when its standard deviation is strictly greater
        than this value.

    Returns
    -------
    numpy.ndarray of bool
        One entry per feature (column of ``X``); True where the column's
        population standard deviation exceeds ``std_tol``.
    """
    # Threshold the per-column standard deviation (np.std), not the column
    # mean: a constant column with a large value must not be flagged.
    x_std = np.std(X, axis = 0)
    return np.abs(x_std) > std_tol

## Clean & Augment Data

In [18]:
def create_labels_data(X, y):
    """Group the rows of ``X`` by the label found at the same position in ``y``.

    Returns a dict with one key per distinct value of ``y`` (in the sorted
    order produced by ``np.unique``); each value is a list holding ``X[k]`` for
    every position ``k`` at which ``y`` carries that label, in original order.
    """
    # Start with an empty bucket for every distinct label.
    buckets = dict((cls, []) for cls in np.unique(y))

    # Walk the labels in order and file the matching row under its label.
    for row_idx, cls in enumerate(y):
        bucket = buckets[cls]
        bucket.append(X[row_idx])

    return buckets


In [19]:
# NOTE(review): feature_cover is computed here but not used by any other visible cell.
feature_cover = generate_feature_cover(X, 1000)
x = X[:] # View of the full training matrix: one row per sample, one column per feature

print(x.shape)

# Outlier filtering is switched off by default (every row is kept).
# Set this to a positive number, e.g. 2.5, to drop each row that has at least
# one feature further than that many deviations (see calculate_std) from the
# per-class median of that feature.
OUTLIER_STD_MULTIPLIER = None

labels_data = create_labels_data(x, y)
labels_median = {i:[] for i in range(21)}

for label in range(21):
    class_rows = np.array(labels_data[label])

    print(f"\nClass: {label}")
    print(class_rows.shape)

    # Per-feature median of this class, kept in labels_median for later cells.
    median = np.median(class_rows, axis=0)
    labels_median[label] = median

    if OUTLIER_STD_MULTIPLIER is None:
        # No filtering requested: keep every row and skip the deviation computation.
        keep = np.ones(class_rows.shape[0], dtype=bool)
    else:
        # Deviation of each feature measured around the class median.
        std = np.array([calculate_std(class_rows[:, i], median[i]) for i in range(median.shape[0])])
        lower = median - OUTLIER_STD_MULTIPLIER * std
        upper = median + OUTLIER_STD_MULTIPLIER * std
        # A row is an outlier as soon as one feature leaves [lower, upper].
        keep = ~np.any((class_rows < lower) | (class_rows > upper), axis=1)

    labels_data[label] = class_rows[keep]

    print(labels_data[label].shape)


(5250, 1041)

Class: 0
(225, 1041)
(225, 1041)

Class: 1
(244, 1041)
(244, 1041)

Class: 2
(226, 1041)
(226, 1041)

Class: 3
(254, 1041)
(254, 1041)

Class: 4
(249, 1041)
(249, 1041)

Class: 5
(257, 1041)
(257, 1041)

Class: 6
(246, 1041)
(246, 1041)

Class: 7
(269, 1041)
(269, 1041)

Class: 8
(226, 1041)
(226, 1041)

Class: 9
(242, 1041)
(242, 1041)

Class: 10
(251, 1041)
(251, 1041)

Class: 11
(251, 1041)
(251, 1041)

Class: 12
(258, 1041)
(258, 1041)

Class: 13
(265, 1041)
(265, 1041)

Class: 14
(282, 1041)
(282, 1041)

Class: 15
(255, 1041)
(255, 1041)

Class: 16
(252, 1041)
(252, 1041)

Class: 17
(249, 1041)
(249, 1041)

Class: 18
(234, 1041)
(234, 1041)

Class: 19
(256, 1041)
(256, 1041)

Class: 20
(259, 1041)
(259, 1041)


In [20]:
# One histogram colour per class label (index = label, 21 entries).
# Every entry must stay visible on the default white axes background, so
# 'white' is deliberately not part of the palette.
colors = [
    'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'teal',
    'navy', 'darkgreen', 'darkred', 'darkcyan', 'darkmagenta', 'goldenrod', 
    'gray', 'lightgray', 'darkgray', 'orange', 'purple', 'pink', 'brown'
]


def plot_save(labels_data, rows=26, cols=40, name=""):
    """Save one grid of per-feature histograms for each of the 21 class labels.

    For every label in 0..20 a figure with ``rows * cols`` panels is created.
    Panel ``(i, j)`` shows the histogram of feature ``i * cols + j`` for that
    class, with a solid green line at the mean and a dashed red line at the
    median of the plotted values. Panels without a matching feature, and all
    panels of a class without rows, are hidden. Each figure is written to
    ``./feature_plots/label_{label}_plot{name}`` and then closed.

    Parameters
    ----------
    labels_data : mapping
        Maps each label in 0..20 to a 2-D array (samples x features).
    rows, cols : int
        Grid dimensions. Features with index >= rows * cols are not plotted.
    name : str
        Suffix appended to the output file name.
    """
    # NOTE(review): the default grid has 26 * 40 = 1040 panels, while the
    # training matrix printed earlier in this notebook has 1041 columns, so its
    # last feature is left out with the defaults -- confirm this is intended.

    # savefig does not create missing directories.
    os.makedirs("./feature_plots", exist_ok=True)

    for label in range(21):
        class_rows = np.asarray(labels_data[label])
        # An empty class (or anything that is not a 2-D table) has nothing to draw.
        has_rows = class_rows.ndim == 2 and class_rows.shape[0] > 0
        n_features = class_rows.shape[1] if has_rows else 0

        # squeeze=False keeps `ax` two-dimensional even when rows == 1 or cols == 1.
        fig, ax = plt.subplots(rows, cols, figsize=(100, 80), squeeze=False)

        for i in range(rows):
            for j in range(cols):
                feature = i * cols + j
                panel = ax[i, j]

                if feature >= n_features:
                    panel.set_visible(False)
                    continue

                values = class_rows[:, feature]
                panel.hist(values, bins="auto", color=colors[label])
                panel.set_title(f"Feature: {feature}")

                # Mean of the plotted values as a solid line
                mean_value = np.mean(values)
                panel.axvline(x=mean_value, color='g', linestyle='-', label=f"Mean: {mean_value:.2f}")

                # Median of the plotted values as a dashed line. It is computed
                # from the data being drawn so that it is also correct for
                # data sets other than the one a global median was derived from.
                median_value = np.median(values)
                panel.axvline(x=median_value, color='r', linestyle='--', label=f"Median: {median_value:.2f}")

                # Without a legend the mean/median labels above are never shown.
                # A fixed location avoids the slow "best" placement search.
                panel.legend(loc="upper right", fontsize="x-small")

        fig.savefig(f"./feature_plots/label_{label}_plot{name}")
        plt.close(fig)

In [None]:
# Save the per-class feature histograms of the data before augmentation
# (file names end in "_normal").
plot_save(labels_data, name="_normal")

In [12]:
from imblearn.over_sampling import SMOTENC

SMOTE_SEED = 42  # Fixed seed so the synthetic samples are reproducible

# Stack the per-class arrays back into one feature matrix and build the
# matching label vector. Indexing 0..len-1 assumes the class keys are exactly
# those consecutive integers.
n_classes = len(labels_data)
class_sizes = [labels_data[i].shape[0] for i in range(n_classes)]

new_X = np.vstack([labels_data[i] for i in range(n_classes)])
new_y = np.repeat(np.arange(n_classes), class_sizes)
print(new_X.shape)
print(new_y.shape)

# NOTE(review): categorical_features lists *column indices* of X, so this marks
# the first 21 feature columns as categorical. Confirm that is intended and not
# a mix-up with the 21 class labels.
smenc = SMOTENC(categorical_features=[i for i in range(21)], random_state=SMOTE_SEED)

new_X_smenc, new_y_smenc = smenc.fit_resample(new_X, new_y)

print(new_X_smenc.shape)
print(new_y_smenc.shape)

# Group by the *resampled* labels: the original label vector is shorter than
# the resampled matrix, so using it would silently leave out every synthetic row.
assert new_X_smenc.shape[0] == new_y_smenc.shape[0]
labels_data_smenc = create_labels_data(new_X_smenc, new_y_smenc)
for label in range(21):
    labels_data_smenc[label] = np.array(labels_data_smenc[label])

plot_save(labels_data_smenc, name="_augmented")

(5250, 1041)
(5250,)
(5922, 1041)
(5922,)


In [13]:
# Persist the SMOTENC-resampled feature matrix and label vector with np.save.
np.save("./augmented_traindata", new_X_smenc)
np.save("./augmented_trainlabels", new_y_smenc)