In [1]:
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [2]:
#Load the feature matrix from the comma-separated data file
dataset = np.genfromtxt(
    fname='data.csv',
    delimiter=',',
)
#Show the array dimensions as the cell output
dataset.shape

(3486, 354)

In [3]:
#Load the class labels from the comma-separated label file, parsed as integers
data_labels = np.genfromtxt(
    fname='data_labels.csv',
    delimiter=',',
    dtype='int',
)
#Show the array dimensions as the cell output
data_labels.shape

(3486,)

In [4]:
#Checking the class imbalance: count how many samples carry each label
Counter(data_labels)

Counter({8: 466, 5: 287, 1: 1625, 6: 310, 4: 483, 2: 233, 3: 30, 7: 52})

In [5]:
#Rebalancing the classes: SMOTE synthesises extra samples for the minority
#classes, then RandomUnderSampler randomly drops samples of the majority
#class (label 1), so that every class ends up with TARGET_PER_CLASS samples.
#NOTE(review): the resampling is applied to the whole dataset before it is
#split into training/validation/test sets further down, so synthetic SMOTE
#samples also end up in the validation and test sets. Consider splitting
#first and resampling the training split only.
TARGET_PER_CLASS = 1400
#Fixed seed so that Restart & Run All reproduces the same resampled dataset
RESAMPLING_SEED = 42
#Integer keys match the integer dtype the labels were loaded with
oversampling = SMOTE(
    sampling_strategy={label: TARGET_PER_CLASS for label in (2, 3, 4, 5, 6, 7, 8)},
    random_state=RESAMPLING_SEED)
undersampling = RandomUnderSampler(
    sampling_strategy={1: TARGET_PER_CLASS},
    random_state=RESAMPLING_SEED)
#Oversampling runs first, undersampling second
steps = [('over',oversampling),('u',undersampling)]
pipeline = Pipeline(steps=steps)
#Overwrites dataset and data_labels with their resampled versions
dataset, data_labels = pipeline.fit_resample(dataset, data_labels)

In [6]:
#Checking the label distribution after oversampling and undersampling:
#count how many samples carry each label
Counter(data_labels)

Counter({1: 1400,
         2: 1400,
         3: 1400,
         4: 1400,
         5: 1400,
         6: 1400,
         7: 1400,
         8: 1400})

In [7]:
#Turn the labels into a single-column 2-D array so they can be appended
#to the feature matrix as an extra column
labels = np.reshape(data_labels, (-1, 1))
print(labels.shape)

(11200, 1)


In [8]:
#Append the label column to the right of the features and write the
#combined array to disk (this happens before any shuffling)
full_dataset = np.hstack((dataset, labels))
np.savetxt('full_dataset.csv', full_dataset, delimiter=',')
#Show the label (last column) of the first ten rows
full_dataset[:10, -1]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [9]:
#Randomly shuffling the rows of the ordered final dataset, in place.
#A locally seeded generator keeps the shuffle (and therefore the split that
#follows) reproducible across Restart & Run All without touching NumPy's
#global random state.
np.random.default_rng(42).shuffle(full_dataset)

In [10]:
#Splitting the rows into three consecutive sets: the first 60% for training,
#the next 20% for validation and the remaining rows for testing
n_rows = len(full_dataset)
train_end = int(.6 * n_rows)
validation_end = int(.8 * n_rows)
training_set, validation_set, testing_set = np.split(full_dataset, [train_end, validation_end])

In [11]:
#Separate each split into its labels (last column) and its features
#(every other column)
training_labels, training = training_set[:, -1], np.delete(training_set, -1, axis=1)
validation_labels, validation = validation_set[:, -1], np.delete(validation_set, -1, axis=1)
testing_labels, testing = testing_set[:, -1], np.delete(testing_set, -1, axis=1)

#Saving the sets: each array goes to its own comma-separated file
csv_outputs = [
    ('training_labels.csv', training_labels),
    ('training_set.csv', training),
    ('validation_labels.csv', validation_labels),
    ('validation_set.csv', validation),
    ('test_labels.csv', testing_labels),
    ('test_set.csv', testing),
]
for csv_name, csv_array in csv_outputs:
    np.savetxt(csv_name, csv_array, delimiter=',')