# Preprocessing
## 1. Techniques
The functions below perform a series of preprocessing techniques on a given dataset.

In [74]:
#!pip install imblearn
from collections import Counter
from scipy.io import arff
from sklearn.model_selection import train_test_split as ts
from sklearn.model_selection import KFold
from imblearn.under_sampling import InstanceHardnessThreshold
import numpy as np
import pandas as pd

def Normalize(data):
    """Drop rows with missing values and IQR outliers from a DataFrame.

    Zeros are treated as missing values: rows containing a 0 in any column
    are dropped, but only when more than 80% of the rows survive, so that
    datasets where 0 is a legitimate value are not gutted.

    Outliers are then removed with the 1.5 * IQR rule. A row is discarded
    if any of its numeric columns falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
    Non-numeric columns (e.g. a bytes/str label column) are ignored when
    detecting outliers but are kept in the returned frame.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to clean.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that survived both filters.
    """
    i_size = len(data)  # The initial size of the data
    if i_size == 0:
        # Nothing to clean; also avoids dividing by zero below.
        return data

    # Remove missing values (zeros are considered missing)
    r_data = data.replace(0, np.nan).dropna()
    if len(r_data) / i_size > 0.8:  # To ensure no false data removal was performed
        data = r_data

    # Remove outliers. Only numeric columns have meaningful quantiles, and
    # restricting to them keeps the comparison's column labels aligned.
    numeric = data.select_dtypes(include="number")
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    too_low = numeric < (q1 - 1.5 * iqr)
    too_high = numeric > (q3 + 1.5 * iqr)
    outlier_rows = (too_low | too_high).any(axis=1)
    return data[~outlier_rows]

def K_fold(data, f=10):
    """Split ``data`` into ``f`` consecutive train/test folds.

    Parameters
    ----------
    data : array-like
        The samples to split; only its length is used to produce indices.
    f : int, default 10
        Number of folds.

    Returns
    -------
    generator
        The result of ``KFold.split``: one ``(train_indices, test_indices)``
        pair per fold.
    """
    # Honour the requested fold count (it was previously fixed at 10).
    kf = KFold(n_splits=f)
    return kf.split(data)

def IHT(data):
    """Under-sample a dataset with Instance Hardness Threshold.

    Parameters
    ----------
    data : sequence
        ``data[0]`` holds the samples and ``data[1]`` the matching labels.

    Returns
    -------
    tuple
        ``(resampled_samples, resampled_labels)`` as produced by
        ``InstanceHardnessThreshold.fit_resample`` with default settings.
    """
    sampler = InstanceHardnessThreshold()
    samples, labels = data[0], data[1]
    resampled = sampler.fit_resample(samples, labels)
    under_samples, under_labels = resampled
    return under_samples, under_labels

## 2. Additional functions
Below are additional helper functions that manipulate the dataset to make it easier to use.

In [75]:
def data_conversion(data):
    """Binarise labels in place: ``b'N'`` becomes 0, anything else becomes 1.

    Parameters
    ----------
    data : mutable sequence
        The labels to convert; it is modified in place.

    Returns
    -------
    The same ``data`` object, now holding 0/1 values.
    """
    for position, label in enumerate(data):
        data[position] = 0 if label == b'N' else 1
    return data

## 3. Main
The main algorithm below applies all of the preprocessing techniques to a given dataset.
#### It returns the preprocessed dataset with the following format:
A list with one entry per fold, each entry containing:
[Training metrics, Test metrics, Training labels, Test labels]

In [1]:
def preprocess(filename):
    """Load an ARFF file and run the full preprocessing pipeline on it.

    The pipeline is: clean the data with ``Normalize``, separate the last
    column (labels) from the remaining columns (software metrics), split
    into folds with ``K_fold``, and under-sample each training fold with
    ``IHT``. Test folds are left untouched.

    Parameters
    ----------
    filename : str
        Path of the ARFF file to load.

    Returns
    -------
    list
        One entry per fold, each of the form
        ``[training metrics, test metrics, training labels, test labels]``.
    """
    frame = Normalize(pd.DataFrame(arff.loadarff(filename)[0]))

    # Pandas DF to NumPy arrays: every column but the last is a software
    # metric, the last column is the label.
    metrics = np.array(frame.iloc[:, :-1])
    labels = data_conversion(np.array(frame.iloc[:, -1])).astype(int)

    folds = []
    for train_idx, test_idx in K_fold(metrics):
        # Only the training portion of each fold is under-sampled.
        metrics_under, labels_under = IHT([metrics[train_idx], labels[train_idx]])
        folds.append([metrics_under, metrics[test_idx], labels_under, labels[test_idx]])
    return folds

#File
#preprocess('KC4.arff.txt')