# Normalization

In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff

def Normalize(data, min_retained=0.8, iqr_factor=1.5):
    """Drop rows with missing values (when safe) and rows holding IQR outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. Non-numeric columns (e.g. a class label) are carried
        through untouched and are ignored when looking for outliers.
    min_retained : float, default 0.8
        The NaN-free version of ``data`` is only adopted when strictly more
        than this fraction of the original rows survives ``dropna()``.
        Otherwise the rows with NaN are kept, so that a column that is mostly
        missing does not wipe out the dataset.
    iqr_factor : float, default 1.5
        Multiplier of the inter-quartile range used to build the outlier
        fences ``Q1 - iqr_factor * IQR`` and ``Q3 + iqr_factor * IQR``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` in which no numeric column falls outside its
        fences. All original columns are preserved.
    """
    initial_size = len(data)
    if initial_size == 0:
        # Nothing to clean; also avoids dividing by zero below.
        return data.copy()

    # Remove missing values, but only if that keeps enough of the data.
    without_nan = data.dropna()
    if len(without_nan) / initial_size > min_retained:
        data = without_nan

    # Quartiles are only meaningful for numeric columns. Restricting both the
    # quantile computation and the comparison to the same numeric columns
    # keeps the operands aligned and keeps label columns out of the arithmetic.
    numeric = data.select_dtypes(include="number")
    if numeric.empty:
        return data

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - iqr_factor * iqr
    upper_fence = q3 + iqr_factor * iqr

    # A row is an outlier if ANY numeric column lies outside its fences.
    # Comparisons against NaN are False, so NaN cells never flag a row.
    is_outlier = ((numeric < lower_fence) | (numeric > upper_fence)).any(axis=1)
    return data[~is_outlier]

## Step 1: Removing Missing Values

In [2]:
def read_data(filename):
    '''
        Loads an ARFF file and returns its records as a pandas DataFrame.
        The ARFF metadata returned by the loader is discarded.
    '''
    records, _meta = arff.loadarff(filename)
    return pd.DataFrame(records)

def process_missing_values(loaddata):
    '''
        Treats zeros as missing: returns a copy of the dataset in which
        every 0 has been swapped for NaN. The input is left unchanged.
    '''
    zeros_as_nan = loaddata.replace(to_replace=0, value=np.nan)
    return zeros_as_nan

def count_missing_values(loaddata):
    '''
        Tallies the NaN entries of the dataset, one total per column
    '''
    nan_mask = loaddata.isna()
    return nan_mask.sum()

def remove_missing_values(loaddata):
    '''
        Drops every row that contains at least one NaN
    '''
    complete_rows = loaddata.dropna(axis=0, how='any')
    return complete_rows

# Change filename here
# filename = 'KC4.arff.txt'
filename = 'CM1.arff.txt'

# Load the ARFF file and flag zeros as missing (NaN) in a single step.
loaddata = process_missing_values(read_data(filename))

# Per-column count of missing entries; shown as this cell's output.
num_missing = count_missing_values(loaddata)
num_missing

LOC_BLANK                          125
BRANCH_COUNT                         0
CALL_PAIRS                           0
LOC_CODE_AND_COMMENT               125
LOC_COMMENTS                       125
CONDITION_COUNT                    125
CYCLOMATIC_COMPLEXITY                0
CYCLOMATIC_DENSITY                 115
DECISION_COUNT                     125
DECISION_DENSITY                   125
DESIGN_COMPLEXITY                    0
DESIGN_DENSITY                       0
EDGE_COUNT                           0
ESSENTIAL_COMPLEXITY                 0
ESSENTIAL_DENSITY                  124
LOC_EXECUTABLE                     125
PARAMETER_COUNT                    125
GLOBAL_DATA_COMPLEXITY             125
GLOBAL_DATA_DENSITY                125
HALSTEAD_CONTENT                   125
HALSTEAD_DIFFICULTY                125
HALSTEAD_EFFORT                    125
HALSTEAD_ERROR_EST                 125
HALSTEAD_LENGTH                    125
HALSTEAD_LEVEL                     125
HALSTEAD_PROG_TIME       

In [7]:
# Keep only the rows without any NaN, then preview the first five survivors.
loaddata = remove_missing_values(loaddata)
loaddata.head(5)

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,Defective
2,38.0,35.0,4.0,5.0,70.0,58.0,18.0,0.17,24.0,2.42,...,51.0,0.08,150.0,222.0,58.0,32.0,218.0,41.9,109.0,b'N'
4,9.0,15.0,4.0,14.0,22.0,28.0,8.0,0.2,14.0,2.0,...,24.0,0.11,29.0,64.0,19.0,18.0,73.0,57.14,41.0,b'N'
5,13.0,9.0,5.0,12.0,16.0,16.0,5.0,0.14,8.0,2.0,...,18.0,0.07,38.0,69.0,20.0,15.0,67.0,52.83,37.0,b'N'
7,4.0,7.0,2.0,1.0,7.0,10.0,4.0,0.16,4.0,2.5,...,12.0,0.11,40.0,67.0,23.0,22.0,37.0,25.0,25.0,b'Y'
16,10.0,11.0,2.0,2.0,14.0,20.0,6.0,0.18,10.0,2.0,...,21.0,0.1,37.0,58.0,17.0,21.0,59.0,33.33,34.0,b'N'


## Step 2: Removing Outliers

In [3]:
# Quartiles are only defined for numeric columns. Ask for them explicitly so
# the non-numeric label column does not make quantile() fail on pandas
# versions where numeric_only no longer defaults to True.
Q1 = loaddata.quantile(0.25, numeric_only=True)
Q3 = loaddata.quantile(0.75, numeric_only=True)
# Inter-quartile range per numeric column; shown as this cell's output.
IQR = Q3 - Q1
IQR

LOC_BLANK                              NaN
BRANCH_COUNT                        10.000
CALL_PAIRS                           8.000
LOC_CODE_AND_COMMENT                   NaN
LOC_COMMENTS                           NaN
CONDITION_COUNT                        NaN
CYCLOMATIC_COMPLEXITY                5.000
CYCLOMATIC_DENSITY                   0.000
DECISION_COUNT                         NaN
DECISION_DENSITY                       NaN
DESIGN_COMPLEXITY                    3.000
DESIGN_DENSITY                       0.180
EDGE_COUNT                          56.000
ESSENTIAL_COMPLEXITY                 0.000
ESSENTIAL_DENSITY                    0.000
LOC_EXECUTABLE                         NaN
PARAMETER_COUNT                        NaN
GLOBAL_DATA_COMPLEXITY                 NaN
GLOBAL_DATA_DENSITY                    NaN
HALSTEAD_CONTENT                       NaN
HALSTEAD_DIFFICULTY                    NaN
HALSTEAD_EFFORT                        NaN
HALSTEAD_ERROR_EST                     NaN
HALSTEAD_LE

In [4]:
# Only the columns that have quartiles can be tested for outliers. Selecting
# them first keeps the frame and the fence Series aligned column-for-column;
# newer pandas versions refuse to auto-align a DataFrame-vs-Series comparison,
# and the label column has no fence to be compared against anyway.
numeric_cols = loaddata[IQR.index]
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is an outlier if ANY of its numeric values lies outside the fences.
is_outlier = ((numeric_cols < lower_fence) | (numeric_cols > upper_fence)).any(axis=1)

# tilde operator is a negation operator: keep the rows that are NOT outliers
result = loaddata[~is_outlier]
result

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PATHOLOGICAL_COMPLEXITY,PERCENT_COMMENTS,LOC_TOTAL,Defective
0,,7.0,5.0,,,,4.0,1.0,,,...,0.04,,,,,100.0,1.0,,100.0,b'N'
3,,3.0,8.0,,,,2.0,1.0,,,...,0.03,,,,,58.0,1.0,,58.0,b'Y'
6,,1.0,3.0,,,,1.0,1.0,,,...,0.03,,,,,36.0,1.0,,36.0,b'Y'
7,,1.0,1.0,,,,1.0,1.0,,,...,0.01,,,,,81.0,1.0,,81.0,b'N'
9,,3.0,8.0,,,,2.0,1.0,,,...,0.03,,,,,58.0,1.0,,58.0,b'Y'
10,,1.0,1.0,,,,1.0,,,,...,,,,,,269.0,1.0,,269.0,b'N'
11,,3.0,15.0,,,,2.0,,,,...,0.01,,,,,181.0,1.0,,181.0,b'N'
12,,1.0,3.0,,,,1.0,,,,...,0.06,,,,,18.0,1.0,,18.0,b'N'
16,,5.0,10.0,,,,3.0,,,,...,0.04,,,,,77.0,1.0,,77.0,b'Y'
19,,3.0,2.0,,,,2.0,,,,...,0.03,,,,,66.0,1.0,,66.0,b'Y'
