# Normalization

In [5]:
import numpy as np
import pandas as pd
from scipy.io import arff

## Step 1: Removing Missing Values

In [6]:
def read_data(filename):
    '''
        Loads an ARFF file into a pandas DataFrame.

        filename: passed straight to arff.loadarff
        returns: DataFrame built from the loaded records; the second
                 element returned by loadarff is not used
    '''
    records, _unused = arff.loadarff(filename)
    return pd.DataFrame(records)

def process_missing_values(loaddata):
    '''
        Flags zeros as missing: every value equal to 0 becomes NaN.

        loaddata: DataFrame to scan
        returns: the DataFrame produced by replace(), with NaN in place
                 of each zero
    '''
    zero_marker = 0
    return loaddata.replace(to_replace=zero_marker, value=np.nan)

def count_missing_values(loaddata):
    '''
        Tallies the NaN entries of each column.

        loaddata: DataFrame to inspect
        returns: Series giving, for every column, how many of its
                 values are null
    '''
    missing_mask = loaddata.isnull()
    return missing_mask.sum(axis=0)

def remove_missing_values(loaddata):
    '''
        Drops every row that holds at least one NaN.

        loaddata: DataFrame to filter
        returns: DataFrame containing only the fully populated rows,
                 with their original index labels
    '''
    return loaddata.dropna(axis=0, how='any')

# Change filename here
filename = 'CM1.arff.txt'

# Load the ARFF table, then flag every zero as missing (NaN)
loaddata = process_missing_values(read_data(filename))

# Per-column NaN counts, shown as this cell's output
num_missing = count_missing_values(loaddata)
num_missing

LOC_BLANK                           19
BRANCH_COUNT                         0
CALL_PAIRS                          51
LOC_CODE_AND_COMMENT                87
LOC_COMMENTS                        72
CONDITION_COUNT                      0
CYCLOMATIC_COMPLEXITY                0
CYCLOMATIC_DENSITY                   0
DECISION_COUNT                       0
DECISION_DENSITY                     0
DESIGN_COMPLEXITY                    0
DESIGN_DENSITY                       0
EDGE_COUNT                           0
ESSENTIAL_COMPLEXITY                 0
ESSENTIAL_DENSITY                  214
LOC_EXECUTABLE                       0
PARAMETER_COUNT                     72
HALSTEAD_CONTENT                     0
HALSTEAD_DIFFICULTY                  0
HALSTEAD_EFFORT                      0
HALSTEAD_ERROR_EST                   0
HALSTEAD_LENGTH                      0
HALSTEAD_LEVEL                       0
HALSTEAD_PROG_TIME                   0
HALSTEAD_VOLUME                      0
MAINTENANCE_SEVERITY     

In [7]:
# Discard the rows that still contain NaN, then preview the first five
loaddata = remove_missing_values(loaddata)
loaddata.head(n=5)

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,Defective
2,38.0,35.0,4.0,5.0,70.0,58.0,18.0,0.17,24.0,2.42,...,51.0,0.08,150.0,222.0,58.0,32.0,218.0,41.9,109.0,b'N'
4,9.0,15.0,4.0,14.0,22.0,28.0,8.0,0.2,14.0,2.0,...,24.0,0.11,29.0,64.0,19.0,18.0,73.0,57.14,41.0,b'N'
5,13.0,9.0,5.0,12.0,16.0,16.0,5.0,0.14,8.0,2.0,...,18.0,0.07,38.0,69.0,20.0,15.0,67.0,52.83,37.0,b'N'
7,4.0,7.0,2.0,1.0,7.0,10.0,4.0,0.16,4.0,2.5,...,12.0,0.11,40.0,67.0,23.0,22.0,37.0,25.0,25.0,b'Y'
16,10.0,11.0,2.0,2.0,14.0,20.0,6.0,0.18,10.0,2.0,...,21.0,0.1,37.0,58.0,17.0,21.0,59.0,33.33,34.0,b'N'


## Step 2: Removing Outliers

In [12]:
# Interquartile range (IQR = Q3 - Q1) of each numeric column.
# numeric_only=True skips non-numeric columns such as the bytes-valued
# 'Defective' label. pandas >= 2.0 no longer skips them by default, so
# without the flag this cell fails on a fresh, up-to-date kernel.
Q1 = loaddata.quantile(0.25, numeric_only=True)
Q3 = loaddata.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
IQR

LOC_BLANK                              26.7500
BRANCH_COUNT                           21.0000
CALL_PAIRS                              5.0000
LOC_CODE_AND_COMMENT                   16.2500
LOC_COMMENTS                           42.5000
CONDITION_COUNT                        42.0000
CYCLOMATIC_COMPLEXITY                  11.2500
CYCLOMATIC_DENSITY                      0.0600
DECISION_COUNT                         20.0000
DECISION_DENSITY                        0.3000
DESIGN_COMPLEXITY                       5.0000
DESIGN_DENSITY                          0.2600
EDGE_COUNT                             51.5000
ESSENTIAL_COMPLEXITY                    5.0000
ESSENTIAL_DENSITY                       0.3400
LOC_EXECUTABLE                         55.2500
PARAMETER_COUNT                         3.0000
HALSTEAD_CONTENT                       46.0475
HALSTEAD_DIFFICULTY                    25.5100
HALSTEAD_EFFORT                    127729.9225
HALSTEAD_ERROR_EST                      0.8825
HALSTEAD_LENG

In [14]:
# A value is treated as an outlier when it lies more than 1.5 * IQR below Q1
# or above Q3. Only the columns that have a fence (the index of IQR) are
# compared; the remaining columns, e.g. the 'Defective' label, have no
# quartiles, and comparing them with the aligned NaN is fragile
# (NOTE(review): this may raise a TypeError on recent pandas - confirm).
measured = loaddata[IQR.index]
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (measured < lower_fence) | (measured > upper_fence)

# tilde operator is a negation operator: keep the rows in which no column is flagged
result = loaddata[~is_outlier.any(axis=1)]
result

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,Defective
2,38.0,35.0,4.0,5.0,70.0,58.0,18.0,0.17,24.0,2.42,...,51.0,0.08,150.0,222.0,58.0,32.0,218.0,41.9,109.0,b'N'
4,9.0,15.0,4.0,14.0,22.0,28.0,8.0,0.2,14.0,2.0,...,24.0,0.11,29.0,64.0,19.0,18.0,73.0,57.14,41.0,b'N'
5,13.0,9.0,5.0,12.0,16.0,16.0,5.0,0.14,8.0,2.0,...,18.0,0.07,38.0,69.0,20.0,15.0,67.0,52.83,37.0,b'N'
7,4.0,7.0,2.0,1.0,7.0,10.0,4.0,0.16,4.0,2.5,...,12.0,0.11,40.0,67.0,23.0,22.0,37.0,25.0,25.0,b'Y'
16,10.0,11.0,2.0,2.0,14.0,20.0,6.0,0.18,10.0,2.0,...,21.0,0.1,37.0,58.0,17.0,21.0,59.0,33.33,34.0,b'N'
21,12.0,37.0,12.0,23.0,65.0,60.0,20.0,0.13,28.0,2.14,...,86.0,0.09,258.0,349.0,95.0,38.0,231.0,40.37,153.0,b'Y'
33,14.0,9.0,2.0,5.0,1.0,16.0,5.0,0.15,8.0,2.0,...,18.0,0.1,35.0,48.0,13.0,16.0,49.0,17.65,33.0,b'N'
38,17.0,21.0,4.0,1.0,8.0,36.0,11.0,0.26,16.0,2.25,...,34.0,0.16,98.0,143.0,45.0,33.0,68.0,18.0,42.0,b'N'
49,10.0,29.0,8.0,2.0,65.0,56.0,15.0,0.1,28.0,2.0,...,61.0,0.07,333.0,448.0,110.0,31.0,227.0,31.02,151.0,b'N'
61,46.0,39.0,6.0,20.0,96.0,70.0,20.0,0.18,32.0,2.19,...,79.0,0.08,250.0,318.0,68.0,28.0,254.0,56.04,111.0,b'Y'
