# Missing Values

### Todo



#### This cell is for defining various OPTIONS used for this notebook (working directory, how many rows and columns pandas displays for a dataframe, etc). 

#### Preferably this cell is also where we do important imports (for example pandas and numpy)

In [1]:
import os

import numpy as np
import pandas as pd

# Directory where joined_data.csv is located. The notebook changes into this
# directory, so every relative file name below is resolved against it.
# Set the PPMI_DATA_DIR environment variable to point at your own copy instead
# of editing this cell; the value below is only the fallback.
# Other known locations:
#   'C:/Users/Trond/Documents/Master 2020/Processed data'
#   'C:/Users/MyPC/Documents/Andrijana/UiS/DATMAS Master oppgave/Processed data'
DATA_DIR = os.environ.get('PPMI_DATA_DIR', 'C:/Users/Briggstone/Documents/Master 2020/Processed data')
os.chdir(DATA_DIR)

# Folder where the output of this notebook is placed.
# Defaults to the data directory; override with PPMI_OUTPUT_DIR.
output_filepath = os.environ.get('PPMI_OUTPUT_DIR', DATA_DIR)

# Maximum number of rows pandas displays for a dataframe
pd.set_option('display.max_rows', 1000)

# Maximum number of columns pandas displays for a dataframe
pd.set_option('display.max_columns', 200)

# Missing value handler flag, 0 = MODE/MEAN IMPUTATION, 1 = HEOM
# Depending on the flag set, different imputation methods are used
# HEOM is computationally expensive and will take some time to complete
MV_FLAG = 0

# Importing data from data_collection notebook
data = pd.read_csv('joined_data.csv')

### In this cell we do missing value exploration by printing out relevant statistics

In [2]:
# Row-level masks/counts are computed once and reused by every statistic below
row_has_null = data.isnull().any(axis=1)
nulls_per_row = data.isnull().sum(axis=1)

# Complete rows vs. rows with at least one missing value
print("Number of rows which have no missing values at start: ", data[~row_has_null].shape[0])
print("Number of rows with missing values at start: ", data[row_has_null].shape[0])

# Smallest and largest number of missing columns found in any single row
print("Minimum number of missing columns in a row: ", nulls_per_row.min())
print("Maximum number of missing columns in a row: ", nulls_per_row.max())

# Number of distinct PATNO values that have at least one incomplete row
null_data = data[row_has_null]
print("Unique PATNOs with missing entries: ", null_data.PATNO.unique().size )

# Per-column count of missing values, restricted to columns that have any
null_columns = data.columns[data.isnull().any()]
print("\nSummary of missing values in columns")
print(data[null_columns].isnull().sum())


Number of rows which have no missing values at start:  0
Number of rows with missing values at start:  2250
Minimum number of missing columns in a row:  2
Maximum number of missing columns in a row:  48
Unique PATNOs with missing entries:  416

Summary of missing values in columns
BIOMOMPD        16
BIODADPD        16
FULSIBPD       125
HAFSIBPD      1517
MAGPARPD        22
PAGPARPD        24
MATAUPD        124
PATAUPD        172
KIDSPD         255
NP3FTAPL         1
NP3HMOVL         1
NP3PRSPL         1
NP3PSTBL         8
NP3PTRMR         1
NP3TTAPR         1
NHY              4
SDMTOTAL         2
MCAALTTM         2
MCACUBE          2
MCACLCKC         2
MCACLCKN         2
MCACLCKH         2
MCALION          2
MCARHINO         2
MCACAMEL         2
MCAFDS           2
MCABDS           2
MCAVIGIL         2
MCASER7          2
MCASNTNC         2
MCAVF            2
MCAABSTR         3
MCAREC1          5
MCAREC2          4
MCAREC3          5
MCAREC4          7
MCAREC5          6
MCADATE        

#### In this cell we process missing values that are not dealt with by Mode/Mean or HEOM

In [3]:
# We deal with missing demographic PD questions by filling the missing entries with no (0).
# The rationale is that the proportion of subjects who answers 0 in the non-missing entries is much larger
# than the proportion of subjects who answers 1 in the non-missing entries
family_pd_cols = ["BIOMOMPD", "BIODADPD", "FULSIBPD", "HAFSIBPD", "MAGPARPD", "PAGPARPD", "MATAUPD", "PATAUPD", "KIDSPD"]
data.loc[:, family_pd_cols] = data.loc[:, family_pd_cols].fillna(0)

# We deal with most missing values in SCOPA 22 and 23 by filtering by female (0) and imputing 0
# Questions are only for males
male_only_cols = ["SCAU22", "SCAU23"]
is_female = data.GENDER == 0
data.loc[is_female, male_only_cols] = data.loc[is_female, male_only_cols].fillna(0)

# We deal with most missing values in SCOPA 24 and 25 by filtering by male (1) and imputing 0
# Questions are only for females
female_only_cols = ["SCAU24", "SCAU25"]
is_male = data.GENDER == 1
data.loc[is_male, female_only_cols] = data.loc[is_male, female_only_cols].fillna(0)

# We deal with missing values in CNSOTH by imputing 0
# Rationale is the same as for missing demographic PD questions
data.loc[:, "CNSOTH"] = data.loc[:, "CNSOTH"].fillna(0)

# We deal with missing LNS values after 3 0s as additional 0s, the reason is that they stop the test
# if subject does not get any correct items on iteration x of the test.
# Only MISSING later entries are set to 0: values that were actually recorded are never overwritten.
lns_trials = ["A", "B", "C"]
for i in range(1, 7):
    current_cols = ["LNS" + str(i) + t for t in lns_trials]
    later_cols = ["LNS" + str(i2) + t for i2 in range(i + 1, 8) for t in lns_trials]
    # Rows where all three trials of item i were scored 0
    failed_item = (data[current_cols] == 0).all(axis=1)
    data.loc[failed_item, later_cols] = data.loc[failed_item, later_cols].fillna(0)

#### In this cell we do missing value exploration after first round of imputation

In [4]:
# Recompute the row-level masks/counts now that the rule-based imputation has run
incomplete_rows = data.isnull().any(axis=1)
missing_counts = data.isnull().sum(axis=1)

# Complete rows vs. rows that still have at least one missing value
print("Number of rows which have no missing values after first imputation: ", data[~incomplete_rows].shape[0])
print("Number of rows with missing values after first imputation ", data[incomplete_rows].shape[0])

# Smallest and largest number of missing columns found in any single row
print("Minimum number of missing columns in a row: ", missing_counts.min())
print("Maximum number of missing columns in a row: ", missing_counts.max())

# Number of distinct PATNO values that still have at least one incomplete row
null_data = data[incomplete_rows]
print("Unique PATNOs with missing entries: ", null_data.PATNO.unique().size )

# Per-column count of the remaining missing values
null_columns = data.columns[data.isnull().any()]
print("\nSummary of missing values in columns after first imputation")
print(data[null_columns].isnull().sum())

Number of rows which have no missing values after first imputation:  2180
Number of rows with missing values after first imputation  70
Minimum number of missing columns in a row:  0
Maximum number of missing columns in a row:  31
Unique PATNOs with missing entries:  55

Summary of missing values in columns after first imputation
NP3FTAPL       1
NP3HMOVL       1
NP3PRSPL       1
NP3PSTBL       8
NP3PTRMR       1
NP3TTAPR       1
NHY            4
SDMTOTAL       2
MCAALTTM       2
MCACUBE        2
MCACLCKC       2
MCACLCKN       2
MCACLCKH       2
MCALION        2
MCARHINO       2
MCACAMEL       2
MCAFDS         2
MCABDS         2
MCAVIGIL       2
MCASER7        2
MCASNTNC       2
MCAVF          2
MCAABSTR       3
MCAREC1        5
MCAREC2        4
MCAREC3        5
MCAREC4        7
MCAREC5        6
MCADATE        3
MCAMONTH       3
MCAYR          3
MCADAY         3
MCAPLACE       3
MCACITY        3
HVLTRDLY       3
HVLTREC        6
HVLTFPRL       6
HVLTFPUN       6
JLO_TOTRAW     2
LNS1C

#### In this cell we define functions for Mode/Mean and HEOM

In [5]:
# Mode/Mean and HEOM are dependent on Column_Data_Types.csv
# Download Column_Data_Types.csv and place it in the same folder as your joined_data.csv file

def mode_mean_imputation(df, column_types_path="Column_Data_Types.csv"):
    """Fill missing values in place with a per-EVENT_ID mode or rounded mean.

    For every column that contains missing values, the rows are grouped by
    EVENT_ID. Missing entries of a "Categorical" column receive the mode of
    the observed values of the same EVENT_ID; every other column receives the
    mean of those values, rounded with the builtin ``round``.

    If an EVENT_ID group has no observed value at all for a column, the
    statistic of the whole column is used instead. If the whole column is
    empty there is nothing to impute from and the entries stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Must contain an EVENT_ID column. Modified in place.

    column_types_path : str, default "Column_Data_Types.csv"
        CSV file with the columns COLUMN_NAME and DATA_TYPE.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a column with missing values is not listed in the type table.
    """
    column_types = pd.read_csv(column_types_path)
    known_columns = set(column_types["COLUMN_NAME"])
    categorical_columns = set(
        column_types.loc[column_types["DATA_TYPE"] == "Categorical", "COLUMN_NAME"]
    )
    null_columns = df.columns[df.isnull().any()]

    for x in null_columns:
        if x not in known_columns:
            raise KeyError("Column '" + str(x) + "' is not listed in " + str(column_types_path))
        use_mode = x in categorical_columns

        missing = df[x].isnull()
        # Only the events that actually have a missing entry need a fill value
        for e in df.loc[missing, "EVENT_ID"].unique():
            in_event = df["EVENT_ID"] == e
            observed = df.loc[in_event, x].dropna()
            if observed.empty:
                # No observed value for this event: fall back to the whole column
                observed = df[x].dropna()
            if observed.empty:
                # Column has no observed value at all; leave the entries missing
                continue

            if use_mode:
                fill_value = observed.mode()[0]
            else:
                fill_value = round(observed.mean())
            df.loc[in_event & missing, x] = fill_value
                       
                       
# from https://github.com/KacperKubara/distython/blob/master/HEOM.py with minor changes

def heom(x, y, min_max_df, column_types):
    """Squared HEOM distance between two instances.

    Handles heterogeneous (categorical and numerical) data and missing
    values. Each feature contributes a value between 0 and 1:

    * 1 if the feature is missing in ``x`` or in ``y``;
    * for a "Categorical" feature, 0 if the values are equal and 1 otherwise;
    * for any other feature, ``|x - y| / range``. A feature whose range is 0
      contributes 0 instead of producing a division by zero.

    Parameters
    ----------
    x : pandas.Series of shape = [n_features]
        First instance, indexed by column name.

    y : pandas.Series of shape = [n_features]
        Second instance, with the same index as ``x``.

    min_max_df : pandas.DataFrame
        Single-row frame holding the value range (max - min) of every
        numerical column, with the column names as columns.

    column_types : mapping
        Maps every column name to its data type; the value "Categorical"
        marks categorical columns, anything else is treated as numerical.

    Returns
    -------
    result : float, pandas.Index
        The sum of the squared per-feature distances and the index labels of
        the values missing in ``x``.
    """
    # Labels of the features missing in x; returned so the caller can fill them
    nan_x_ix = x[x.isna()].index

    # A feature missing on either side has the maximal distance of 1
    missing = x.isna() | y.isna()
    distance = float(missing.sum())

    # Split the features present on both sides by data type
    present = x.index[~missing.to_numpy()]
    cat_ix = []
    num_ix = []
    for idx in present:
        if column_types[idx] == "Categorical":
            cat_ix.append(idx)
        else:
            num_ix.append(idx)

    # Categorical features: 1 when the values differ, 0 otherwise
    if cat_ix:
        distance += float(np.sum(x[cat_ix].to_numpy() != y[cat_ix].to_numpy()))

    # Numerical features: absolute difference normalised by the column range
    if num_ix:
        ranges = np.asarray(min_max_df[num_ix], dtype=float).ravel()
        diffs = np.abs(x[num_ix].to_numpy(dtype=float) - y[num_ix].to_numpy(dtype=float))
        scaled = np.divide(diffs, ranges, out=np.zeros_like(diffs), where=ranges != 0)
        distance += float(np.sum(np.square(scaled)))

    # Square root is not computed in practice
    # As it doesn't change similarity between instances
    return distance, nan_x_ix

    
def heom_imputation(data, column_types_path="Column_Data_Types.csv", verbose=True):
    """Fill missing values in place from the most similar row (HEOM distance).

    For every row that contains a missing value, the distance to every other
    row is computed with ``heom`` and the missing entries are copied from the
    closest row. When several rows are equally close, the last one wins.
    Rows are processed in order and written back immediately, so values
    imputed earlier take part in later comparisons.

    The closest row can itself be missing the same column; such an entry
    stays missing after this call.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. Modified in place. Every column must be listed in
        the type table.

    column_types_path : str, default "Column_Data_Types.csv"
        CSV file with the columns COLUMN_NAME and DATA_TYPE.

    verbose : bool, default True
        Print the position of each row as it is processed.

    Returns
    -------
    None
    """
    column_types = pd.read_csv(column_types_path)
    column_data_types = dict(zip(column_types["COLUMN_NAME"], column_types["DATA_TYPE"]))

    num_col = column_types.loc[column_types.DATA_TYPE != "Categorical", "COLUMN_NAME"].values
    num_col = np.intersect1d(num_col, data.columns.values)

    # Range (max - min) of every numerical column, ignoring missing values
    min_max_range = np.nanmax(data[num_col], axis=0) - np.nanmin(data[num_col], axis=0)
    min_max_df = pd.DataFrame(np.asarray(min_max_range).reshape(1, len(num_col)), columns=num_col)

    # Work with row POSITIONS throughout so the index labels of `data` do not matter
    n_rows = len(data)
    incomplete_rows = np.flatnonzero(data.isnull().any(axis=1).to_numpy())

    # Any further significant speed up would come from parallelization,
    # many rows could technically be compared at once
    for ix in incomplete_rows:
        if verbose:
            print(ix)
        target = data.iloc[ix]
        nan_cols = target[target.isna()].index

        min_dist = np.inf
        sim_ix = None
        for iy in range(n_rows):
            if iy == ix:
                continue
            tmp_dist, _ = heom(target, data.iloc[iy], min_max_df, column_data_types)
            if tmp_dist <= min_dist:
                min_dist = tmp_dist
                sim_ix = iy

        if sim_ix is None:
            # No other row to borrow from (e.g. a single-row frame)
            continue

        # Fill missing values with the values from the most similar row,
        # one scalar at a time so the column dtypes are left untouched
        for col_pos in data.columns.get_indexer(nan_cols):
            data.iat[ix, col_pos] = data.iat[sim_ix, col_pos]



#### In this cell we deal with the rest of our missing values by using one of our functions. We verify that no missing values are left

In [6]:
if MV_FLAG == 0:
    mode_mean_imputation(data)
    data.to_csv(output_filepath + '/joined_data_mm.csv', index = False)  
    
if MV_FLAG == 1:
    while data.isnull().sum().sum() > 0:
        %time heom_imputation (data)
    data.to_csv(output_filepath + '/joined_data_heom.csv', index = False)
    
null_columns=data.columns[data.isnull().any()]
print(data[null_columns].isnull().sum())


Series([], dtype: float64)
