In [1]:
import numpy as np
import pandas as pd

# import sys
# import os
# __file__ = '/Users/rag004/Documents/PhD/Code/HaphazardInputReview/HaphazardInputsReview/Code/DataCode'
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))

In [2]:
data_path = "/Users/rag004/Documents/PhD/Code/HaphazardInputReview/HaphazardInputs/Data/diabetes_us/diabetes_us.csv"

In [3]:
# Load diabetes us dataset:
def data_load_diabetes_us(data_path):
    data_name = "diabetes_us.csv"
    data_initial =  pd.read_csv(data_path, header = None, engine = 'python')
    # 1st row contains the header name so we remove that
    data_initial = data_initial[1:]
    # Only readmission days <30 is positive class, otherwise it is a negative class
    label = np.array(data_initial[49] == '<30')*1
    # The last column contains the labels
    data_initial = data_initial.iloc[:,:49]
    # The first two column is admission and patient id, so we drop these
    data_initial = data_initial.iloc[:,2:]
    # "?", "nan" for glucose serum test result (feat no. 22) and "nan" for A1c test result (feat no. 23), is considered
    # unavailable/unmeasured features. For the time being we denote this by "-1" and later we replace it with np.nan
    data_initial[data_initial == '?'] = "-1"
    data_initial[22] = data_initial[22].fillna("-1")
    data_initial[23] = data_initial[23].fillna("-1")
    # The age feature (feat no. 4) indicates only if the age is in a bracket of 10 ([0,10], [10,20], ...). We consider 
    # the median value of the bracket as the actual value and replace it with that
    val_list = sorted(list(set(np.unique(data_initial[4])).difference({"-1"})))
    age_list = [5, 15, 25, 35, 45, 55, 65, 75, 85, 95]
    for j in range(len(val_list)):
        data_initial.loc[data_initial[4] == val_list[j], 4] = age_list[j]
    # Similar to age feature, the weight feature (feat no. 5) is represented in brackets. We consider the median value 
    # and replace with that
    val_list = ['[0-25)', '[25-50)', '[50-75)', '[75-100)', '[100-125)', '[125-150)', '[150-175)', '[175-200)', '>200']
    weight_list = [12.5, 37.5, 62.5, 87.5, 112.5, 137.5, 162.5, 187.5, 212.5]
    for j in range(len(val_list)):
        data_initial.loc[data_initial[5] == val_list[j], 5] = age_list[j]
    # The below feat_list features contains categorical value. We transform them to numerical value by assigning them
    # value from 1 to the number of categories in each feature
    feat_list = [2, 3, 10, 11, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48]
    for i in feat_list:
        val_list = sorted(list(set(np.unique(data_initial[i])).difference({"-1"})))
        # print(val_list)
        for j in range(len(val_list)):
            data_initial.loc[data_initial[i] == val_list[j], i] = j+1
    # Substitute each position containing -1 with nan value
    data_initial[data_initial == "-1"] = np.nan
    # Convert everything to float type
    for i in data_initial.columns:
            data_initial[i] = np.array(data_initial[i]).astype(float)
    data_initial.insert(0, column="class", value=label)
    data = data_initial.sample(frac = 1)

    Y = np.array(data.iloc[:,:1])
    X = np.array(data.iloc[:,1:])

    # Missing columns are - 2,  5, 10, 11, 18, 19, 20, 22, 23 in the orginal dataset (zero indexing).
    # In the processed dataset missing columns are - [ 0,  3,  8,  9, 16, 17, 18, 20, 21] (zero index).

    return X, Y

In [4]:
def data_load_real(data_path, data_folder = "diabetes_us"):
    if data_folder == "diabetes_us":
        X, Y = data_load_diabetes_us(data_path)    
        mask = np.ones((X.shape))
        mask[np.isnan(X)] = 0
        X_haphazard = np.where(mask, X, 0)
        return X, Y, X_haphazard, mask

In [5]:
X, Y, X_haphazard, mask = data_load_real(data_path)

In [33]:
for i in range(47):
    print(np.sum(np.isnan(X[:, i:i+1])), np.nanmin(X[:, i:i+1]), np.nanmax(X[:, i:i+1]), np.unique(X[:, i:i+1]))

2273 1.0 5.0 [ 1.  2.  3.  4.  5. nan]
0 1.0 3.0 [1. 2. 3.]
0 5.0 95.0 [ 5. 15. 25. 35. 45. 55. 65. 75. 85. 95.]
98569 5.0 85.0 [ 5. 15. 25. 35. 45. 55. 65. 75. 85. nan]
0 1.0 8.0 [1. 2. 3. 4. 5. 6. 7. 8.]
0 1.0 28.0 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 22. 23. 24. 25. 27. 28.]
0 1.0 25.0 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 13. 14. 17. 20. 22. 25.]
0 1.0 14.0 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14.]
40256 1.0 17.0 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. nan]
49949 1.0 72.0 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36.
 37. 38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53. 54.
 55. 56. 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67. 68. 69. 70. 71. 72.
 nan]
0 1.0 132.0 [  1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22. 