## IMPORTING DATA

In [1]:
import numpy as np
# Display settings for every array printed below: no scientific notation,
# 5 digits of precision, and lines wrapped at 100 characters.
np.set_printoptions(suppress=True, linewidth=100, precision=5)

## LOADING
Load the data using `np.loadtxt()` or `np.genfromtxt()`. `np.genfromtxt()` is usually preferred because it can handle missing values as specified by the caller.

By default, `np.genfromtxt()` uses `dtype=float`, which is why string columns are converted to NaNs: after all, they are Not a Number.

Since string and float values can't be read into the same array simultaneously, the columns must be split into string columns and float columns and loaded separately.



In [2]:
# First pass: parse every field as float so that text fields come back as NaN.
# The header line is not skipped, so it shows up as an all-NaN first row.
data = np.genfromtxt(
    'train.csv',
    dtype=float,
    delimiter=',',
    skip_header=0,
    encoding='unicode_escape',
)
data

array([[    nan,     nan,     nan, ...,     nan,     nan,     nan],
       [     1.,     60.,     nan, ...,     nan,     nan, 208500.],
       [     2.,     20.,     nan, ...,     nan,     nan, 181500.],
       ...,
       [  1458.,     70.,     nan, ...,     nan,     nan, 266500.],
       [  1459.,     20.,     nan, ...,     nan,     nan, 142125.],
       [  1460.,     20.,     nan, ...,     nan,     nan, 147500.]])

## Splitting
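Take the column-wise mean using `np.nanmean()`, which computes the arithmetic mean along the specified axis while ignoring NaNs. A column that contains only NaNs (a string column) has no values to average, so its mean is itself NaN.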


In [3]:
# Column-wise mean that ignores NaNs. Columns that contain nothing but NaN
# (the text columns) have no values to average and therefore come out as NaN.
tmp_mean = np.nanmean(data, axis=0)
tmp_mean

  tmp_mean = np.nanmean(data, axis=0)


array([   730.5    ,     56.89726,          nan,     70.04996,  10516.82808,          nan,
                nan,          nan,          nan,          nan,          nan,          nan,
                nan,          nan,          nan,          nan,          nan,      6.09932,
            5.57534,   1971.26781,   1984.86575,          nan,          nan,          nan,
                nan,          nan,    103.68526,          nan,          nan,          nan,
                nan,          nan,          nan,          nan,    443.63973,          nan,
           46.54932,    567.24041,   1057.42945,          nan,          nan,          nan,
                nan,   1162.62671,    346.99247,      5.84452,   1515.4637 ,      0.42534,
            0.05753,      1.56507,      0.38288,      2.86644,      1.04658,          nan,
            6.51781,          nan,      0.61301,          nan,          nan,   1978.50616,
                nan,      1.76712,    472.98014,          nan,          nan,          nan,

In the code cell below, `np.argwhere()` is used to find the indices of the NaN values in `tmp_mean`, since a NaN in `tmp_mean` indicates a string column; the indices of the non-NaN values give the numeric columns.

In [4]:
# A NaN mean marks a text column; any other mean marks a numeric column.
is_text_column = np.isnan(tmp_mean)
column_strings = np.argwhere(is_text_column)
column_numeric = np.argwhere(np.logical_not(is_text_column))

Use `np.squeeze()` to remove the single-dimensional entries from the shape of the numeric and string column index arrays.

In [5]:
# np.argwhere returns one row per match; drop the length-1 axis to get flat index arrays.
column_strings = np.squeeze(column_strings)
column_numeric = np.squeeze(column_numeric)

In [6]:
# Side-by-side comparison of three ways to obtain a 1-D index array.
np.squeeze(column_strings), np.ravel(column_strings), column_strings.flatten()

(array([ 2,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31,
        32, 33, 35, 39, 40, 41, 42, 53, 55, 57, 58, 60, 63, 64, 65, 72, 73, 74, 78, 79], dtype=int64),
 array([ 2,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31,
        32, 33, 35, 39, 40, 41, 42, 53, 55, 57, 58, 60, 63, 64, 65, 72, 73, 74, 78, 79], dtype=int64),
 array([ 2,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31,
        32, 33, 35, 39, 40, 41, 42, 53, 55, 57, 58, 60, 63, 64, 65, 72, 73, 74, 78, 79], dtype=int64))

## Accessing String and Numeric Columns 
Re-import the data, specifying the data type and the indices of the columns to read via the `dtype` and `usecols` parameters of `np.genfromtxt()`, respectively.

In [7]:
# Numeric columns: skip the header line and parse with genfromtxt's default dtype.
data_numeric = np.genfromtxt(
    'train.csv',
    usecols=column_numeric,
    delimiter=',',
    skip_header=1,
    autostrip=True,
    encoding='unicode_escape',
)

# Text columns: keep the header line (it becomes row 0) and read every field as a string.
data_string = np.genfromtxt(
    'train.csv',
    usecols=column_strings,
    dtype=np.str_,
    delimiter=',',
    skip_header=0,
    autostrip=True,
    encoding='unicode_escape',
)



# Pre-processing Steps

## STRING COLUMN PRE-PROCESSING

In [8]:
# Flag the string columns in which more than 50% of the records are 'NA'.
# data_string still carries the header line as row 0 at this point, so the
# number of records is one less than the number of rows. Both the column
# count and the 50% limit are derived from the array instead of being
# hard-coded, so the cell keeps working if the input file changes size.
na_counts = (data_string == 'NA').sum(axis=0)
na_limit = (data_string.shape[0] - 1) / 2
Dropping_Column_Indices = [col for col, count in enumerate(na_counts) if count > na_limit]

In [9]:
# Order the flagged column indices from highest to lowest so that deleting
# them one after another never shifts an index that is still pending.
# sorted() is used instead of an in-place reverse() so that re-running this
# cell gives the same order every time rather than flipping it back.
Dropping_Column_Indices = sorted(Dropping_Column_Indices, reverse=True)
print(Dropping_Column_Indices)

[40, 39, 38, 2]


In [10]:
# Remove every flagged column in one call. Passing the whole index list to
# np.delete does not depend on the order of the indices and copies the array
# once instead of once per dropped column.
# NOTE: data_string is rebound here, so re-running this cell deletes again.
data_string = np.delete(data_string, Dropping_Column_Indices, axis=1)

In [11]:
# Fill the missing entries ('NA') of every string column with that column's
# most frequent value.
for col in range(data_string.shape[1]):
    column = data_string[:, col]  # a view: the assignment below writes into data_string
    labels, label_counts = np.unique(column, return_counts=True)
    most_frequent = labels[np.argmax(label_counts)]
    # Match whole cells only. The previous np.char.replace call did a
    # substring replacement, so any label that merely contains "NA" was
    # partially rewritten as well.
    # NOTE(review): 'NA' itself takes part in the frequency count, so a column
    # whose most common value is 'NA' is left unchanged, exactly as before.
    column[column == 'NA'] = most_frequent

In [12]:
# Row 0 holds the column names because the string data was read with skip_header=False.
header_string=data_string[0]
header_string

array(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition'], dtype='<U13')

In [13]:
# Drop the header row so that only data records remain.
# NOTE: this rebinds data_string, so re-running the cell removes one more row each time.
data_string=data_string[1:]

In [14]:
# Label-encode every string column: the sorted unique labels of a column map
# to the codes 1..k. The codes are stored back as digit strings because
# data_string is still a string array at this point.
for col in range(data_string.shape[1]):
    # return_inverse yields, for every row, the position of its label within
    # the sorted unique labels. Encoding the whole column in one step avoids
    # the label-by-label in-place rewrite, where a freshly written code could
    # be mistaken for a not-yet-processed label that is itself a digit string.
    _, codes = np.unique(data_string[:, col], return_inverse=True)
    data_string[:, col] = (codes + 1).astype(np.str_)
    

In [15]:
# Inspect the encoded values; they are still stored as strings at this point.
data_string

array([['4', '2', '4', ..., '3', '9', '5'],
       ['4', '2', '4', ..., '3', '9', '5'],
       ['4', '2', '1', ..., '3', '9', '5'],
       ...,
       ['4', '2', '4', ..., '3', '9', '5'],
       ['4', '2', '4', ..., '3', '9', '5'],
       ['4', '2', '4', ..., '3', '9', '5']], dtype='<U13')

In [16]:
# Convert the digit strings produced by the encoding step into 32-bit integers.
data_string = data_string.astype(np.int32)
data_string

array([[4, 2, 4, ..., 3, 9, 5],
       [4, 2, 4, ..., 3, 9, 5],
       [4, 2, 1, ..., 3, 9, 5],
       ...,
       [4, 2, 4, ..., 3, 9, 5],
       [4, 2, 4, ..., 3, 9, 5],
       [4, 2, 4, ..., 3, 9, 5]])

## NUMERIC COLUMN PRE-PROCESSING

In [17]:
# Count the missing (NaN) entries of every numeric column and print them on a single line.
x = np.isnan(data_numeric).sum(axis=0)
print(''.join(f'{n}, ' for n in x), end='')

0, 0, 259, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [18]:
# Replace each missing numeric entry with the NaN-ignoring mean of its column.
columnwise_mean = np.nanmean(data_numeric, axis=0)

# (row indices, column indices) of every NaN cell.
indices = np.nonzero(np.isnan(data_numeric))

# Look up the column mean that belongs to each NaN cell and write it in place.
data_numeric[indices] = columnwise_mean[indices[1]]

In [19]:
# Read the numeric columns once more as text, keeping the header line (row 0),
# in order to recover the names of the numeric columns.
# np.str was deprecated in NumPy 1.20 and removed in later releases; np.str_
# is the NumPy string type and matches the dtype used for data_string.
header_numeric = np.genfromtxt('train.csv', delimiter=',', skip_header=0, autostrip=True,
                               encoding='unicode_escape', usecols=column_numeric, dtype=np.str_)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  usecols=column_numeric,dtype=np.str)


In [20]:
# Keep only the first row, i.e. the header line that was read with skip_header=False.
# NOTE: re-running this cell indexes the result again, so run it only once.
header_numeric=header_numeric[0]

# Creating the checkpoints

In [21]:
def checkpoint(filename, checkpoint_header, chekpoint_data):
    """Save a header/data pair to an ``.npz`` archive and load it back.

    Parameters
    ----------
    filename : str
        Path of the archive. ``np.savez`` appends ``.npz`` when the name does
        not already end with it, so ``"ckpt"`` and ``"ckpt.npz"`` refer to the
        same file.
    checkpoint_header : array_like
        Stored in the archive under the key ``"header"``.
    chekpoint_data : array_like
        Stored in the archive under the key ``"data"``.

    Returns
    -------
    NpzFile
        The archive re-opened with ``np.load``; index it with ``"header"`` or
        ``"data"`` to get the arrays back.
    """
    # Resolve the on-disk name once so that saving and loading always agree.
    # Previously ".npz" was appended unconditionally before loading, which
    # pointed at a non-existent "<name>.npz.npz" whenever the caller had
    # already supplied the suffix.
    archive_path = filename if filename.endswith(".npz") else filename + ".npz"
    np.savez(archive_path, header=checkpoint_header, data=chekpoint_data)
    return np.load(archive_path)

In [22]:
# Save the string-column header and encoded data to "Checkpoint-String.npz" and reload them.
checkpoint_string = checkpoint("Checkpoint-String", header_string, data_string)

In [23]:
# Save the numeric-column header and imputed data to "Checkpoint-numeric.npz" and reload them.
checkpoint_numeric = checkpoint("Checkpoint-numeric", header_numeric, data_numeric)

In [25]:
# Header array read back from the string checkpoint.
checkpoint_string["header"]

array(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition'], dtype='<U13')

In [26]:
# Encoded string-column data read back from the string checkpoint.
checkpoint_string["data"]

array([[4, 2, 4, ..., 3, 9, 5],
       [4, 2, 4, ..., 3, 9, 5],
       [4, 2, 1, ..., 3, 9, 5],
       ...,
       [4, 2, 4, ..., 3, 9, 5],
       [4, 2, 4, ..., 3, 9, 5],
       [4, 2, 4, ..., 3, 9, 5]])

In [27]:
# Header array read back from the numeric checkpoint.
checkpoint_numeric["header"]

array(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
       '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='<U13')

In [28]:
# Numeric data read back from the numeric checkpoint.
checkpoint_numeric["data"]

array([[     1.,     60.,     65., ...,      2.,   2008., 208500.],
       [     2.,     20.,     80., ...,      5.,   2007., 181500.],
       [     3.,     60.,     68., ...,      9.,   2008., 223500.],
       ...,
       [  1458.,     70.,     66., ...,      5.,   2010., 266500.],
       [  1459.,     20.,     68., ...,      4.,   2010., 142125.],
       [  1460.,     20.,     75., ...,      6.,   2008., 147500.]])

# Loading the data from checkpoints

In [30]:
# Re-open the archive written by checkpoint("Checkpoint-numeric", ...) directly from disk.
checkpoint_variable = np.load("Checkpoint-numeric"+".npz")

In [32]:
# Column names stored in the reloaded numeric checkpoint.
checkpoint_variable["header"]

array(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
       '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='<U13')

In [33]:
# Numeric data stored in the reloaded numeric checkpoint.
checkpoint_variable["data"]

array([[     1.,     60.,     65., ...,      2.,   2008., 208500.],
       [     2.,     20.,     80., ...,      5.,   2007., 181500.],
       [     3.,     60.,     68., ...,      9.,   2008., 223500.],
       ...,
       [  1458.,     70.,     66., ...,      5.,   2010., 266500.],
       [  1459.,     20.,     68., ...,      4.,   2010., 142125.],
       [  1460.,     20.,     75., ...,      6.,   2008., 147500.]])