## Importing Data with NumPy

In [2]:
import numpy as np 

### np.loadtxt() vs np.genfromtxt()

In [2]:
#np.loadtxt() is fast but it breaks when we feed it with incomplete or ill formatted datasets 
lending_co_data_numeric_1 = np.loadtxt("Lending-Company-Numeric-Data.csv", delimiter = ',')
lending_co_data_numeric_1

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [3]:
lending_co_data_numeric_2 = np.genfromtxt("Lending-Company-Numeric-Data.csv", delimiter = ',')
lending_co_data_numeric_2

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
np.array_equal(lending_co_data_numeric_1, lending_co_data_numeric_2)

True

In [7]:
#empty space counts as a symbol
lending_co_data_numeric_NAN = np.loadtxt("Lending-Company-Numeric-Data.csv", delimiter = ';')
lending_co_data_numeric_NAN

In [9]:
# Not a Number
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data.csv", delimiter = ';')
lending_co_data_numeric_NAN

array([nan, nan, nan, ..., nan, nan, nan])

In [11]:
#If missing values are there and you want to import everything then you can import them as a string 
# Saved as plain text rather than numbers 
#Use this method only if you want to observe the data 
#If you want to perform mathematical operations then import the data (if missing values are there) use np.genfromtxt
lending_co_data_numeric_NAN = np.loadtxt("Lending-Company-Numeric-Data.csv", delimiter = ';',dtype = np.str_)
lending_co_data_numeric_NAN

array(['2000,40,365,3121,4241,13621', '2000,40,365,3061,4171,15041',
       '1000,40,365,2160,3280,15340', ..., '2000,40,365,4201,5001,16600',
       '1000,40,365,2080,3320,15600', '2000,40,365,4601,4601,16600'],
      dtype='<U30')

### Partial Cleaning While Importing

In [3]:

lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:

lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';',skip_header=2)
lending_co_data_numeric_NAN

array([[ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [5]:

lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';',skip_footer=2)
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  3401.,    nan, 16600.],
       [ 2000.,    40.,   365.,    nan,  5440., 16600.],
       [   nan,    40.,   365.,  4201.,  5001., 16600.]])

In [7]:
#usecols - tells which columns you want to use 
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';',usecols=0)
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';',usecols=(0,1,5))
lending_co_data_numeric_NAN

array([[ 2000.,    40., 13621.],
       [ 2000.,    40., 15041.],
       [ 1000.,    40., 15340.],
       ...,
       [   nan,    40., 16600.],
       [ 1000.,    40., 15600.],
       [ 2000.,    40., 16600.]])

In [8]:

lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';',usecols=0)
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';',usecols=(5,0,1))
lending_co_data_numeric_NAN

array([[13621.,  2000.,    40.],
       [15041.,  2000.,    40.],
       [15340.,  1000.,    40.],
       ...,
       [16600.,    nan,    40.],
       [15600.,  1000.,    40.],
       [16600.,  2000.,    40.]])

In [9]:
#usecols - tells which columns you want to use 
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';',usecols=0)
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';',usecols=(0,1,5),
skip_header=2,skip_footer=2)
lending_co_data_numeric_NAN

array([[ 1000.,    40., 15340.],
       [ 2000.,    40., 15321.],
       [ 2000.,    50., 13720.],
       ...,
       [ 2000.,    40., 16600.],
       [ 2000.,    40., 16600.],
       [   nan,    40., 16600.]])

In [12]:
lending_co_data_5, lending_co_data_0, lending_co_data_1 = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';',usecols=(5,0,1),
skip_header=2,skip_footer=2, unpack = True)
print(lending_co_data_5)
print(lending_co_data_0)
print(lending_co_data_1)

[15340. 15321. 13720. ... 16600. 16600. 16600.]
[1000. 2000. 2000. ... 2000. 2000.   nan]
[40. 40. 50. ... 40. 40. 40.]


### String vs Object vs Numbers

In [16]:
#Be careful when you ask Python to perform computations after you have imported missing values categorized as integers
# By specifying the input data to be of specific type, the function generates the missing values differently
lending_co_lt = np.genfromtxt('lending-co-LT.csv', delimiter = ',' , dtype = np.int32)
lending_co_lt 
print(lending_co_lt)

[[   -1    -1    -1 ...    -1    -1    -1]
 [    1    -1    -1 ...    -1    -1 16600]
 [    2    -1    -1 ...    -1    -1 16600]
 ...
 [ 1041    -1    -1 ...    -1    -1 16600]
 [ 1042    -1    -1 ...    -1    -1 15600]
 [ 1043    -1    -1 ...    -1    -1 16600]]


In [17]:
lending_co_lt = np.genfromtxt('lending-co-LT.csv', delimiter = ',' , dtype = np.str_)
lending_co_lt 
print(lending_co_lt)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [18]:
lending_co_lt = np.genfromtxt('lending-co-LT.csv', delimiter = ',' , dtype = np.object_)
lending_co_lt 
print(lending_co_lt)
# b indicates that data inside the file is not just plain text so we cannot freely manipulate the values 

[[b'LoanID' b'StringID' b'Product' ... b'Location' b'Region'
  b'TotalPrice']
 [b'1' b'id_1' b'Product B' ... b'Location 2' b'Region 2' b'16600.0']
 [b'2' b'id_2' b'Product B' ... b'Location 3' b'' b'16600.0']
 ...
 [b'1041' b'id_1041' b'Product B' ... b'Location 23' b'Region 4'
  b'16600.0']
 [b'1042' b'id_1042' b'Product C' ... b'Location 52' b'Region 6'
  b'15600.0']
 [b'1043' b'id_1043' b'Product B' ... b'Location 142' b'Region 6'
  b'16600.0']]


In [20]:
lending_co_lt = np.genfromtxt('lending-co-LT.csv', delimiter = ',' , dtype = (np.int32,np.str_,np.str_,np.str_,np.str_, np.int32,np.int32))
print(lending_co_lt)

[(  -1, '', '', '', '', -1,    -1) (   1, '', '', '', '', -1, 16600)
 (   2, '', '', '', '', -1, 16600) ... (1041, '', '', '', '', -1, 16600)
 (1042, '', '', '', '', -1, 15600) (1043, '', '', '', '', -1, 16600)]
