# Preprocessing with NumPy

In [1]:
import numpy as np

## Checking for Missing Values

In [2]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

## If np.loadtxt() compiles first time, the dataset consists of only numeric values and has no missing data. 

In [3]:
np.isnan(lending_co_data_numeric).sum()

## isnan() determines whether data is missing data for the individual elements in an array (True -> Missing, False -> Not missing)
## By adding .sum(), we get the total number of missing elements in the data. 

0

In [4]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

In [5]:
np.isnan(lending_co_data_numeric_NAN).sum()

260

In [6]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';',
                                            filling_values = 0)

## Filling_values substitutes every nan with the value we're passing (0 in this case)

In [7]:
np.isnan(lending_co_data_numeric_NAN).sum()

## All the previously missing values are now 0s.

0

In [8]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';') 

# We need to reimport the dataset since all the missing values are filled up. 

In [9]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1

# We use nanmax(), since max() returns nan. 
# We want a value greater than the max, since we have be certain it's unique to the dataset.

In [10]:
temporary_fill

64002.0

In [11]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';',
                                            filling_values = temporary_fill) 

# Filling up all the missing values with the temporary filler. 

In [12]:
np.isnan(lending_co_data_numeric_NAN)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [13]:
np.isnan(lending_co_data_numeric_NAN).sum()

0

## Substituting Missing Values

In [14]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [15]:
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

## Storing the means of every column. 

In [16]:
temporary_mean[0]

2250.25

In [17]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                            delimiter = ';',
                                            filling_values = temporary_fill)

## Creating a unique filler and using it to take care of all the missing values.

In [18]:
temporary_fill

64002.0

In [19]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2) 

# Supposed mean (w/ fillers)

4263.25

In [20]:
temporary_mean[0]

# Actual mean (w/0 fillers)

2250.25

In [21]:
lending_co_data_numeric_NAN[:,0] = np.where(lending_co_data_numeric_NAN[:,0] == temporary_fill,
                                            temporary_mean[0], 
                                            lending_co_data_numeric_NAN[:,0])

# Going through the first column and substituting any temporary fillers (previously missing) with the mean for that column.

In [22]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2)

# New mean equals old mean. 

2250.25

In [23]:
for i in range(lending_co_data_numeric_NAN.shape[1]):        
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill, 
                                                temporary_mean[i], 
                                                lending_co_data_numeric_NAN[:,i])
    
# We're generalizing the filling from earlier and going through all the columns. 

In [90]:
for i in range(lending_co_data_numeric_NAN.shape[1]):        
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:, i] < 0,
                                                0, 
                                                lending_co_data_numeric_NAN[:,i])
    
# We can use this approach for other applications as well (e.g. remove all negative values and set them to 0)

## Reshaping

In [25]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

In [26]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [27]:
lending_co_data_numeric.shape

(1043, 6)

In [28]:
np.reshape(lending_co_data_numeric, (6,1043))

# Reshaping (1043,6) to (6,1043) is not the same as transposing.

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [29]:
np.transpose(lending_co_data_numeric)

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

In [30]:
np.reshape(lending_co_data_numeric, (1,1,2,3,1043))

# We can choose whatever shape we wish as long as the product of the dimensions matches the total number of elements in the array.

array([[[[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
          [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
          [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

         [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
          [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
          [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]]]])

In [31]:
lending_co_data_numeric

# Reshaping doesn't alter the original array. 

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [32]:
lending_co_data_numeric_2 = np.reshape(lending_co_data_numeric, (6,1043))
lending_co_data_numeric_2

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [33]:
lending_co_data_numeric.reshape(6,1043)

# Equivalent method. 

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [34]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

## Removing Values

In [35]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 

In [36]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [37]:
np.delete(lending_co_data_numeric, 0).shape

# Removes the first value of the flattened array. 

(6257,)

In [38]:
lending_co_data_numeric.size

6258

In [39]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [40]:
np.delete(lending_co_data_numeric, [0,2,4] , axis = 1)

# By setting an axis, we can simultaneously delete entire rows or columns. 

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

In [41]:
np.delete(np.delete(lending_co_data_numeric, [0,2,4] , axis = 1), [0,2,-1] , axis = 0)

# We can simultaneously delete rows AND columns. 

array([[   40.,  3061., 15041.],
       [   40.,  3041., 15321.],
       [   50.,  3470., 13720.],
       ...,
       [   40.,  4240., 16600.],
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.]])

## Sorting Data

In [42]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [43]:
np.sort(lending_co_data_numeric).shape

(1043, 6)

In [44]:
lending_co_data_numeric.shape

(1043, 6)

In [45]:
np.sort(lending_co_data_numeric, axis = None)

array([-2870., -2870., -2550., ..., 54625., 54625., 64001.])

In [46]:
np.set_printoptions(suppress = True)

# Supresses scientific notatoin when printing. 

In [47]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [48]:
-np.sort(-lending_co_data_numeric)

## Adding two minus signs sorts the array in descending order

array([[13621.,  4241.,  3121.,  2000.,   365.,    40.],
       [15041.,  4171.,  3061.,  2000.,   365.,    40.],
       [15340.,  3280.,  2160.,  1000.,   365.,    40.],
       ...,
       [16600.,  5001.,  4201.,  2000.,   365.,    40.],
       [15600.,  3320.,  2080.,  1000.,   365.,    40.],
       [16600.,  4601.,  4601.,  2000.,   365.,    40.]])

In [49]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [50]:
lending_co_data_numeric.sort(axis = 0)
lending_co_data_numeric

# The equivalent method stores the values in place. 

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

## Argument Functions

### np.argsort()

In [51]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [52]:
np.argsort(lending_co_data_numeric)

# Returns the order which will sort the array. 

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]], dtype=int64)

In [53]:
np.sort(lending_co_data_numeric, axis = 0)

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [54]:
np.argsort(lending_co_data_numeric, axis = 0)

array([[ 537,  443,    0,   32,   32,  482],
       [ 639,  327,  687,  166,  166,  493],
       [ 849,  432,  688,   85,   85,  166],
       ...,
       [  27,  326,  355,  568, 1019,  568],
       [ 277,   27,  357,  718, 1033,  534],
       [ 420,  408, 1042,  912,  912,   27]], dtype=int64)

In [55]:
lending_co_data_numeric[482,5]

-350.0

In [56]:
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
lending_co_data_numeric

# Sorts the array based on the values in the 1st column. 

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

In [57]:
lending_co_data_numeric.argsort(axis = 0)

# The method doesn't sort in place. 

array([[   0,   22,    0,  199,  199,  172],
       [ 155,   62,  687,   53,   53,  160],
       [ 156,   38,  688,  169,  169,   53],
       ...,
       [1022, 1042,  355, 1024, 1037, 1023],
       [1031, 1039,  357,  941, 1029, 1024],
       [1042, 1040, 1042, 1027, 1027, 1040]], dtype=int64)

In [58]:
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

### np.argwhere()

In [59]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [60]:
np.argwhere(lending_co_data_numeric == False)

# Default condition is to return values different from 0. 

array([[116,   4],
       [430,   3]], dtype=int64)

In [61]:
lending_co_data_numeric[430]

array([1000.,   50.,  365.,    0.,  550., 5650.])

In [62]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [63]:
np.argwhere(lending_co_data_numeric %2 == 0)

# The condition can be more complex 

array([[   0,    0],
       [   0,    1],
       [   1,    0],
       ...,
       [1042,    0],
       [1042,    1],
       [1042,    5]], dtype=int64)

In [64]:
np.isnan(lending_co_data_numeric).sum()

0

In [65]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';') 
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [66]:
np.argwhere(np.isnan(lending_co_data_numeric_NAN))

# Returns the coordinates of all the missing values within the array. 

array([[  11,    3],
       [  15,    3],
       [  27,    3],
       [  58,    3],
       [  60,    4],
       [  85,    4],
       [ 117,    5],
       [ 152,    1],
       [ 152,    2],
       [ 152,    4],
       [ 172,    1],
       [ 175,    1],
       [ 175,    2],
       [ 176,    3],
       [ 177,    4],
       [ 178,    5],
       [ 211,    3],
       [ 229,    0],
       [ 230,    1],
       [ 237,    1],
       [ 247,    3],
       [ 251,    5],
       [ 252,    4],
       [ 258,    1],
       [ 260,    3],
       [ 262,    4],
       [ 271,    5],
       [ 272,    4],
       [ 284,    2],
       [ 284,    3],
       [ 297,    1],
       [ 297,    2],
       [ 300,    3],
       [ 315,    3],
       [ 315,    5],
       [ 327,    4],
       [ 336,    4],
       [ 343,    0],
       [ 344,    2],
       [ 346,    2],
       [ 363,    3],
       [ 375,    3],
       [ 377,    2],
       [ 398,    5],
       [ 416,    4],
       [ 428,    0],
       [ 432,    1],
       [ 433,

In [67]:
lending_co_data_numeric_NAN[175]

array([ 2000.,    nan,    nan,  1851.,  3051., 13561.])

In [68]:
for array_index in np.argwhere(np.isnan(lending_co_data_numeric_NAN)):
    lending_co_data_numeric_NAN[array_index[0], array_index[1]] = 0

## By going through the coordinates of all the mising values of the array, we can fill them up. 

In [69]:
lending_co_data_numeric_NAN[175]

array([ 2000.,     0.,     0.,  1851.,  3051., 13561.])

In [70]:
np.isnan(lending_co_data_numeric_NAN).sum()

0

## Shuffling Data

In [71]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')[:8]
lending_co_data_numeric

# We can directly index the output of the np.loadtxt() function to only take certain parts of the dataset. 

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [72]:
np.random.shuffle(lending_co_data_numeric)

# Shuffles the array (and automatically overwrites it).

In [73]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.]])

In [74]:
np.random.shuffle(lending_co_data_numeric)
lending_co_data_numeric

# We can shuffle the array as many times as we wish (although 1 usually suffices).

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.]])

In [75]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

# We can now use the entire dataset. 

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [76]:
from numpy.random import shuffle

# We can import functions we use multiple times for convenience. 

In [77]:
shuffle(lending_co_data_numeric)
lending_co_data_numeric

# We write shuffle() instead of numpy.random.shuffle() since we imported the function earlier. 

array([[ 2000.,    40.,   365.,  3211.,  5181., 16600.],
       [ 2000.,    50.,   365.,  1851.,  3051., 13561.],
       [ 1000.,    40.,   365.,  2000.,  3340., 15600.],
       ...,
       [ 4000.,    50.,   365.,  5850.,  7350., 22250.],
       [ 2000.,    50.,   365.,  7251.,  7251., 20250.],
       [ 4000.,    50.,   365.,  5350.,  6850., 22250.]])

In [78]:
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg

# Random generators can be used for shuffling. 

In [79]:
array_RG = gen(pcg(seed = 365))
array_RG.shuffle(lending_co_data_numeric)
lending_co_data_numeric

# Seeds don't work for shuffling (and it's intended).

array([[ 2000.,    40.,   365.,  3170.,  4340., 11340.],
       [ 1000.,    50.,   365.,   450.,  1700.,  9370.],
       [ 2000.,    50.,   365.,  4251.,  6251., 20250.],
       ...,
       [ 2000.,    40.,   365.,  3600.,  4240., 16600.],
       [ 2000.,    40.,   365.,  3321.,  4421., 15791.],
       [ 1000.,    40.,   365.,  2120.,  3320., 14870.]])

## Casting

In [80]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [81]:
lending_co_data_numeric.astype(dtype = np.int32)

# Creates an integer version of the array. 

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]])

In [82]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str)

# We need to overwrite the variable in order to work with strings. 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str)


In [83]:
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [84]:
type(lending_co_data_numeric)

numpy.ndarray

In [85]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.float32)
lending_co_data_numeric.astype(dtype = np.int32)

## We can't directly cast strings to integers. We can go through floats (string -> float -> integer).

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]])

In [86]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str)
lending_co_data_numeric

# To showcase the other way to go from strings to integers, we need to get the strings version of the array once again. 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str)


array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [87]:
lending_co_data_numeric.astype(dtype = np.float32).astype(dtype = np.int32)
lending_co_data_numeric

## We can chain methods in NumPy.

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

## Stripping Data

In [88]:
lending_co_total_price = np.genfromtxt("Lending-Company-Total-Price.csv",
                                       delimiter = ',',
                                       dtype = np.str,
                                       skip_header = 1, 
                                       usecols = [1,2,4])
lending_co_total_price

# We don't neeed the entire array. We only want a few columns to showcase how stripping data works.

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype = np.str,


OSError: Lending-Company-Total-Price.csv not found.

In [None]:
lending_co_total_price[:,0] = np.chararray.strip(lending_co_total_price[:,0], "id_")
lending_co_total_price[:,1] = np.chararray.strip(lending_co_total_price[:,1], "Product ")
lending_co_total_price[:,2] = np.chararray.strip(lending_co_total_price[:,2], "Location ")
lending_co_total_price

# Remove "id_" from the 1st column, as well as "Product " from the second and "Location " from the third one. 

array([['1', 'B', '2'],
       ['2', 'B', '3'],
       ['3', 'C', '5'],
       ...,
       ['413', 'B', '135'],
       ['414', 'C', '200'],
       ['415', 'A', '8']], dtype='<U12')

In [None]:
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'A', 1, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'B', 2, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'C', 3, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'D', 4, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'E', 5, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'F', 6, lending_co_total_price[:,1]) 

lending_co_total_price

# We can combine stripping with substituting to transform all the letters in numbers. 

array([['1', '2', '2'],
       ['2', '2', '3'],
       ['3', '3', '5'],
       ...,
       ['413', '2', '135'],
       ['414', '3', '200'],
       ['415', '1', '8']], dtype='<U12')

In [None]:
lending_co_total_price = lending_co_total_price.astype(dtype = np.int32)
lending_co_total_price

# Even though the values look like numbers, they're actually just text, so we need to cast them once again. 

array([[  1,   2,   2],
       [  2,   2,   3],
       [  3,   3,   5],
       ...,
       [413,   2, 135],
       [414,   3, 200],
       [415,   1,   8]])

## Stacking

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# Recall

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';', 
                                            filling_values = temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill,
                                                temporary_mean[i],
                                                lending_co_data_numeric_NAN[:,i])
lending_co_data_numeric_NAN


## We create a filler, reimport and fill all the nan-s, then subsitute all the temporary fillers with more appropriate values

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [None]:
np.stack((lending_co_data_numeric[:,1],lending_co_data_numeric[:,0]))

# Stacking the first 2 columns. (We can stack them in any order we like)

array([[  40.,   40.,   40., ...,   40.,   40.,   40.],
       [2000., 2000., 1000., ..., 2000., 1000., 2000.]])

In [None]:
np.transpose(lending_co_data_numeric[:,:2])

array([[2000., 2000., 1000., ..., 2000., 1000., 2000.],
       [  40.,   40.,   40., ...,   40.,   40.,   40.]])

In [None]:
np.stack((lending_co_data_numeric[:,0],lending_co_data_numeric[:,1], lending_co_data_numeric[:,2]), axis = 1)

# We can stack more than 2 arrays. 

array([[2000.,   40.,  365.],
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       ...,
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       [2000.,   40.,  365.]])

In [None]:
lending_co_data_numeric_NAN.shape

(1043, 6)

In [None]:
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))[0,:,0]

# We can stack 2-D arrays as well. 

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.])

In [None]:
np.stack((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = -1)

# We can stack along a given axis (with np.stack())

array([[[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3121.  ,  3121.  ],
        [ 4241.  ,  4241.  ],
        [13621.  , 13621.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3061.  ,  3061.  ],
        [ 4171.  ,  4171.  ],
        [15041.  , 15041.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2160.  ,  2160.  ],
        [ 3280.  ,  3280.  ],
        [15340.  , 15340.  ]],

       ...,

       [[ 2000.  ,  2250.25],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 4201.  ,  4201.  ],
        [ 5001.  ,  5001.  ],
        [16600.  , 16600.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2080.  ,  2080.  ],
        [ 3320.  ,  3320.  ],
        [15600.  , 15600.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  

In [None]:
array_example_1 = np.array([[[1,2,3,4],[5,6,7,8],[9,10,11,12]],[[21,22,23,24],[25,26,27,28],[29,30,31,32]]])
array_example_2 = array_example_1 * 2

# We're quickly creating some 3-D arrays to showcase how dstack works for higher dimensions. 

In [None]:
np.dstack((array_example_1, array_example_2)).shape

(2, 3, 8)

In [None]:
np.stack((array_example_1, array_example_2), axis = 2).shape

# We can no longer replicate the output of dstack by simply specifying an axis. 

(2, 3, 2, 4)

## Concatenate

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
np.concatenate((lending_co_data_numeric[0,:], lending_co_data_numeric[1,:]))

# The concatenated array has the same number of dimensions as the inputs. 

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.,  2000.,    40.,
         365.,  3061.,  4171., 15041.])

In [None]:
#Recall: 
    
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                            delimiter = ';', 
                                            filling_values = temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):        
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill,
                                                temporary_mean[i],
                                                lending_co_data_numeric_NAN[:,i])
    
lending_co_data_numeric_NAN

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [None]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = 1).shape

(1043, 12)

In [None]:
array_example_1 = np.array([[[1,2,3,4],[5,6,7,8],[9,10,11,12]],[[21,22,23,24],[25,26,27,28],[29,30,31,32]]])
array_example_2 = array_example_1 * 2

# We create 3-D arrays to showcase concatenate vs stacking

In [None]:
np.concatenate((array_example_1, array_example_2), axis = 2)

array([[[ 1,  2,  3,  4,  2,  4,  6,  8],
        [ 5,  6,  7,  8, 10, 12, 14, 16],
        [ 9, 10, 11, 12, 18, 20, 22, 24]],

       [[21, 22, 23, 24, 42, 44, 46, 48],
        [25, 26, 27, 28, 50, 52, 54, 56],
        [29, 30, 31, 32, 58, 60, 62, 64]]])

In [None]:
np.dstack((array_example_1, array_example_2))

NameError: name 'np' is not defined

In [None]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric[:,:1]), axis = 1)

array([[ 2000.,    40.,   365., ...,  4241., 13621.,  2000.],
       [ 2000.,    40.,   365., ...,  4171., 15041.,  2000.],
       [ 1000.,    40.,   365., ...,  3280., 15340.,  1000.],
       ...,
       [ 2000.,    40.,   365., ...,  5001., 16600.,  2000.],
       [ 1000.,    40.,   365., ...,  3320., 15600.,  1000.],
       [ 2000.,    40.,   365., ...,  4601., 16600.,  2000.]])

## Unique 

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
np.unique(lending_co_data_numeric[:,1], return_counts = True, return_index = True)

# Unique -> returns the unique values within the array in increasing order
# return_counts -> returns how many times each unique value appears in the array
# return_index -> returns the index of the first encounter with each unique value

(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27], dtype=int64),
 array([  4, 567, 451,  19,   2], dtype=int64))

In [None]:
array_example = np.array(["a1", "a3","A1","A3","A3","AA1","B1","A2","B1","A2","B2","B2", "B3","a2","a3","B3","B3","a3" ])
np.unique(array_example)

# If the values of the array are text, the unique function sorts them in "alphabetical" order by their ASCII codes. 

array(['A1', 'A2', 'A3', 'AA1', 'B1', 'B2', 'B3', 'a1', 'a2', 'a3'],
      dtype='<U3')