# Data Manipulation

In [2]:
import numpy as np 

## Check for missing values

In [3]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
## If np.loadtxt() compiles first time, the dataset consists of only numeric values and has no missing data. 

In [4]:
np.isnan(lending_co_data_numeric).sum()

## isnan() determines whether data is missing data for the individual elements in an array (True -> Missing, False -> Not missing)
## By adding .sum(), we get the total number of missing elements in the data. 

0

In [6]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

In [7]:
np.isnan(lending_co_data_numeric_NAN).sum()

260

In [8]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';', filling_values=0)


## Filling_values substitutes every nan with the value we're passing (0 in this case)

In [9]:
np.isnan(lending_co_data_numeric_NAN).sum()
## All the previously missing values are now 0s.

0

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')
# We need to reimport the dataset since all the missing values are filled up. 

In [11]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1 
# We use nanmax(), since max() returns nan. 
# We want a value greater than the max, since we have be certain it's unique to the dataset.

In [12]:
temporary_fill

64002.0

In [13]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';', filling_values = temporary_fill)

# Filling up all the missing values with the temporary filler. 

In [14]:
np.isnan(lending_co_data_numeric_NAN)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [15]:
np.isnan(lending_co_data_numeric_NAN).sum()

0

## Substituting Missing Values

In [16]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [18]:
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)
temporary_mean

array([ 2250.25,    46.11,   365.  ,  3895.99,  5160.75, 16571.44])

In [19]:
temporary_mean[0]

2250.25

In [20]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1 
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter=';', filling_values=temporary_fill)
## Creating a unique filler and using it to take care of all the missing values.


In [21]:
temporary_fill

64002.0

In [22]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2)
# Supposed mean (w/ fillers)

4263.25

In [23]:
temporary_mean[0]
# Actual mean (w/0 fillers)

2250.25

In [27]:
lending_co_data_numeric_NAN[:,0] = np.where(lending_co_data_numeric_NAN[:,0] == temporary_fill,temporary_mean[0], lending_co_data_numeric_NAN[:,0])

# Going through the first column and substituting any temporary fillers (previously missing) with the mean for that column.

In [25]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2)
# New mean equals old mean. 

2250.25

In [28]:
#Iterating along the columns 
for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill,temporary_mean[i], lending_co_data_numeric_NAN[:,i])
# We're generalizing the filling from earlier and going through all the columns. 

In [30]:
for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] < 0, 0,
    lending_co_data_numeric_NAN[:,i])
# We can use this approach for other applications as well (e.g. remove all negative values and set them to 0)


## Reshaping

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

In [31]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

- Why is reshaping useful ? 
    - Certain conditions need to be met about shapes and sizes 
    - It is not always possible to store the outputs of a function as a part of an existing array (or series)

In [32]:
lending_co_data_numeric.shape

(1043, 6)

In [34]:
np.reshape(lending_co_data_numeric, (6, 1043))

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [35]:
np.transpose(lending_co_data_numeric)

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

- The number of elements of the new array and the old array must be same when reshaping 

In [38]:
# np.reshape(lending_co_data_numeric, (3,500))
#If you want 3 rows then divide the total no of columns by no_of_rows_desired
# 2086 in this case 
np.reshape(lending_co_data_numeric, (3,2086))

array([[ 2000.,    40.,   365., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  4601.,  4601., 16600.]])

- You can keep as many dimensions as you want 

In [40]:
#This will give me a 3-d array with 2 arrays of 3 columns
np.reshape(lending_co_data_numeric, (2,3,1043))

array([[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
        [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
        [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

       [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
        [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
        [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]])

In [42]:
# If you want to higher no of dimensions 
# 1 for each dimensions 
np.reshape(lending_co_data_numeric, (1,1,2,3,1043))

array([[[[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
          [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
          [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

         [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
          [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
          [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]]]])

- If you want to use an reshape array multiple times then you should store it

In [43]:
lending_co_data_numeric_2 = np.reshape(lending_co_data_numeric, (6,1043))
lending_co_data_numeric_2

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

## Removing Values

In [5]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

In [6]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [5]:
np.delete(lending_co_data_numeric, 0)
np.delete(lending_co_data_numeric, 0).shape 

(6257,)

In [6]:
lending_co_data_numeric.size 

6258

In [4]:
lending_co_data_numeric

NameError: name 'lending_co_data_numeric' is not defined

In [8]:
#What if you want to remove entire rows or columns ? 
# 0 - column 
np.delete(lending_co_data_numeric,0, axis = 0)
# 1 - rows
# np.delete(lending_co_data_numeric, 0, axis = 1)

array([[ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [14]:
#Deleting multiple columns 
np.delete(lending_co_data_numeric, (0,2,4), axis = 1)
np.delete(lending_co_data_numeric, [0,2,4], axis = 1)

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

In [16]:
#Deleting rows and columns simultaneously 
np.delete(np.delete(lending_co_data_numeric, (0,2,4), axis = 1), (0,2,-1), axis = 0)

array([[   40.,  3061., 15041.],
       [   40.,  3041., 15321.],
       [   50.,  3470., 13720.],
       ...,
       [   40.,  4240., 16600.],
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.]])

### Sorting Data

In [17]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [20]:
#np.sort() traverses the array backwards 
np.sort(lending_co_data_numeric)
np.sort(lending_co_data_numeric).shape

(1043, 6)

In [21]:
lending_co_data_numeric.shape 

(1043, 6)

In [25]:
np.sort(lending_co_data_numeric, axis = 0)
np.sort(lending_co_data_numeric, axis = None)

array([-2870., -2870., -2550., ..., 54625., 54625., 64001.])

In [23]:
#If you want numpy to remove scientific notations 
#This setting will apply to all of your code 
np.set_printoptions(suppress = True)

In [27]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [26]:
#Converts positive number to an negative number and sorts them 
np.sort(-lending_co_data_numeric)

array([[-13621.,  -4241.,  -3121.,  -2000.,   -365.,    -40.],
       [-15041.,  -4171.,  -3061.,  -2000.,   -365.,    -40.],
       [-15340.,  -3280.,  -2160.,  -1000.,   -365.,    -40.],
       ...,
       [-16600.,  -5001.,  -4201.,  -2000.,   -365.,    -40.],
       [-15600.,  -3320.,  -2080.,  -1000.,   -365.,    -40.],
       [-16600.,  -4601.,  -4601.,  -2000.,   -365.,    -40.]])

In [32]:
#All the signs of the number are flipped (sign is converted from positive to negative)
-np.sort(lending_co_data_numeric)

array([[   -40.,   -365.,  -2000.,  -3121.,  -4241., -13621.],
       [   -40.,   -365.,  -2000.,  -3061.,  -4171., -15041.],
       [   -40.,   -365.,  -1000.,  -2160.,  -3280., -15340.],
       ...,
       [   -40.,   -365.,  -2000.,  -4201.,  -5001., -16600.],
       [   -40.,   -365.,  -1000.,  -2080.,  -3320., -15600.],
       [   -40.,   -365.,  -2000.,  -4601.,  -4601., -16600.]])

In [31]:
#To sort in descending order 
-np.sort(-lending_co_data_numeric)

array([[13621.,  4241.,  3121.,  2000.,   365.,    40.],
       [15041.,  4171.,  3061.,  2000.,   365.,    40.],
       [15340.,  3280.,  2160.,  1000.,   365.,    40.],
       ...,
       [16600.,  5001.,  4201.,  2000.,   365.,    40.],
       [15600.,  3320.,  2080.,  1000.,   365.,    40.],
       [16600.,  4601.,  4601.,  2000.,   365.,    40.]])

In [36]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [37]:
# The np.sort() function returns a sorted version of the original array, rather than sorting it in place 
np.sort(lending_co_data_numeric[:,3])

array([-2870., -2550., -2450., ..., 16751., 17650., 19001.])

In [38]:
#ND_array.sort() 
#Takes the array variable
#Sorts the array variable 
#Stores the sorted version over the original

In [39]:
lending_co_data_numeric[:,3].sort()

In [40]:
lending_co_data_numeric

array([[ 2000.,    40.,   365., -2870.,  4241., 13621.],
       [ 2000.,    40.,   365., -2550.,  4171., 15041.],
       [ 1000.,    40.,   365., -2450.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., 16751.,  5001., 16600.],
       [ 1000.,    40.,   365., 17650.,  3320., 15600.],
       [ 2000.,    40.,   365., 19001.,  4601., 16600.]])

### Argument Funtions

#### np.argsort()

In [4]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [5]:
#Returns you an array of indices through which it can be sorted 
np.argsort(lending_co_data_numeric)

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]])

In [6]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [8]:
#Each column contains the values that will sort the column (in acscending order)
np.argsort(lending_co_data_numeric, axis = 0)

array([[ 537,  443,    0,   32,   32,  482],
       [ 639,  327,  687,  166,  166,  493],
       [ 849,  432,  688,   85,   85,  166],
       ...,
       [  27,  326,  355,  568, 1019,  568],
       [ 277,   27,  357,  718, 1033,  534],
       [ 420,  408, 1042,  912,  912,   27]])

In [12]:
lending_co_data_numeric[482,5]

-350.0

In [13]:
np.argsort(lending_co_data_numeric[:,0])

array([537, 639, 849, ...,  27, 277, 420])

In [7]:
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2240.,  3680., 15600.],
       [ 1000.,    40.,   365.,  2575.,  3635., 15600.],
       ...,
       [ 9000.,   125.,   365., 13001., 16726., 54625.],
       [ 9000.,   125.,   365., 10001., 10501., 24126.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

In [6]:
np.set_printoptions(suppress = True)

### np.argwhere()

In [2]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [3]:
np.argwhere(lending_co_data_numeric)

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]])

- np.argwhere() goes over the entire ND array and checks whether the individual elemetns satisfy a given condition
- The outputs are indices for all the individual elements where the condition is met 
- The default condition is to check for values different from 0

In [5]:
#To get the indices of elements where 0 value is present
np.argwhere(lending_co_data_numeric == False)

array([[116,   4],
       [430,   3]])

In [7]:
lending_co_data_numeric[116][4]

0.0

In [8]:
lending_co_data_numeric[430, 3]

0.0

In [9]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [10]:
np.argwhere(lending_co_data_numeric > 1000)

array([[   0,    0],
       [   0,    3],
       [   0,    4],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]])

In [11]:
np.argwhere(lending_co_data_numeric % 2 == 0)

array([[   0,    0],
       [   0,    1],
       [   1,    0],
       ...,
       [1042,    0],
       [1042,    1],
       [1042,    5]])

- Slicing gives us the actual value 
- np.argwhere() returns their coordinates within the array 

In [12]:
np.isnan(lending_co_data_numeric).sum()

0

In [10]:
lending_co_data_numeric_NAN = np.genfromtxt('Lending-company-Numeric-NAN.csv', delimiter = ';')
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [21]:
np.isnan(lending_co_data_numeric_NAN)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [ True, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [26]:
#0-false, 1-true
# Determine missing values in a dataset efficiently and accurately  
np.argwhere(np.isnan(lending_co_data_numeric_NAN))

array([[  11,    3],
       [  15,    3],
       [  27,    3],
       [  58,    3],
       [  60,    4],
       [  85,    4],
       [ 117,    5],
       [ 152,    1],
       [ 152,    2],
       [ 152,    4],
       [ 172,    1],
       [ 175,    1],
       [ 175,    2],
       [ 176,    3],
       [ 177,    4],
       [ 178,    5],
       [ 211,    3],
       [ 229,    0],
       [ 230,    1],
       [ 237,    1],
       [ 247,    3],
       [ 251,    5],
       [ 252,    4],
       [ 258,    1],
       [ 260,    3],
       [ 262,    4],
       [ 271,    5],
       [ 272,    4],
       [ 284,    2],
       [ 284,    3],
       [ 297,    1],
       [ 297,    2],
       [ 300,    3],
       [ 315,    3],
       [ 315,    5],
       [ 327,    4],
       [ 336,    4],
       [ 343,    0],
       [ 344,    2],
       [ 346,    2],
       [ 363,    3],
       [ 375,    3],
       [ 377,    2],
       [ 398,    5],
       [ 416,    4],
       [ 428,    0],
       [ 432,    1],
       [ 433,

In [24]:
lending_co_data_numeric_NAN[11,3]

nan

In [25]:
lending_co_data_numeric_NAN[1036,2]

nan

In [27]:
#Filling all the values in a single pass 
for array_index in np.argwhere(np.isnan(lending_co_data_numeric_NAN)):
    lending_co_data_numeric_NAN[array_index[0], array_index[1]] = 0 

In [30]:
lending_co_data_numeric_NAN[175,2]
lending_co_data_numeric_NAN[175]

array([ 2000.,     0.,     0.,  1851.,  3051., 13561.])

In [31]:
np.isnan(lending_co_data_numeric_NAN).sum()

0

### Shuffling Data
- Rearranging the parts of as dataset , without a fixed pattern 
- The output would be a random sample that would represent the entire dataset

In [32]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')[:8]
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [33]:
np.random.shuffle(lending_co_data_numeric)

In [34]:
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2380.,  3880., 15600.],
       [ 1000.,    40.,   365.,  2080.,  3280., 12560.],
       [ 2000.,    40.,   365.,  2920.,  4200., 15400.],
       ...,
       [ 2000.,    50.,   365., 12751., 15751., 20250.],
       [ 2000.,    50.,   365.,  3301.,  4751., 18351.],
       [ 2500.,    50.,   365.,  3250.,  4750., 20750.]])

In [36]:
np.random.shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 4.0000e+03,  5.0000e+01,  3.6500e+02,  5.6000e+03,  7.4000e+03,
         2.2250e+04],
       [ 4.0000e+03,  5.0000e+01,  3.6500e+02,  5.5000e+03,  6.9000e+03,
         2.2250e+04],
       [ 2.0000e+03,  4.0000e+01,  3.6500e+02,  3.0010e+03,  4.0010e+03,
         1.0381e+04],
       ...,
       [ 9.0000e+03,  1.2500e+02,  3.6500e+02,  1.2270e+04,  1.6070e+04,
         4.5745e+04],
       [ 2.0000e+03,  4.0000e+01,  3.6500e+02,  3.9400e+03,  4.9400e+03,
         1.6600e+04],
       [ 1.0000e+03,  5.0000e+01,  3.6500e+02, -2.1000e+03, -1.2000e+03,
         7.5200e+03]])

In [37]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

- Whenever you are using the same function or method many times in your analysis, it is a good idea to directly import it 

In [38]:
from numpy.random import shuffle

In [40]:
shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 9000.,   125.,   365., 12251., 15626., 34514.],
       [ 2000.,    50.,   365.,  6501.,  7301., 20250.],
       [ 2000.,    40.,   365.,  3561.,  4701., 15851.],
       ...,
       [ 2000.,    40.,   365.,  3121.,  4241., 16600.],
       [ 1000.,    40.,   365.,  3000.,  4240., 15600.],
       [ 1000.,    50.,   365., -2150.,  -750.,  3600.]])

In [41]:
from numpy.random import Generator as gen 
from numpy.random import PCG64 as pcg 

In [48]:
array_RG = gen(pcg())
array_RG.shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3600.,  4240., 16600.],
       [ 2000.,    40.,   365.,  3080.,  4300., 16600.],
       [ 1000.,    40.,   365.,  2240.,  4560., 14520.],
       ...,
       [ 2000.,    40.,   365.,  3261.,  4366.,  7276.],
       [ 2000.,    40.,   365.,  3121.,  3921., 12721.],
       [ 2000.,    40.,   365.,  3211.,  5181., 16600.]])

### Casting
- Taking an object with values of a certain datatype and creating an identical object that contains values of a different type 
- Creating an new array that stores the values of original array under a different type

In [2]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [5]:
#astype - assign type 
#float to integers
lending_co_data_numeric.astype(dtype = np.int32)

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

In [7]:
#np.astype() does not overwrite the array
lending_co_data_numeric.astype(dtype = np.str_)
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str_)

In [8]:
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [9]:
type(lending_co_data_numeric)

numpy.ndarray

- We cannot cast strings directly into integers 

In [10]:
lending_co_data_numeric.astype(dtype = np.int32)

ValueError: invalid literal for int() with base 10: '2000.0'

In [13]:
lending_co_data_numeric.astype(dtype = np.float32)
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.float32)
lending_co_data_numeric.astype(dtype = np.float32)

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]], dtype=float32)

- method to cast strings directly into integers 

In [14]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str_)
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [16]:
# Python executes code from left to right 
lending_co_data_numeric.astype(dtype = np.float32).astype(dtype = np.int32)

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

In [17]:
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

## Stripping Data 

In [18]:
lending_co_total_price = np.genfromtxt("Lending-Company-Total-Price.csv",delimiter = ',', dtype = np.str_, skip_header= 1, usecols = (1,2,4))
lending_co_total_price

array([['id_1', 'Product B', 'Location 2'],
       ['id_2', 'Product B', 'Location 3'],
       ['id_3', 'Product C', 'Location 5'],
       ...,
       ['id_413', 'Product B', 'Location 135'],
       ['id_414', 'Product C', 'Location 200'],
       ['id_415', 'Product A', 'Location 8']], dtype='<U12')

- Removing specific parts of strings 

In [20]:
np.chararray.strip(lending_co_total_price[:,0], "id_")

chararray(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
           '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
           '23', '24', '25', '26', '27', '28', '29', '30', '31', '32',
           '33', '34', '35', '36', '37', '38', '39', '40', '41', '42',
           '43', '44', '45', '46', '47', '48', '49', '50', '51', '52',
           '53', '54', '55', '56', '57', '58', '59', '60', '61', '62',
           '63', '64', '65', '66', '67', '68', '69', '70', '71', '72',
           '73', '74', '75', '76', '77', '78', '79', '80', '81', '82',
           '83', '84', '85', '86', '87', '88', '89', '90', '91', '92',
           '93', '94', '95', '96', '97', '98', '99', '100', '101', '102',
           '103', '104', '105', '106', '107', '108', '109', '110', '111',
           '112', '113', '114', '115', '116', '117', '118', '119', '120',
           '121', '122', '123', '124', '125', '126', '127', '128', '129',
           '130', '131', '132', '133', '134', '135', '136', '1

In [22]:
lending_co_total_price[:,0] = np.chararray.strip(lending_co_total_price[:,0], "id_")
lending_co_total_price[:,1] = np.chararray.strip(lending_co_total_price[:,1], "Product")
lending_co_total_price[:,2] = np.chararray.strip(lending_co_total_price[:,2], "Location")
lending_co_total_price 

array([[' 2', ' B', ' 2'],
       [' 3', ' B', ' 3'],
       [' 5', ' C', ' 5'],
       ...,
       [' 135', ' B', ' 135'],
       [' 200', ' C', ' 200'],
       [' 8', ' A', ' 8']], dtype='<U12')

In [31]:
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'A', 1, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'B', 2, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'C', 3, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'D', 4, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'E', 5, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'F', 6, lending_co_total_price[:,1]) 

lending_co_total_price

array([[' 2', ' B', ' 2'],
       [' 3', ' B', ' 3'],
       [' 5', ' C', ' 5'],
       ...,
       [' 135', ' B', ' 135'],
       [' 200', ' C', ' 200'],
       [' 8', ' A', ' 8']], dtype='<U12')

In [29]:
lending_co_total_price = lending_co_total_price.astype(dtype = np.int32)
lending_co_total_price 

ValueError: invalid literal for int() with base 10: ' B'

## Stacking 

In [2]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

- Cleaning the Lending-company-Numeric.csv
- Filling it with the maximum value 
- Replacing all the values with the appropriate mean 

In [35]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1 
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';', filling_values= temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill, temporary_mean[i], lending_co_data_numeric_NAN[:,i])

lending_co_data_numeric_NAN

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

- Placing multiple objects on top of one another to create a bigger (larger) object 
- We can just stack arrays of matching shapes to create a larger array - a "stack"

*  np.stack()

In [3]:
np.stack((lending_co_data_numeric[:,1], lending_co_data_numeric[:,0]))

array([[  40.,   40.,   40., ...,   40.,   40.,   40.],
       [2000., 2000., 1000., ..., 2000., 1000., 2000.]])

In [37]:
np.transpose(lending_co_data_numeric[:,:2])

array([[2000., 2000., 1000., ..., 2000., 1000., 2000.],
       [  40.,   40.,   40., ...,   40.,   40.,   40.]])

In [5]:
#Stacking along the columns
np.stack((lending_co_data_numeric[:,0], lending_co_data_numeric[:,1]), axis = 1)

array([[2000.,   40.],
       [2000.,   40.],
       [1000.,   40.],
       ...,
       [2000.,   40.],
       [1000.,   40.],
       [2000.,   40.]])

In [6]:
np.stack((lending_co_data_numeric[:,0], lending_co_data_numeric[:,1], lending_co_data_numeric[:,2]), axis = 1)

array([[2000.,   40.,  365.],
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       ...,
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       [2000.,   40.,  365.]])

In [8]:
#Arrays must have the same shape during stacking
np.stack((lending_co_data_numeric[:,0], lending_co_data_numeric[:,1], lending_co_data_numeric[:,:2]), axis = 1)

ValueError: all input arrays must have the same shape

### np.vstack()
- Vertical Stack 
- The function stacks 2-D arrays vertically 
- Places the first array on top of the second one 
- Results in a longer array 

In [11]:
lending_co_data_numeric_NAN.shape 

(1043, 6)

In [12]:
np.vstack((lending_co_data_numeric, lending_co_data_numeric_NAN))

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [13]:
np.vstack((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape 

(2086, 6)

### np.hstack()
- h stack = horizontal stack 
- Stacks values horizontally 
- The result should be a wider array 

In [15]:
np.hstack((lending_co_data_numeric, lending_co_data_numeric_NAN))

array([[ 2000.,    40.,   365., ...,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365., ...,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365., ...,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., ...,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365., ...,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365., ...,  4601.,  4601., 16600.]])

In [16]:
np.hstack((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape 

(1043, 12)

### np.dstack()
- d stack = depth stack 
- Stack arrays in the higher dimension 
- Returns an array of a higher dimension

In [17]:
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))

array([[[ 2000.,  2000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 3121.,  3121.],
        [ 4241.,  4241.],
        [13621., 13621.]],

       [[ 2000.,  2000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 3061.,  3061.],
        [ 4171.,  4171.],
        [15041., 15041.]],

       [[ 1000.,  1000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 2160.,  2160.],
        [ 3280.,  3280.],
        [15340., 15340.]],

       ...,

       [[ 2000.,    nan],
        [   40.,    40.],
        [  365.,   365.],
        [ 4201.,  4201.],
        [ 5001.,  5001.],
        [16600., 16600.]],

       [[ 1000.,  1000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 2080.,  2080.],
        [ 3320.,  3320.],
        [15600., 15600.]],

       [[ 2000.,  2000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 4601.,  4601.],
        [ 4601.,  4601.],
        [16600., 16600.]]])

In [18]:
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape 

(1043, 6, 2)

In [23]:
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))[0]
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))[0, : ,0]

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.])

In [25]:
# The stack function always returns an output that is exactly 1 dimension more than its inputs
np.stack((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = -1)

array([[[ 2000.,  2000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 3121.,  3121.],
        [ 4241.,  4241.],
        [13621., 13621.]],

       [[ 2000.,  2000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 3061.,  3061.],
        [ 4171.,  4171.],
        [15041., 15041.]],

       [[ 1000.,  1000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 2160.,  2160.],
        [ 3280.,  3280.],
        [15340., 15340.]],

       ...,

       [[ 2000.,    nan],
        [   40.,    40.],
        [  365.,   365.],
        [ 4201.,  4201.],
        [ 5001.,  5001.],
        [16600., 16600.]],

       [[ 1000.,  1000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 2080.,  2080.],
        [ 3320.,  3320.],
        [15600., 15600.]],

       [[ 2000.,  2000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 4601.,  4601.],
        [ 4601.,  4601.],
        [16600., 16600.]]])

- np.dstack() and np.stack() works along the 'third' axis, the two funtions work identically for 1-D and 2-D arrays

In [26]:
array_example_1 = np.array([[[1,2,3,4], [5,6,7,8], [9,10,11,12], [21,22,23,24], [25,26,27,28], [29,30,31,32]]])
array_example_2 = array_example_1 * 2 

In [28]:
np.dstack((array_example_1, array_example_2))

array([[[ 1,  2,  3,  4,  2,  4,  6,  8],
        [ 5,  6,  7,  8, 10, 12, 14, 16],
        [ 9, 10, 11, 12, 18, 20, 22, 24],
        [21, 22, 23, 24, 42, 44, 46, 48],
        [25, 26, 27, 28, 50, 52, 54, 56],
        [29, 30, 31, 32, 58, 60, 62, 64]]])

In [29]:
np.stack((array_example_1, array_example_2), axis = -1)

array([[[[ 1,  2],
         [ 2,  4],
         [ 3,  6],
         [ 4,  8]],

        [[ 5, 10],
         [ 6, 12],
         [ 7, 14],
         [ 8, 16]],

        [[ 9, 18],
         [10, 20],
         [11, 22],
         [12, 24]],

        [[21, 42],
         [22, 44],
         [23, 46],
         [24, 48]],

        [[25, 50],
         [26, 52],
         [27, 54],
         [28, 56]],

        [[29, 58],
         [30, 60],
         [31, 62],
         [32, 64]]]])

In [30]:
np.dstack((array_example_1, array_example_2)).shape 

(1, 6, 8)

In [31]:
np.stack((array_example_1, array_example_2), axis = -1).shape 

(1, 6, 4, 2)

### Concatenating Arrays
- Linking objects in a chain 
- Creating a larger array by merging existing arrays along a given axis 
- The inputs and outputs of the np.concatenate() function always have the same number of dimensions

In [2]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [3]:
np.concatenate((lending_co_data_numeric[0,:], lending_co_data_numeric[1,:]))

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.,  2000.,    40.,
         365.,  3061.,  4171., 15041.])

In [4]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1 
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';', filling_values= temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill, temporary_mean[i], lending_co_data_numeric_NAN[:,i])

lending_co_data_numeric_NAN

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [5]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN))

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [6]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape 

(2086, 6)

In [7]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = 1)

array([[ 2000.,    40.,   365., ...,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365., ...,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365., ...,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., ...,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365., ...,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365., ...,  4601.,  4601., 16600.]])

In [8]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = 1).shape 

(1043, 12)

In [10]:
array_example_1 = np.array([[[1,2,3,4], [5,6,7,8], [9,10,11,12], [21,22,23,24], [25,26,27,28], [29,30,31,32]]])
array_example_2 = array_example_1 * 2 

In [11]:
np.concatenate((array_example_1, array_example_2), axis = 0)

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12],
        [21, 22, 23, 24],
        [25, 26, 27, 28],
        [29, 30, 31, 32]],

       [[ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24],
        [42, 44, 46, 48],
        [50, 52, 54, 56],
        [58, 60, 62, 64]]])

In [13]:
np.dstack((array_example_1, array_example_2))

array([[[ 1,  2,  3,  4,  2,  4,  6,  8],
        [ 5,  6,  7,  8, 10, 12, 14, 16],
        [ 9, 10, 11, 12, 18, 20, 22, 24],
        [21, 22, 23, 24, 42, 44, 46, 48],
        [25, 26, 27, 28, 50, 52, 54, 56],
        [29, 30, 31, 32, 58, 60, 62, 64]]])

In [14]:
np.concatenate((array_example_1, array_example_2), axis = 1)

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12],
        [21, 22, 23, 24],
        [25, 26, 27, 28],
        [29, 30, 31, 32],
        [ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24],
        [42, 44, 46, 48],
        [50, 52, 54, 56],
        [58, 60, 62, 64]]])

In [16]:
np.hstack((array_example_1, array_example_2))

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12],
        [21, 22, 23, 24],
        [25, 26, 27, 28],
        [29, 30, 31, 32],
        [ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24],
        [42, 44, 46, 48],
        [50, 52, 54, 56],
        [58, 60, 62, 64]]])

- 1 & 2 D arrays 
- np.hstack() : np.concatenate((), axis = 0)
- np.vstack() : np.concatenate((), axis = 1)
- np.dstack() : np.concatenate((), axis = 2)

- Concatenating inputs of different shapes 
- If the two arrays are 1-d then they do not hold the same number of elements 

In [17]:
np.concatenate((lending_co_data_numeric[0,:], lending_co_data_numeric[:,0]))

array([2000.,   40.,  365., ..., 2000., 1000., 2000.])

- If we have arrays of the same dimensions, but different shapes, we can still concatenate them. But only if their dimensions match for the axis we are concatenating along 

In [18]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric[:,0]))

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

In [20]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric[:,:1]), axis = 1)

array([[ 2000.,    40.,   365., ...,  4241., 13621.,  2000.],
       [ 2000.,    40.,   365., ...,  4171., 15041.,  2000.],
       [ 1000.,    40.,   365., ...,  3280., 15340.,  1000.],
       ...,
       [ 2000.,    40.,   365., ...,  5001., 16600.,  2000.],
       [ 1000.,    40.,   365., ...,  3320., 15600.,  1000.],
       [ 2000.,    40.,   365., ...,  4601., 16600.,  2000.]])

### Finding Unique Values in a Dataset 

In [21]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

- np.unique() takes an array from an input and creates another array that contains all the different values from the first one 
- Arranged in increasing order in case of numbers but varys if the dataset has strings

In [22]:
np.unique(lending_co_data_numeric)

array([-2870., -2550., -2450., ..., 52751., 54625., 64001.])

In [23]:
np.unique(lending_co_data_numeric[:,1])

array([ 35.,  40.,  50., 125., 165.])

In [24]:
array_example = np.array(["a1", "a3","A1","A3","A3","AA1","B1","A2","B1","A2","B2","B2", "B3","a2","a3","B3","B3","a3" ])
np.unique(array_example)

# If the values of the array are text, the unique function sorts them in "alphabetical" order by their ASCII codes. 

array(['A1', 'A2', 'A3', 'AA1', 'B1', 'B2', 'B3', 'a1', 'a2', 'a3'],
      dtype='<U3')

In [27]:
np.unique(lending_co_data_numeric[:,1],return_counts = True)
# The first array depicts unique values 
# The second array depicts how many times these values occur 

(array([ 35.,  40.,  50., 125., 165.]), array([  4, 567, 451,  19,   2]))

In [29]:
#The middle array is the index array 
# Gives the index where the unique values occur 
np.unique(lending_co_data_numeric[:,1], return_counts = True, return_index = True)

(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27]),
 array([  4, 567, 451,  19,   2]))