In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## DAY 2 DATA WRANGLING WITH PYTHON/DAY 2 DATA WRANGLING WITH PYTHON ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [1]:
#=================================================-
#### Slide 5: Creating arrays  ####

# Importing numpy as 'np' sets 'np' as the shortcut/alias used by the
# later cells to reach NumPy functions.
import numpy as np

# Create an array from a list.
# The list mixes integers with one float (16.8); the printed array shows
# every element as a float.
arr = np.array([17, -10, 16.8, 11])
print(arr)

# Check the type of the object: np.array returns a numpy.ndarray.
print(type(arr))




[ 17.  -10.   16.8  11. ]
<class 'numpy.ndarray'>


In [2]:
#=================================================-
#### Slide 6: Dtype in arrays  ####

# Check the data type stored in the array.
# All elements of `arr` share one dtype; here it is float64.
print(arr.dtype)




float64


In [4]:
#=================================================-
#### Slide 7: Using ndarray  ####

# Build an array from integers only; it prints without decimal points
# because no float value is present to force a conversion.
x = np.array([3, 19, 7, 11])
print(x)




[ 3 19  7 11]


In [5]:
#=================================================-
#### Slide 9: Building an array with linspace  ####

# np.linspace(start, stop, num): 25 evenly spaced values from -2 to -1.
# Both endpoints appear in the printed result.
y = np.linspace(-2, -1, 25)
print(y)




[-2.         -1.95833333 -1.91666667 -1.875      -1.83333333 -1.79166667
 -1.75       -1.70833333 -1.66666667 -1.625      -1.58333333 -1.54166667
 -1.5        -1.45833333 -1.41666667 -1.375      -1.33333333 -1.29166667
 -1.25       -1.20833333 -1.16666667 -1.125      -1.08333333 -1.04166667
 -1.        ]


In [7]:
#=================================================-
#### Slide 10: Alternative ways of accessing functions  ####

# Import the two functions by name so they can be called without the
# 'np.' prefix. Note that this cell rebinds `x` and `y` from earlier slides.
from numpy import array, linspace
x = array([0.01, 0.45, -0.3])
y = linspace(0, 1, 50)




In [8]:
# A bare name on the last line of a cell displays the object's repr.
x

array([ 0.01,  0.45, -0.3 ])

In [9]:
# Display the 50 evenly spaced values from 0 to 1 stored in y.
y

array([0.        , 0.02040816, 0.04081633, 0.06122449, 0.08163265,
       0.10204082, 0.12244898, 0.14285714, 0.16326531, 0.18367347,
       0.20408163, 0.2244898 , 0.24489796, 0.26530612, 0.28571429,
       0.30612245, 0.32653061, 0.34693878, 0.36734694, 0.3877551 ,
       0.40816327, 0.42857143, 0.44897959, 0.46938776, 0.48979592,
       0.51020408, 0.53061224, 0.55102041, 0.57142857, 0.59183673,
       0.6122449 , 0.63265306, 0.65306122, 0.67346939, 0.69387755,
       0.71428571, 0.73469388, 0.75510204, 0.7755102 , 0.79591837,
       0.81632653, 0.83673469, 0.85714286, 0.87755102, 0.89795918,
       0.91836735, 0.93877551, 0.95918367, 0.97959184, 1.        ])

In [10]:
#=================================================-
#### Slide 13: Arrays vs Lists  ####

# Mixing numbers and strings: every element is converted to a string so
# the array keeps one common dtype (a Unicode string dtype, '<U21' in the
# recorded output).
mixed_array = np.array([1, 2, "apple", "XYZ", 5.5])
print(mixed_array)
print(mixed_array.dtype)
# Rebind the same name to an all-numeric list: the integers are promoted
# to float because of 5.56, giving dtype float64.
mixed_array = np.array([3, 12, 5.56])
print(mixed_array)
print(mixed_array.dtype)




['1' '2' 'apple' 'XYZ' '5.5']
<U21
[ 3.   12.    5.56]
float64


In [11]:
#=================================================-
#### Slide 15: Arrays from sequences (cont'd)  ####

# np.arange(start, stop) counts in steps of 1 and excludes `stop`,
# so a stop of 51 is needed for the result to end at 50.
rng = np.arange(0, 51)
print(rng)




[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50]


In [19]:
# The built-in range() yields the same 0..50 sequence as np.arange(0, 51);
# render it as one comma-separated string for comparison.
', '.join(map(str, range(0, 51)))

'0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50'

In [20]:
#=================================================-
#### Slide 16: Arrays from sequences - using a step size  ####

# The third argument is the step size; the stop value (23) is excluded,
# so the last even number produced is 22.
evens = np.arange(0, 23, 2)
print(evens)

# A fractional step works as well.
quarters = np.arange(0, 1, .25)  #<- contains 0 to 0.75
print(quarters)




[ 0  2  4  6  8 10 12 14 16 18 20 22]
[0.   0.25 0.5  0.75]


In [22]:
#=================================================-
#### Slide 17: Helper functions: min, max, and sum  ####

# Generate 5 evenly spaced numbers from 15 to 19, endpoints included.
x = np.linspace(15, 19, 5)


In [23]:
# Find the min of x (its smallest element).
np.amin(x)

15.0

In [24]:
# Find the max of x (its largest element).
np.amax(x)

19.0

In [25]:
# Find the sum of x (all elements added together).
np.sum(x)

85.0

In [26]:
#=================================================-
#### Slide 18: Convert an array to a list  ####

# Convert the array to a plain Python list.
# ndarray.tolist() turns each element into a native Python int, so the
# list prints as [0, 2, ...]. list(evens) would instead keep NumPy scalar
# objects as elements, which newer NumPy releases print as np.int64(0), ...
print(evens.tolist())




[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]


In [27]:
#=================================================-
#### Slide 19: Operations on arrays  ####

# Save two arrays of equal length as variables.
a = np.array([1,1,1,1])
b = np.array([2,2,2,2])

# Arithmetic operators work element by element on arrays.
# Addition of arrays.
print(a + b)

# Subtraction of arrays.
print(a - b)
# Multiplication of arrays (element-wise, not a matrix product).
print(a * b)
# Division of arrays; the result is float even though both inputs are integers.
print(a / b)


[3 3 3 3]
[-1 -1 -1 -1]
[2 2 2 2]
[0.5 0.5 0.5 0.5]


In [28]:
#=================================================-
#### Slide 20: Mathematical functions on lists   ####

# The built-in abs() accepts a single number, not a list, so this call
# raises TypeError. Catch it and print the message: the cell still shows
# the error, but "Run All" is no longer interrupted here.
try:
    abs([-2, -7, 1])
except TypeError as err:
    print("TypeError:", err)


TypeError: ignored

In [30]:
# Lists need an explicit loop: apply abs() to one element at a time.
[abs(number) for number in [-2, -7, 1]]

[2, 7, 1]

In [31]:
#=================================================-
#### Slide 21: Mathematical functions on arrays  ####

# np.abs works on a single number ...
print(np.abs(-3))
# ... and, unlike the built-in abs(), on a whole list at once.
print(np.abs([-2, -7, 1]))
# 20 up to (but not including) 30 in steps of 0.5 gives 20 values.
nums = np.arange(20, 30, .5)
print(len(nums))


3
[2 7 1]
20


In [33]:
# Display the 20 values stored in nums.
nums

array([20. , 20.5, 21. , 21.5, 22. , 22.5, 23. , 23.5, 24. , 24.5, 25. ,
       25.5, 26. , 26.5, 27. , 27.5, 28. , 28.5, 29. , 29.5])

In [32]:
#=================================================-
#### Slide 22: User-defined functions on arrays  ####

# Define a function that triples its input and then adds 1.
def some_calculation(arr):
    """Return ``3 * arr + 1``.

    With a NumPy array the arithmetic is applied to every element, so the
    result is an array of the same length.
    """
    tripled = 3 * arr
    return tripled + 1
    
# Call the function on the whole array; no explicit loop is needed.
print(some_calculation(nums))




[61.  62.5 64.  65.5 67.  68.5 70.  71.5 73.  74.5 76.  77.5 79.  80.5
 82.  83.5 85.  86.5 88.  89.5]


In [None]:
#=================================================-
#### Slide 24: Exercise 1  ####






In [34]:
#=================================================-
#### Slide 26: Accessing array values  ####

# Importing numpy as 'np' sets 'np' as the shortcut/alias.
import numpy as np

nums = np.arange(20, 30, .5) #<- create array: 20, 20.5, ..., 29.5
print(len(nums)) #<- get the length of array
print(nums[1])  #<- get the second element (indexing starts at 0)

print(nums[0:3]) #<- get the first three elements (the stop index 3 is excluded)




20
20.5
[20.  20.5 21. ]


In [35]:
#=================================================-
#### Slide 27: Logical filtering  ####

print(nums)

# `nums > 26` builds a True/False mask; indexing with it keeps only the
# elements where the mask is True (26 itself is excluded).
large_nums = nums[nums > 26]
print(large_nums)




[20.  20.5 21.  21.5 22.  22.5 23.  23.5 24.  24.5 25.  25.5 26.  26.5
 27.  27.5 28.  28.5 29.  29.5]
[26.5 27.  27.5 28.  28.5 29.  29.5]


In [36]:
#=================================================-
#### Slide 28: Logical filtering (cont'd)  ####

print(nums)

# Keep only the values strictly greater than 26 by indexing with the
# boolean result of the comparison.
large_nums = nums[nums > 26]
print(large_nums)




[20.  20.5 21.  21.5 22.  22.5 23.  23.5 24.  24.5 25.  25.5 26.  26.5
 27.  27.5 28.  28.5 29.  29.5]
[26.5 27.  27.5 28.  28.5 29.  29.5]


In [37]:
#=================================================-
#### Slide 29: Two-dimensional arrays  ####

# A list of equal-length lists becomes a 2-D array: each inner list is one row.
mat = np.array([[8, 2, 6, 8],
                [4, 5, 7, 2],
                [3, 9, 7, 1]])
print(mat)


[[8 2 6 8]
 [4 5 7 2]
 [3 9 7 1]]


In [38]:
#=================================================-
#### Slide 31: Two-dimensional arrays - shape (cont'd)  ####

print(mat.shape) #<- 3 rows and 4 columns -- returned as a tuple
# Unpack the tuple to get each dimension as its own variable.
nrows, ncols = mat.shape
print(nrows)




(3, 4)
3


In [39]:
#=================================================-
#### Slide 32: Two-dimensional arrays - extracting elements  ####

# Index a 2-D array with [row, column].
print(mat[1, 3]) #<- 2nd row 4th column - remember that indexing starts at 0!




2


In [40]:
#=================================================-
#### Slide 33: Two-dimensional arrays - rows  ####

# A bare ':' selects everything along that axis.
print(mat[0, :]) #<- first row
print(mat[0, 0:2]) #<- first row and just first 2 columns


[8 2 6 8]
[8 2]


In [41]:
#=================================================-
#### Slide 34: Two-dimensional arrays - columns  ####

print(mat[:, 2]) #<- 3rd column
print(mat[1:3, 2]) #<- 3rd column but skipping over the first row
# An integer index drops that axis (1-D result); a slice such as 2:3 keeps it (2-D result).
print(mat[1:3, 2:3]) #<- same as previous, but maintains the vertical structure of the column


[6 7 7]
[7 7]
[[7]
 [7]]


In [42]:
#=================================================-
#### Slide 35: Reshaping arrays  ####

# Rebinds `arr` (used for the float array on the first slides) to the integers 1..12.
arr = np.arange(1,13)
print(arr)
# Arrange the 12 values as 3 rows x 4 columns, filled row by row.
# The reshaped result is only printed; `arr` is not reassigned.
print(arr.reshape(3, 4))




[ 1  2  3  4  5  6  7  8  9 10 11 12]
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


In [43]:
#=================================================-
#### Slide 36: Reshaping arrays (cont'd)  ####

print(arr.reshape(2,      #<- specify number of rows=2
                  -1))    #<- number of columns=-1 lets NumPy infer it (12 / 2 = 6)

# Inference only works when the known dimension divides the array size.
# The 12 elements cannot be split into 5 equal rows, so this call raises
# ValueError. (-1 is the documented "infer this dimension" placeholder.)
# Catch the error and print it: the cell still shows the failure, but
# "Run All" is no longer interrupted here.
try:
    print(arr.reshape(5,      #<- specify number of rows=5
                      -2))    #<- negative column count; 12 is not divisible by 5
except ValueError as err:
    print("ValueError:", err)


[[ 1  2  3  4  5  6]
 [ 7  8  9 10 11 12]]


ValueError: ignored

In [52]:
#=================================================-
#### Slide 41: Import Pandas and os  ####

import pandas as pd
import os


In [53]:
#=================================================-
#### Slide 42: Directory settings  ####

# Set `main_dir` to the location of your `skill-soft` folder.
# os.path.expanduser("~") resolves to the current user's home directory on
# Linux (/home/<user>), Mac (/Users/<user>) and Windows (C:\Users\<user>),
# so no "[username]" placeholder has to be edited by hand, and a single
# assignment replaces the three platform-specific ones (of which only the
# last one ever took effect).
# If your folder lives somewhere else, replace this line with its full path.
main_dir = os.path.join(os.path.expanduser("~"), "Desktop", "skill-soft")
# Make `data_dir` from the `main_dir` and the remainder of the path to the
# data directory; os.path.join uses the correct separator for the platform.
data_dir = os.path.join(main_dir, "data")



In [54]:
#=================================================-
#### Slide 43: Working directory  ####

# Set working directory.
# os.chdir raises FileNotFoundError when `data_dir` does not exist, so make
# sure `main_dir` points at a real folder before running this cell.
os.chdir(data_dir)
# Check working directory.
print(os.getcwd())




FileNotFoundError: ignored