#Numpy Tutorial

Numpy is a computational library for Python that is optimized for operations on multi-dimensional arrays. In this notebook we will use numpy to work with 1-d arrays (often called vectors) and 2-d arrays (often called matrices).

For a the full user guide and reference for numpy see: http://docs.scipy.org/doc/numpy/

In [57]:
import numpy as np # importing this way allows us to refer to numpy as np
import pandas as pd
import graphlab

In [73]:
sales = graphlab.SFrame('kc_house_data.gl/')

train_data,test_data = sales.random_split(.8,seed=0)
train_data.save('train_pd.csv',format= "csv")
test_data.save('test_pd.csv')
train_data= pd.read_csv('train_pd.csv')
test_data=pd.read_csv('test_pd.csv')

In [116]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [141]:
fea=['sqft_living','sqft_living15']
da=train_data[fea]
pd.concat([train_data['price'],da],axis=1)
#train_data['sqft_living','sqft_living15']

Unnamed: 0,price,sqft_living,sqft_living15
0,221900.0,1180,1340
1,538000.0,2570,1690
2,180000.0,770,2720
3,604000.0,1960,1360
4,510000.0,1680,1800
5,1225000.0,5420,4760
6,257500.0,1715,2238
7,291850.0,1060,1650
8,229500.0,1780,1780
9,323000.0,1890,2390


In [142]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    #features = ['constant'] + features
    df=pd.concat([data_sframe['constant'],data_sframe[features]],axis=1)
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_matrix = df.values#[['constant',features]].values
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    #features_matrix = features_sframe.to_numpy()
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array:
    output_array = output_sarray.to_numpy() # GraphLab Create>= 1.7!!
    return(features_matrix, output_array)


In [134]:
fea=['sqft_living','sqft_living15']
get_numpy_data(train_data,fea,'price')

(array([[1.00e+00,      nan,      nan],
        [1.00e+00,      nan,      nan],
        [1.00e+00,      nan,      nan],
        ...,
        [     nan, 1.53e+03, 1.53e+03],
        [     nan, 1.60e+03, 1.41e+03],
        [     nan, 1.02e+03, 1.02e+03]]),
 array([221900., 538000., 180000., ..., 360000., 400000., 325000.]))

In [77]:
def predict_outcome(feature_matrix, weights):
    predictions=np.dot(feature_matrix,weights)
    return(predictions)



In [19]:
def feature_derivative(errors, feature):
    derivative= 2*np.dot(errors, feature)
    return(derivative)

dtype: float
Rows: 17384
[1180.0, 2570.0, 770.0, 1960.0, 1680.0, 5420.0, 1715.0, 1060.0, 1780.0, 1890.0, 3560.0, 1160.0, 1370.0, 1810.0, 1890.0, 1600.0, 1200.0, 1250.0, 1620.0, 3050.0, 2270.0, 1070.0, 2450.0, 2450.0, 1400.0, 1520.0, 2570.0, 1190.0, 2330.0, 2060.0, 2300.0, 1660.0, 2360.0, 1220.0, 2570.0, 3595.0, 1570.0, 1280.0, 3160.0, 990.0, 2290.0, 1250.0, 2753.0, 1190.0, 3150.0, 1410.0, 1980.0, 2730.0, 2830.0, 2420.0, 3250.0, 1850.0, 2150.0, 2519.0, 1540.0, 1660.0, 2770.0, 2720.0, 2240.0, 1000.0, 3200.0, 4770.0, 1260.0, 2380.0, 3430.0, 1760.0, 1040.0, 1410.0, 3450.0, 2350.0, 2020.0, 1680.0, 960.0, 2140.0, 2660.0, 2770.0, 1610.0, 1030.0, 3520.0, 1200.0, 1580.0, 1580.0, 3300.0, 1160.0, 1810.0, 2320.0, 2070.0, 1980.0, 2190.0, 2920.0, 1210.0, 2340.0, 1670.0, 1240.0, 3140.0, 2310.0, 1260.0, 1540.0, 2080.0, 4380.0, ... ]

In [69]:
import math
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        # compute the predictions based on feature_matrix and weights:
        predictions = np.dot(feature_matrix, weights)
        # compute the errors as predictions - output:
        errors = predictions - output
        gradient_sum_squares = 0 # initialize the gradient
        # while not converged, update each weight individually:
        for i in range(len(weights)):
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            derivative = 2*np.dot(errors, feature_matrix[:,i])
            # add the squared derivative to the gradient magnitude
            gradient_sum_squares = gradient_sum_squares+derivative*derivative
            # update the weight based on step size and derivative:
            weights = weights-step_size*derivative
        gradient_magnitude = math.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)


In [70]:
simple_features = 'sqft_living'
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7
(simple_feature_matrix,output,initial_weights)


(array([[   1, 1180],
        [   1, 2570],
        [   1,  770],
        ...,
        [   1, 1530],
        [   1, 1600],
        [   1, 1020]]),
 array([221900., 538000., 180000., ..., 360000., 400000., 325000.]),
 array([-4.7e+04,  1.0e+00]))

In [71]:
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size, tolerance)

In [72]:
simple_weights

array([-46719.20069412,    281.79930588])

In [78]:
(test_simple_feature_matrix,test_output)= get_numpy_data(test_data, simple_features, my_output)

In [85]:
t_prediction=predict_outcome(test_simple_feature_matrix,simple_weights)

In [86]:
error=t_prediction-test_output
#len(test_output)
SS = np.dot(error,error)
RSS = sqrt(SS)
print RSS


275393140686060.2

In [144]:
model_features =['sqft_living','sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [145]:
simple_2_weights = regression_gradient_descent(feature_matrix, output,initial_weights, step_size, tolerance)

  from ipykernel import kernelapp as app


KeyboardInterrupt: 

# Creating Numpy Arrays

New arrays can be made in several ways. We can take an existing list and convert it to a numpy array:

In [2]:
mylist = [1., 2., 3., 4.]
mynparray = np.array(mylist)
mynparray

array([1., 2., 3., 4.])

You can initialize an array (of any dimension) of all ones or all zeroes with the ones() and zeros() functions:

In [3]:
one_vector = np.ones(4)
print one_vector # using print removes the array() portion

[1. 1. 1. 1.]


In [18]:
one2Darray = np.ones((2, 4)) # an 2D array with 2 "rows" and 4 "columns"
print one2Darray

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [5]:
zero_vector = np.zeros(4)
print zero_vector

[0. 0. 0. 0.]


You can also initialize an empty array which will be filled with values. This is the fastest way to initialize a fixed-size numpy array however you must ensure that you replace all of the values.

In [14]:
empty_vector = np.empty((5,5))
print empty_vector

[[0.00000000e+000 2.14027814e+161 6.16553830e+223 4.30808520e-096
  6.32299154e+233]
 [3.99910946e+252 2.16385138e+190 9.32260730e-067 6.54878222e-043
  6.95799097e-042]
 [5.39385113e-062 6.21173359e-144 4.82337433e+228 6.14415221e-144
  4.82337723e+228]
 [2.37733711e+184 9.42106097e-067 1.49108900e+161 1.45094644e+165
  4.50607560e-144]
 [3.38040122e-067 1.26037325e-076 2.17046565e-028 6.74928942e-067
  2.91474422e+126]]


#Accessing array elements

Accessing an array is straight forward. For vectors you access the index by referring to it inside square brackets. Recall that indices in Python start with 0.

In [19]:
mynparray[2]

3.0

2D arrays are accessed similarly by referring to the row and column index separated by a comma:

In [20]:
my_matrix = np.array([[1, 2, 3], [4, 5, 6]])
print my_matrix

[[1 2 3]
 [4 5 6]]


In [21]:
print my_matrix[1, 2]

6


Sequences of indices can be accessed using ':' for example

In [22]:
print my_matrix[0:2, 2] # recall 0:2 = [0, 1]

[3 6]


In [23]:
print my_matrix[0, 0:3]

[1 2 3]


You can also pass a list of indices. 

In [26]:
fib_indices = np.array([1, 1, 2, 3])
random_vector = np.random.random(10) # 10 random numbers between 0 and 1
print random_vector

[0.29824429 0.80171477 0.33748712 0.94698499 0.77643002 0.6902945
 0.85187497 0.6071626  0.70458221 0.10099704]


In [27]:
print random_vector[fib_indices]

[0.80171477 0.80171477 0.33748712 0.94698499]


You can also use true/false values to select values

In [28]:
my_vector = np.array([1, 2, 3, 4])
select_index = np.array([True, False, True, False])
print my_vector[select_index]

[1 3]


For 2D arrays you can select specific columns and specific rows. Passing ':' selects all rows/columns

In [29]:
select_cols = np.array([True, False, True]) # 1st and 3rd column
select_rows = np.array([False, True]) # 2nd row

In [30]:
print my_matrix[select_rows, :] # just 2nd row but all columns

[[4 5 6]]


In [31]:
print my_matrix[:, select_cols] # all rows and just the 1st and 3rd column

[[1 3]
 [4 6]]


#Operations on Arrays

You can use the operations '\*', '\*\*', '\\', '+' and '-' on numpy arrays and they operate elementwise.

In [33]:
my_array = np.array([1., 2., 3., 4.])
print my_array*my_array

[ 1.  4.  9. 16.]


In [34]:
print my_array**2

[ 1.  4.  9. 16.]


In [35]:
print my_array - np.ones(4)

[0. 1. 2. 3.]


In [36]:
print my_array + np.ones(4)

[2. 3. 4. 5.]


In [37]:
print my_array / 3

[0.33333333 0.66666667 1.         1.33333333]


In [38]:
print my_array / np.array([2., 3., 4., 5.]) # = [1.0/2.0, 2.0/3.0, 3.0/4.0, 4.0/5.0]

[0.5        0.66666667 0.75       0.8       ]


You can compute the sum with np.sum() and the average with np.average()

In [39]:
print np.sum(my_array)

10.0


In [40]:
print np.average(my_array)

2.5


In [41]:
print np.sum(my_array)/len(my_array)

2.5


#The dot product

An important mathematical operation in linear algebra is the dot product. 

When we compute the dot product between two vectors we are simply multiplying them elementwise and adding them up. In numpy you can do this with np.dot()

In [42]:
array1 = np.array([1., 2., 3., 4.])
array2 = np.array([2., 3., 4., 5.])
print np.dot(array1, array2)

40.0


In [43]:
print np.sum(array1*array2)

40.0


Recall that the Euclidean length (or magnitude) of a vector is the squareroot of the sum of the squares of the components. This is just the squareroot of the dot product of the vector with itself:

In [44]:
array1_mag = np.sqrt(np.dot(array1, array1))
print array1_mag

5.477225575051661


In [45]:
print np.sqrt(np.sum(array1*array1))

5.477225575051661


We can also use the dot product when we have a 2D array (or matrix). When you have an vector with the same number of elements as the matrix (2D array) has columns you can right-multiply the matrix by the vector to get another vector with the same number of elements as the matrix has rows. For example this is how you compute the predicted values given a matrix of features and an array of weights.

In [46]:
my_features = np.array([[1., 2.], [3., 4.], [5., 6.], [7., 8.]])
print my_features

[[1. 2.]
 [3. 4.]
 [5. 6.]
 [7. 8.]]


In [47]:
my_weights = np.array([0.4, 0.5])
print my_weights

[0.4 0.5]


In [48]:
my_predictions = np.dot(my_features, my_weights) # note that the weights are on the right
print my_predictions # which has 4 elements since my_features has 4 rows

[1.4 3.2 5.  6.8]


Similarly if you have a vector with the same number of elements as the matrix has *rows* you can left multiply them.

In [49]:
my_matrix = my_features
my_array = np.array([0.3, 0.4, 0.5, 0.6])

In [50]:
print np.dot(my_array, my_matrix) # which has 2 elements because my_matrix has 2 columns

[ 8.2 10. ]


#Multiplying Matrices

If we have two 2D arrays (matrices) matrix_1 and matrix_2 where the number of columns of matrix_1 is the same as the number of rows of matrix_2 then we can use np.dot() to perform matrix multiplication.

In [51]:
matrix_1 = np.array([[1., 2., 3.],[4., 5., 6.]])
print matrix_1

[[1. 2. 3.]
 [4. 5. 6.]]


In [52]:
matrix_2 = np.array([[1., 2.], [3., 4.], [5., 6.]])
print matrix_2

[[1. 2.]
 [3. 4.]
 [5. 6.]]


In [53]:
print np.dot(matrix_1, matrix_2)

[[22. 28.]
 [49. 64.]]
