In [1]:
import numpy as np
import pandas as pd

# Using Numpy

## Basic Operations

NumPy is a Python library that is used to handle linear algebra operations. It does a couple amazing things under the hood that make certain operations lightning fast, and makes large scale data processing possible (like Pandas).

NumPy holds data in **arrays**.

In [98]:
v = np.array([1, 2, 3, 4])
v

array([1, 2, 3, 4])

In [97]:
for x in np.nditer(v):
    print(x)

1
2
3
4


In [40]:
A = (10 * np.random.rand(4, 6)).astype(int)
A

array([[1, 6, 6, 9, 4, 0],
       [9, 8, 6, 4, 8, 9],
       [7, 2, 3, 5, 5, 9],
       [6, 9, 3, 3, 8, 5]])

In [41]:
A[3, 3] # element selection

3

In [42]:
A[1, :] # second row of the matrix

array([9, 8, 6, 4, 8, 9])

In [43]:
A[:, 2] # third column of the matrix

array([6, 6, 3, 3])

In [44]:
A.shape

(4, 6)

In [46]:
A.reshape((8, 3)) # will reshape and fill it in by rows

array([[1, 6, 6],
       [9, 4, 0],
       [9, 8, 6],
       [4, 8, 9],
       [7, 2, 3],
       [5, 5, 9],
       [6, 9, 3],
       [3, 8, 5]])

In [47]:
A.reshape((2, 12))

array([[1, 6, 6, 9, 4, 0, 9, 8, 6, 4, 8, 9],
       [7, 2, 3, 5, 5, 9, 6, 9, 3, 3, 8, 5]])

## Broadcasting

The most important thing NumPy does is **broadcasting**, which means that it allows for arithmetic operations on arrays of different shapes.

In [3]:
# See https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html

a = np.array([1.0, 2.0, 3.0])
b = 2.0
a * b

array([ 2.,  4.,  6.])

In [4]:
a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 2.0, 2.0])
a * b

array([ 2.,  4.,  6.])

The rule of thumb is that NumPy does arithmetic operations pairwise, but if a certain dimension is 1, then it will **broadcast** that effect across the dimension.

In [5]:
a = np.array([1.0, 2.0, 3.0])

B = np.zeros((3, 3)) # means a 3x3 matrix of all zeros

a + B

array([[ 1.,  2.,  3.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

In [6]:
a = np.array([[1.0], [2.0], [3.0]])

B = np.zeros((3, 3))

a + B

array([[ 1.,  1.,  1.],
       [ 2.,  2.,  2.],
       [ 3.,  3.,  3.]])

In [7]:
1 + np.zeros((3, 3))

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [73]:
x = np.array([1, 2])
y = np.array([[3], [4]])

In [74]:
x

array([1, 2])

In [75]:
y

array([[3],
       [4]])

In [None]:
x + y # what will this output?

# Linear Algebra

NumPy also supports many linear algebra operations.

In [54]:
A = 10 * np.random.rand(3, 3)
A = A.astype(int)
A

array([[6, 5, 0],
       [9, 6, 3],
       [1, 0, 1]])

In [55]:
A.T

array([[6, 9, 1],
       [5, 6, 0],
       [0, 3, 1]])

In [56]:
x = np.ones(3)
x

array([ 1.,  1.,  1.])

In [57]:
A @ x # Matrix vector multiplication

array([ 11.,  18.,   2.])

In [58]:
np.dot(A, x) # equivalent to above

array([ 11.,  18.,   2.])

In [59]:
A * x # does not work as expected! see the broadcasting section

array([[ 6.,  5.,  0.],
       [ 9.,  6.,  3.],
       [ 1.,  0.,  1.]])

In [60]:
def generate_vector_in_subspace(A):
    return np.dot(A, np.random.rand(A.shape[1], 1))

In [61]:
b = generate_vector_in_subspace(A)
b

array([[ 4.67041514],
       [ 6.40992066],
       [ 0.30901937]])

In [62]:
np.linalg.solve(A, b)

array([[ 0.10136301],
       [ 0.81244741],
       [ 0.20765636]])

In [63]:
np.dot(np.linalg.inv(A), b)

array([[ 0.10136301],
       [ 0.81244741],
       [ 0.20765636]])

## Conditions

In [91]:
A = np.arange(1, 10).reshape(3, 3) # arange is similar to range()

In [92]:
cond = (A < 5)
A[cond]

array([1, 2, 3, 4])

In [93]:
# np.random.rand generates a random matrix of some shape
B = np.random.rand(1, 9).reshape(3, 3)
B

array([[ 0.98747927,  0.13205083,  0.28123437],
       [ 0.67669654,  0.03791182,  0.15350346],
       [ 0.61305012,  0.8633316 ,  0.42297049]])

In [94]:
B[cond] # selects the first four elements of the matrix (by row)

array([ 0.98747927,  0.13205083,  0.28123437,  0.67669654])

## Other Operations

In [107]:
a = np.random.rand(100)

In [108]:
a.mean()

0.52310980507263449

In [109]:
a.sum()

52.310980507263451

In [110]:
np.median(a)

0.55726370011931703

# Pandas

Pandas is a commonly used data processing library. 

Data is stored in **DataFrame** objects, which is a collection of **Series** objects, which represent columns.

We'll go over just a couple important functions on DataFrames in Pandas.

In [157]:
titanic_train = pd.read_csv('data/titanic/train.csv')
titanic_test = pd.read_csv('data/titanic/test.csv')

In [158]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [159]:
# Missing values
titanic_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [160]:
titanic_train['Age'] = titanic_train['Age'].fillna(titanic_train['Age'].mean())

In [161]:
titanic_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [162]:
titanic_train['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [163]:
titanic_train['Pclass'].unique()

array([3, 1, 2])

In [164]:
titanic_train['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6])

In [165]:
titanic_train['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'C23 C25 C27',
       'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33', 'F G73', 'E31',
       'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101', 'F E69', 'D47',
       'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4', 'A32', 'B4',
       'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35', 'C87', 'B77',
       'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49', 'D',
       'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66', 'C7',
       'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128', 'D37',
       'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44', 'A34', 'C104', 'C111',
       'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30', 'D20',
       'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22', 'C86',
       'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20', 'D19',
       'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126', 'B71',
       'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63', 'C62 C64', 'E24',

In [166]:
titanic_train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

## One-hot encoding

In [150]:
def one_hot(df, columns):
    for column in columns:
        col_onehot = pd.get_dummies(df[column], prefix=column)
        df.drop(column, axis=1, inplace=True)
        df = df.join(col_onehot)
    return df

titanic_train = one_hot(titanic_train, ['PClass', 'Sex', ])

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_1,Pclass_2,Pclass_3
0,1,0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,0
4,5,0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1
