<title>pandas Data Structures</title>

<h1>pandas Data Structures</h1>

In [1]:
import numpy as np
import pandas as pd

<h2>NumPy array creation via numpy.array</h2>

In [2]:
ar1 = np.array([0,1,2,3]) # 1 dimensional array

In [3]:
ar2 = np.array([[0,3,5],[2,8,7]]) # 2D array

In [4]:
ar1

array([0, 1, 2, 3])

In [5]:
ar2

array([[0, 3, 5],
       [2, 8, 7]])

<b>Shape of the array</b>

In [6]:
ar2.shape

(2, 3)

<b>Number of dimensions</b>

In [7]:
ar2.ndim

2

<h2>NumPy array creation via numpy.arange</h2>

In [8]:
ar3 = np.arange(12); ar3

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [9]:
# start, end (exclusive)
ar4=np.arange(3,10,3);ar4

array([3, 6, 9])

<h2>NumPy array creation via numpy.linspace</h2>

In [10]:
# args - start element,end element, number of elements
ar5=np.linspace(0,2.0/3,4); ar5

array([0.        , 0.22222222, 0.44444444, 0.66666667])

<h2>NumPy array via various other functions</h2>

<h3>numpy.ones</h3>

In [13]:
# Produces 2x3x2 array of 1's.
ar7=np.ones((2,3,2)); ar7

array([[[1., 1.],
        [1., 1.],
        [1., 1.]],

       [[1., 1.],
        [1., 1.],
        [1., 1.]]])

<h3>numpy.zeros</h3>

In [14]:
# Produce 4x2 array of zeros.
ar8=np.zeros((4,2));ar8

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

<h3>numpy.eye</h3>

In [15]:
# Produces identity matrix
ar9 = np.eye(3);ar9

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [16]:
f_ar = np.array([3,-2,8.18])
f_ar

array([ 3.  , -2.  ,  8.18])

<h3>numpy.diag</h3>

In [17]:
# Create diagonal array
ar10=np.diag((2,1,4,6));ar10

array([[2, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 4, 0],
       [0, 0, 0, 6]])

<h3>numpy.random.rand</h3>

In [18]:
# Using the rand, randn functions
# rand(m) produces m random numbers uniformly distributed on (0, 1)
#np.random.seed(100) # Set seed
ar11=np.random.rand(5); ar11

array([0.57245061, 0.5190307 , 0.14068758, 0.50650863, 0.9959748 ])

In [19]:
np.random.rand(3,3)

array([[0.15490285, 0.08403214, 0.43326617],
       [0.72748651, 0.47427596, 0.91967169],
       [0.12560968, 0.17572647, 0.70605433]])

In [20]:
# randn(m) produces m normally distributed (Gaussian) random numbers
ar12=np.random.randn(5); ar12

array([ 1.37456982, -1.60707614,  1.60905663,  1.83598424,  2.24718054])

In [21]:
np.random.randn(5,2)

array([[-0.28236729, -1.5592708 ],
       [ 0.48075434,  0.68626624],
       [-1.19854675,  1.80997595],
       [-0.0075812 , -0.62141469],
       [-0.28071463,  0.92646361]])

<h3>numpy.empty</h3>

Using np.empty to create an uninitialized array is a cheaper and faster way to
allocate an array, rather than using np.ones or np.zeros (malloc versus. cmalloc).
However, you should only use it if you're sure that all the elements will be
initialized later:

In [22]:
ar13=np.empty((3,2)); ar13

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

<h3>numpy.tile</h3>

In [23]:
np.array([[1,2],[6,7]])

array([[1, 2],
       [6, 7]])

In [24]:
np.tile(np.array([[1,2],[6,7]]),3)

array([[1, 2, 1, 2, 1, 2],
       [6, 7, 6, 7, 6, 7]])

In [25]:
np.tile(np.array([[1,2],[6,7]]),(2,3))

array([[1, 2, 1, 2, 1, 2],
       [6, 7, 6, 7, 6, 7],
       [1, 2, 1, 2, 1, 2],
       [6, 7, 6, 7, 6, 7]])

<h2>NumPy datatypes</h2>

In [26]:
ar=np.array([2,-1,6,3],dtype='float'); ar

array([ 2., -1.,  6.,  3.])

In [27]:
ar.dtype

dtype('float64')

In [28]:
ar=np.array([2,4,6,8]); ar.dtype

dtype('int32')

In [29]:
ar=np.array([2.,4,6,8]); ar.dtype

dtype('float64')

In [30]:
sar=np.array(['Goodbye','Welcome','Tata','Goodnight']); sar.dtype

dtype('<U9')

In [31]:
bar=np.array([True, False, True]); bar.dtype

dtype('bool')

In [32]:
f_ar = np.array([3,-2,8.18])
f_ar

array([ 3.  , -2.  ,  8.18])

In [33]:
f_ar.astype(int)

array([ 3, -2,  8])

<h2>NumPy indexing and slicing</h2>

In [34]:
# print entire array, element 0, element 1, last element.
ar = np.arange(5); print(ar); ar[0], ar[1], ar[-1]

[0 1 2 3 4]


(0, 1, 4)

In [35]:
# 2nd, last and 1st elements
ar=np.arange(5); ar[1], ar[-1], ar[0]

(1, 4, 0)

In [36]:
# Reverse array using ::-1 idiom
ar=np.arange(5); ar[::-1]

array([4, 3, 2, 1, 0])

In [37]:
# Index multi-dimensional array

In [38]:
ar = np.array([[2,3,4],[9,8,7],[11,12,13]]); ar

array([[ 2,  3,  4],
       [ 9,  8,  7],
       [11, 12, 13]])

In [39]:
ar[1,1]

8

In [40]:
ar[1,1]=5; ar

array([[ 2,  3,  4],
       [ 9,  5,  7],
       [11, 12, 13]])

In [41]:
ar[2]

array([11, 12, 13])

In [42]:
ar[2,:]

array([11, 12, 13])

In [43]:
ar[:,1]

array([ 3,  5, 12])

In [44]:
ar = np.array([0,1,2])

In [45]:
ar[5]

IndexError: index 5 is out of bounds for axis 0 with size 3

<h2>Array slicing</h2>

In [46]:
ar=2*np.arange(6); ar

array([ 0,  2,  4,  6,  8, 10])

In [47]:
ar[1:5:2]

array([2, 6])

In [48]:
ar[1:6:2]

array([ 2,  6, 10])

In [49]:
ar[:4]

array([0, 2, 4, 6])

In [50]:
ar[4:]

array([ 8, 10])

In [51]:
ar[::3]

array([0, 6])

In [52]:
ar

array([ 0,  2,  4,  6,  8, 10])

In [53]:
ar[:3]=1;ar

array([ 1,  1,  1,  6,  8, 10])

In [54]:
ar[2:]=np.ones(4);ar

array([1, 1, 1, 1, 1, 1])

<h2>Array masking</h2>

In [55]:
np.random.seed(10)
ar=np.random.randint(0,25,10); ar

array([ 9,  4, 15,  0, 17, 16, 17,  8,  9,  0])

In [56]:
evenMask=(ar % 2==0); evenMask

array([False,  True, False,  True, False,  True, False,  True, False,
        True])

In [57]:
evenNums=ar[evenMask]; evenNums

array([ 4,  0, 16,  8,  0])

In [58]:
ar[(ar%2==0)]

array([ 4,  0, 16,  8,  0])

In [59]:
ar=np.array(['Hungary','Nigeria',
'Guatemala','','Poland','','Japan']); ar

array(['Hungary', 'Nigeria', 'Guatemala', '', 'Poland', '', 'Japan'],
      dtype='<U9')

In [60]:
ar[ar=='']='USA'; ar

array(['Hungary', 'Nigeria', 'Guatemala', 'USA', 'Poland', 'USA', 'Japan'],
      dtype='<U9')

In [61]:
ar=11*np.arange(0,10); ar

array([ 0, 11, 22, 33, 44, 55, 66, 77, 88, 99])

In [62]:
ar[[1,3,4,2,7]]

array([11, 33, 44, 22, 77])

In [63]:
ar[1,3,4,2,7]

IndexError: too many indices for array

In [64]:
ar[[1,3]]=50; ar

array([ 0, 50, 22, 50, 44, 55, 66, 77, 88, 99])

<h2>Complex indexing</h2>

In [65]:
ar=np.arange(15); ar

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [66]:
ar2=np.arange(0,-10,-1); ar2

array([ 0, -1, -2, -3, -4, -5, -6, -7, -8, -9])

In [67]:
ar2[::-1]

array([-9, -8, -7, -6, -5, -4, -3, -2, -1,  0])

In [68]:
np.arange(-10,0)

array([-10,  -9,  -8,  -7,  -6,  -5,  -4,  -3,  -2,  -1])

In [69]:
ar[:10]=ar2; ar

array([ 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, 10, 11, 12, 13, 14])

<h2>Copies and views</h2>

<b>Modifying view modifies original array</b>

In [70]:
ar1=np.arange(12); ar1

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [71]:
ar2=ar1[::2]; ar2

array([ 0,  2,  4,  6,  8, 10])

In [72]:
ar2[1]=-1; ar1

array([ 0,  1, -1,  3,  4,  5,  6,  7,  8,  9, 10, 11])

<b>Use np.copy to force a copy</b>

In [73]:
ar=np.arange(8);ar

array([0, 1, 2, 3, 4, 5, 6, 7])

In [74]:
arc=ar[:3].copy(); arc

array([0, 1, 2])

In [75]:
arc[0]=-1; arc

array([-1,  1,  2])

In [76]:
ar

array([0, 1, 2, 3, 4, 5, 6, 7])

In [144]:
del arc

<h1>Operations</h1>
<h2>Basic Operations</h2>

<b>Element-wise</b>

In [77]:
ar=np.arange(0,7)*5; ar

array([ 0,  5, 10, 15, 20, 25, 30])

In [78]:
ar=np.arange(5) ** 4 ; ar

array([  0,   1,  16,  81, 256], dtype=int32)

In [79]:
ar ** 0.5

array([ 0.,  1.,  4.,  9., 16.])

In [80]:
ar=3+np.arange(0, 30,3); ar

array([ 3,  6,  9, 12, 15, 18, 21, 24, 27, 30])

In [81]:
ar2=np.arange(1,11); ar2

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [82]:
ar-ar2

array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20])

In [83]:
ar/ar2

array([3., 3., 3., 3., 3., 3., 3., 3., 3., 3.])

In [84]:
ar*ar2

array([  3,  12,  27,  48,  75, 108, 147, 192, 243, 300])

<b>NumPy faster for this than Python</b>

In [85]:
ar=np.arange(1000)
%timeit ar**3

6.78 µs ± 307 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [86]:
ar=range(1000)
%timeit [ar[i]**3 for i in ar]

1.21 ms ± 69.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


<b>Array multiplication is element wise</b>

In [87]:
ar=np.array([[1,1],[1,1]]); ar

array([[1, 1],
       [1, 1]])

In [88]:
ar2=np.array([[2,2],[2,2]]); ar2

array([[2, 2],
       [2, 2]])

In [89]:
ar.dot(ar2)

array([[4, 4],
       [4, 4]])

In [90]:
ar*ar2

array([[2, 2],
       [2, 2]])

<b>Comparison and logical operations are also elememt-wise</b>

In [91]:
ar=np.arange(1,5); ar

array([1, 2, 3, 4])

In [92]:
ar2=np.arange(5,1,-1);ar2

array([5, 4, 3, 2])

In [93]:
ar < ar2

array([ True,  True, False, False])

In [94]:
ar[ar < ar2]

array([1, 2])

In [95]:
l1 = np.array([True,False,True,False])
l2 = np.array([False,False,True, False])
np.logical_and(l1,l2)

array([False, False,  True, False])

In [96]:
l1 and l2

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [97]:
[True,False,True,False] and [False,False,True, False]

[False, False, True, False]

<b>Other operations are also element-wise</b>

In [98]:
ar=np.array([np.pi, np.pi/2]); np.sin(ar)

array([1.2246468e-16, 1.0000000e+00])

<b>For element-wise operations, the 2 arrays must be the same shape else an error results</b>

In [99]:
ar=np.arange(0,6); ar

array([0, 1, 2, 3, 4, 5])

In [100]:
ar2=np.arange(0,8); ar2

array([0, 1, 2, 3, 4, 5, 6, 7])

In [101]:
ar*ar2

ValueError: operands could not be broadcast together with shapes (6,) (8,) 

<b>NumPy arrays can be transposed</b>

In [102]:
ar=np.array([[1,2,3],[4,5,6]]); ar

array([[1, 2, 3],
       [4, 5, 6]])

In [103]:
ar.T

array([[1, 4],
       [2, 5],
       [3, 6]])

In [104]:
np.transpose(ar)

array([[1, 4],
       [2, 5],
       [3, 6]])

<b>Compare arrays not element-wise but array-wise</b>

In [105]:
ar=np.arange(0,6)
ar2=np.array([0,1,2,3,4,5])
np.array_equal(ar, ar2)

True

In [106]:
np.all(ar==ar2)

True

<h1>Reduction Operations</h1>

In [107]:
ar=np.arange(1,5)
ar.prod()

24

In [108]:
ar=np.array([np.arange(1,6),np.arange(1,6)]);ar

array([[1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5]])

In [109]:
#Columns
np.prod(ar,axis=0)

array([ 1,  4,  9, 16, 25])

In [110]:
#Rows
np.prod(ar,axis=1)

array([120, 120])

In [111]:
ar=np.array([[2,3,4],[5,6,7],[8,9,10]]); ar.sum()

54

In [112]:
ar.mean()

6.0

In [113]:
np.median(ar)

6.0

<h1>Statistical operators</h1>

In [114]:
np.random.seed(10)
ar=np.random.randint(0,10, size=(4,5));ar

array([[9, 4, 0, 1, 9],
       [0, 1, 8, 9, 0],
       [8, 6, 4, 3, 0],
       [4, 6, 8, 1, 8]])

In [115]:
ar.mean()

4.45

In [116]:
ar.mean(axis=0)

array([5.25, 4.25, 5.  , 3.5 , 4.25])

In [117]:
ar.std()

3.427462618322773

In [118]:
ar.var(axis=0) #across rows

array([12.6875,  4.1875, 11.    , 10.75  , 18.1875])

In [119]:
ar.cumsum()

array([ 9, 13, 13, 14, 23, 23, 24, 32, 41, 41, 49, 55, 59, 62, 62, 66, 72,
       80, 81, 89], dtype=int32)

In [120]:
ar.cumsum(axis=0)

array([[ 9,  4,  0,  1,  9],
       [ 9,  5,  8, 10,  9],
       [17, 11, 12, 13,  9],
       [21, 17, 20, 14, 17]], dtype=int32)

<h1>Logical operators</h1>

In [121]:
np.random.seed(100)
ar=np.random.randint(1,10, size=(4,4));ar

array([[9, 9, 4, 8],
       [8, 1, 5, 3],
       [6, 3, 3, 3],
       [2, 1, 9, 5]])

In [122]:
(ar%7)==0

array([[False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False]])

In [123]:
np.any((ar%7)==0)

False

In [124]:
np.all(ar<11)

True

<h1>Broadcasting</h1>

In [125]:
ar=np.ones([3,2]); ar

array([[1., 1.],
       [1., 1.],
       [1., 1.]])

In [126]:
ar2=np.array([2,3]); ar2

array([2, 3])

In [127]:
ar+ar2

array([[3., 4.],
       [3., 4.],
       [3., 4.]])

<b>Broadcasting works across dimensions</b>

In [128]:
ar=np.array([[23,24,25]]); ar

array([[23, 24, 25]])

In [129]:
ar.T

array([[23],
       [24],
       [25]])

In [130]:
ar.T+ar

array([[46, 47, 48],
       [47, 48, 49],
       [48, 49, 50]])

In [131]:
ar = np.array([[1,2,3]]); ar

array([[1, 2, 3]])

In [132]:
ar.T * ar

array([[1, 2, 3],
       [2, 4, 6],
       [3, 6, 9]])

<h1>Array shape manipulation<h1>
<h2>Flattening a multi-dimensional array<h2>

In [2]:
ar=np.array([np.arange(1,6), np.arange(10,15)]); ar

array([[ 1,  2,  3,  4,  5],
       [10, 11, 12, 13, 14]])

In [3]:
ar.ravel()

array([ 1,  2,  3,  4,  5, 10, 11, 12, 13, 14])

In [4]:
ar.T.ravel()

array([ 1, 10,  2, 11,  3, 12,  4, 13,  5, 14])

In [5]:
ar.ravel(order='C') # row-wise - last index changing fastest - C-style

array([ 1,  2,  3,  4,  5, 10, 11, 12, 13, 14])

In [6]:
ar.ravel('C')  # it's the first argument

array([ 1,  2,  3,  4,  5, 10, 11, 12, 13, 14])

In [7]:
ar.ravel('F')  # column-wise - first index changing fastest - Fortran style

array([ 1, 10,  2, 11,  3, 12,  4, 13,  5, 14])

<h2>Reshaping<h2>

In [136]:
ar=np.arange(1,16);ar

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

In [137]:
ar.reshape(3,5)

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15]])

In [138]:
ar

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

<h2>Resizing</h2>

In [8]:
ar=np.arange(5);ar

array([0, 1, 2, 3, 4])

In [9]:
ar.resize((8,));ar

ValueError: cannot resize an array that references or is referenced
by another array in this way.
Use the np.resize function or refcheck=False

<b>resize() only works if no otehr references to array, else error</b>

In [10]:
ar2=ar

In [11]:
ar.resize((9,));

ValueError: cannot resize an array that references or is referenced
by another array in this way.
Use the np.resize function or refcheck=False

<b>Workaround is to use numpy.ndarray.resize() instead</b>

In [12]:
np.resize(ar,(9,))

array([0, 1, 2, 3, 4, 0, 1, 2, 3])

In [13]:
ar

array([0, 1, 2, 3, 4])

<h2>Adding a dimension</h2>

In [151]:
ar=np.array([14,15,16]); ar.shape

(3,)

In [152]:
ar

array([14, 15, 16])

In [153]:
ar=ar[:, np.newaxis]; ar.shape

(3, 1)

In [154]:
ar

array([[14],
       [15],
       [16]])

In [155]:
ar=ar[:, np.newaxis]; ar.shape

(3, 1, 1)

In [156]:
ar

array([[[14]],

       [[15]],

       [[16]]])

In [157]:
ar=np.array([14,15,16]); ar

array([14, 15, 16])

In [158]:
ar[np.newaxis, :]

array([[14, 15, 16]])

In [159]:
ar[np.newaxis, :,np.newaxis]

array([[[14],
        [15],
        [16]]])

<h1>Array sorting</h1>

<b>Along y-axis<b>

In [160]:
ar=np.array([[3,2],[10,-1]])
ar

array([[ 3,  2],
       [10, -1]])

In [174]:
ar.sort(axis=1)
ar

array([[ 2,  3],
       [-1, 10]])

<b>Along x-axis</b>

In [175]:
ar=np.array([[3,2],[10,-1]])
ar

array([[ 3,  2],
       [10, -1]])

In [176]:
ar.sort(axis=0)
ar

array([[ 3, -1],
       [10,  2]])

<h1>Data structures in pandas<h1>

<h1>Series</h1>

<h2>Series creation</h2>

<h2>Using numpy.ndarray</h2>

In [14]:
np.random.seed(100)
ser=pd.Series(np.random.rand(7)); ser

0    0.543405
1    0.278369
2    0.424518
3    0.844776
4    0.004719
5    0.121569
6    0.670749
dtype: float64

In [15]:
ser.index = list('abcdefg'); ser

a    0.543405
b    0.278369
c    0.424518
d    0.844776
e    0.004719
f    0.121569
g    0.670749
dtype: float64

In [16]:
import calendar as cal
months=pd.Series(np.arange(1,6),index=cal.month_name[1:6]);months

January     1
February    2
March       3
April       4
May         5
dtype: int32

In [17]:
months.index

Index(['January', 'February', 'March', 'April', 'May'], dtype='object')

In [18]:
months.values

array([1, 2, 3, 4, 5])

In [19]:
months['March']

3

In [20]:
months[2]

3

In [21]:
months[['March','April']]

March    3
April    4
dtype: int32

In [22]:
months[cal.month_name[1:3]]

January     1
February    2
dtype: int32

<h2>Using Python dictionary</h2>

In [23]:
currDict={'US' : 'dollar', 'UK' : 'pound',
'Germany': 'euro', 'Mexico':'peso',
'Nigeria':'naira',
'China':'yuan', 'Japan':'yen'}
currSeries=pd.Series(currDict); currSeries

US         dollar
UK          pound
Germany      euro
Mexico       peso
Nigeria     naira
China        yuan
Japan         yen
dtype: object

In [26]:
currSeries.name

In [27]:
stockPrices = {'GOOG':1180.97,'FB':62.57,
'TWTR': 64.50, 'AMZN':358.69,
'AAPL':500.6}
stockPriceSeries=pd.Series(stockPrices,
index=['GOOG','FB','YHOO',
'TWTR','AMZN','AAPL'],
name='stockPrices')
stockPriceSeries

GOOG    1180.97
FB        62.57
YHOO        NaN
TWTR      64.50
AMZN     358.69
AAPL     500.60
Name: stockPrices, dtype: float64

In [28]:
stockPriceSeries.name

'stockPrices'

<h2>Using scalar values</h2>

In [29]:
dogSeries=pd.Series('chihuahua', index=['breed','countryOfOrigin', 'name', 'gender'])
dogSeries

breed              chihuahua
countryOfOrigin    chihuahua
name               chihuahua
gender             chihuahua
dtype: object

In [30]:
dogSeries=pd.Series('pekingese'); dogSeries

0    pekingese
dtype: object

In [31]:
type(dogSeries)

pandas.core.series.Series

In [32]:
ss = pd.Series(['a','b','c']); ss

0    a
1    b
2    c
dtype: object

In [33]:
ss = pd.Series(['a','b','c'],index = ['p','q','r']); ss

p    a
q    b
r    c
dtype: object

In [34]:
ss[['p','q']]

p    a
q    b
dtype: object

In [35]:
ss = pd.Series(range(3), index=range(3)); ss

0    0
1    1
2    2
dtype: int64

In [36]:
ss = pd.Series(0, index=range(3)); ss

0    0
1    0
2    0
dtype: int64

In [37]:
ss = pd.Series([0, 1], index=range(3)); ss

ValueError: Length of passed values is 2, index implies 3

<h1>Operations on Series</h1>

<h2>Assignment</h2>

In [38]:
currDict['China']

'yuan'

In [39]:
stockPriceSeries['GOOG']=1200.0
stockPriceSeries

GOOG    1200.00
FB        62.57
YHOO        NaN
TWTR      64.50
AMZN     358.69
AAPL     500.60
Name: stockPrices, dtype: float64

<b>KeyError is raised if you try to retrieve a missing label</b>

In [40]:
stockPriceSeries['MSFT']

KeyError: 'MSFT'

<b>Use get() to avoid this</b>

In [41]:
stockPriceSeries.get('MSFT',np.NaN)

nan

In [42]:
stockPriceSeries.get('MSFT')

<h2>Slicing</h2>

In [43]:
stockPriceSeries[:4]

GOOG    1200.00
FB        62.57
YHOO        NaN
TWTR      64.50
Name: stockPrices, dtype: float64

In [44]:
stockPriceSeries[stockPriceSeries > 100]



GOOG    1200.00
AMZN     358.69
AAPL     500.60
Name: stockPrices, dtype: float64

<h2>Other operations</h2>

In [45]:
np.mean(stockPriceSeries)

437.27200000000005

In [46]:
stockPriceSeries.mean()

437.27200000000005

In [47]:
np.std(stockPriceSeries)

417.4446361087899

In [48]:
stockPriceSeries.std()

466.7172915909588

<b>Element-wise operations</b>

In [49]:
ser

a    0.543405
b    0.278369
c    0.424518
d    0.844776
e    0.004719
f    0.121569
g    0.670749
dtype: float64

In [50]:
ser*ser

a    0.295289
b    0.077490
c    0.180215
d    0.713647
e    0.000022
f    0.014779
g    0.449904
dtype: float64

In [51]:
np.sqrt(ser)

a    0.737160
b    0.527607
c    0.651550
d    0.919117
e    0.068694
f    0.348668
g    0.818993
dtype: float64

<b>Data is automatically aligned on basis of the label</b>

In [52]:
ser[1:]

b    0.278369
c    0.424518
d    0.844776
e    0.004719
f    0.121569
g    0.670749
dtype: float64

In [53]:
ser[1:] + ser[:-2]

a         NaN
b    0.556739
c    0.849035
d    1.689552
e    0.009438
f         NaN
g         NaN
dtype: float64

<h1>DataFrame</h1>


<h1>DataFrame Creation</h1>


<h2>Using dictionaries of Series</h2>

In [54]:
stockSummaries={'AMZN': pd.Series([346.15,0.59,459,0.52,589.8,158.88],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'Beta', 'P/E','Market Cap(B)']),
                'GOOG': pd.Series([1133.43,36.05,335.83,0.87,31.44,380.64],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'Beta','P/E','Market Cap(B)']),
                  'FB': pd.Series([61.48,0.59,2450,104.93,150.92],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'P/E', 'Market Cap(B)']),
                'YHOO': pd.Series([34.90,1.27,1010,27.48,0.66,35.36],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'P/E','Beta', 'Market Cap(B)']),
               'TWTR':pd.Series([65.25,-0.3,555.2,36.23],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'Market Cap(B)']),
               'AAPL':pd.Series([501.53,40.32,892.45,12.44,447.59,0.84],
                                  index=['Closing price','EPS','Shares Outstanding(M)','P/E',
                                         'Market Cap(B)','Beta'])}

In [55]:
stockDF=pd.DataFrame(stockSummaries); stockDF


Unnamed: 0,AMZN,GOOG,FB,YHOO,TWTR,AAPL
Beta,0.52,0.87,,0.66,,0.84
Closing price,346.15,1133.43,61.48,34.9,65.25,501.53
EPS,0.59,36.05,0.59,1.27,-0.3,40.32
Market Cap(B),158.88,380.64,150.92,35.36,36.23,447.59
P/E,589.8,31.44,104.93,27.48,,12.44
Shares Outstanding(M),459.0,335.83,2450.0,1010.0,555.2,892.45


In [56]:
stockDF=pd.DataFrame(stockSummaries,
index=['Closing price','EPS', 'Shares Outstanding(M)', 'P/E', 'Market Cap(B)','Beta']);stockDF

Unnamed: 0,AMZN,GOOG,FB,YHOO,TWTR,AAPL
Closing price,346.15,1133.43,61.48,34.9,65.25,501.53
EPS,0.59,36.05,0.59,1.27,-0.3,40.32
Shares Outstanding(M),459.0,335.83,2450.0,1010.0,555.2,892.45
P/E,589.8,31.44,104.93,27.48,,12.44
Market Cap(B),158.88,380.64,150.92,35.36,36.23,447.59
Beta,0.52,0.87,,0.66,,0.84


In [57]:
stockDF=pd.DataFrame(stockSummaries,
        index=['Closing price','EPS',
               'Shares Outstanding(M)',
               'P/E', 'Market Cap(B)','Beta'],
        columns=['FB','TWTR','SCNW'])
stockDF

Unnamed: 0,FB,TWTR,SCNW
Closing price,61.48,65.25,
EPS,0.59,-0.3,
Shares Outstanding(M),2450.0,555.2,
P/E,104.93,,
Market Cap(B),150.92,36.23,
Beta,,,


In [58]:
stockDF.index

Index(['Closing price', 'EPS', 'Shares Outstanding(M)', 'P/E', 'Market Cap(B)',
       'Beta'],
      dtype='object')

In [59]:
stockDF.columns

Index(['FB', 'TWTR', 'SCNW'], dtype='object')

<h2>Using a dictionary of ndarrays/lists</h2>

In [60]:
algos={'search':['DFS','BFS','Binary Search','Linear','ShortestPath (Djikstra)'],
      'sorting': ['Quicksort','Mergesort', 'Heapsort','Bubble Sort', 'Insertion Sort'],
      'machine learning':['RandomForest','K Nearest Neighbor','Logistic Regression',
                          'K-Means Clustering','Linear Regression']}
algoDF=pd.DataFrame(algos); algoDF

Unnamed: 0,search,sorting,machine learning
0,DFS,Quicksort,RandomForest
1,BFS,Mergesort,K Nearest Neighbor
2,Binary Search,Heapsort,Logistic Regression
3,Linear,Bubble Sort,K-Means Clustering
4,ShortestPath (Djikstra),Insertion Sort,Linear Regression


In [61]:
pd.DataFrame(algos,index=['algo_1','algo_2','algo_3','algo_4','algo_5'])

Unnamed: 0,search,sorting,machine learning
algo_1,DFS,Quicksort,RandomForest
algo_2,BFS,Mergesort,K Nearest Neighbor
algo_3,Binary Search,Heapsort,Logistic Regression
algo_4,Linear,Bubble Sort,K-Means Clustering
algo_5,ShortestPath (Djikstra),Insertion Sort,Linear Regression


<h2>Using a structured array</h2>

In [62]:
memberData = np.zeros((4,),
             dtype=[('Name','a15'),
                    ('Age','i4'),
                   ('Weight','f4')])
memberData

array([(b'', 0, 0.), (b'', 0, 0.), (b'', 0, 0.), (b'', 0, 0.)],
      dtype=[('Name', 'S15'), ('Age', '<i4'), ('Weight', '<f4')])

In [63]:
memberData[:] = [('Sanjeev',37,162.4),
                 ('Yingluck',45,137.8),
                 ('Emeka',28,153.2),
                 ('Amy',67,101.3)]
memberData

array([(b'Sanjeev', 37, 162.4), (b'Yingluck', 45, 137.8),
       (b'Emeka', 28, 153.2), (b'Amy', 67, 101.3)],
      dtype=[('Name', 'S15'), ('Age', '<i4'), ('Weight', '<f4')])

In [64]:
memberDF=pd.DataFrame(memberData)
memberDF

Unnamed: 0,Name,Age,Weight
0,b'Sanjeev',37,162.399994
1,b'Yingluck',45,137.800003
2,b'Emeka',28,153.199997
3,b'Amy',67,101.300003


In [65]:
pd.DataFrame(memberData, index=['a','b','c','d'])

Unnamed: 0,Name,Age,Weight
a,b'Sanjeev',37,162.399994
b,b'Yingluck',45,137.800003
c,b'Emeka',28,153.199997
d,b'Amy',67,101.300003


<h2>Using a Series structure</h2>

In [66]:
currSeries.name='currency'
pd.DataFrame(currSeries)

Unnamed: 0,currency
US,dollar
UK,pound
Germany,euro
Mexico,peso
Nigeria,naira
China,yuan
Japan,yen


<h1>DataFrame Operations</h1>

<h2>Selection</h2>

In [67]:
memberDF['Name']

0     b'Sanjeev'
1    b'Yingluck'
2       b'Emeka'
3         b'Amy'
Name: Name, dtype: object

<h2>Assignment</h2>

In [68]:
memberDF['Height']=60;memberDF

Unnamed: 0,Name,Age,Weight,Height
0,b'Sanjeev',37,162.399994,60
1,b'Yingluck',45,137.800003,60
2,b'Emeka',28,153.199997,60
3,b'Amy',67,101.300003,60


<h2>Deletion</h2>

In [69]:
del memberDF['Height']; memberDF

Unnamed: 0,Name,Age,Weight
0,b'Sanjeev',37,162.399994
1,b'Yingluck',45,137.800003
2,b'Emeka',28,153.199997
3,b'Amy',67,101.300003


In [70]:
memberDF['BloodType']='O'
bloodType=memberDF.pop('BloodType'); bloodType

0    O
1    O
2    O
3    O
Name: BloodType, dtype: object

In [71]:
memberDF

Unnamed: 0,Name,Age,Weight
0,b'Sanjeev',37,162.399994
1,b'Yingluck',45,137.800003
2,b'Emeka',28,153.199997
3,b'Amy',67,101.300003


<h2>Insertion</h2>

In [72]:
memberDF.insert(2,'isSenior',memberDF['Age']>60);
memberDF

Unnamed: 0,Name,Age,isSenior,Weight
0,b'Sanjeev',37,False,162.399994
1,b'Yingluck',45,False,137.800003
2,b'Emeka',28,False,153.199997
3,b'Amy',67,True,101.300003


In [66]:
memberDF.insert(3,'isLight',memberDF['Weight']<150); memberDF

Unnamed: 0,Name,Age,isSenior,isLight,Weight
0,b'Sanjeev',37,False,False,162.399994
1,b'Yingluck',45,False,True,137.800003
2,b'Emeka',28,False,False,153.199997
3,b'Amy',67,True,True,101.300003


<h2>Alignment</h2>

In [73]:
ore1DF=pd.DataFrame(np.array([[20,35,25,20],
                              [11,28,32,29]]),
                    columns=['iron','magnesium',
                             'copper','silver'])
ore1DF

Unnamed: 0,iron,magnesium,copper,silver
0,20,35,25,20
1,11,28,32,29


In [74]:
ore2DF=pd.DataFrame(np.array([[14,34,26,26],
                              [33,19,25,23]]),
                    columns=['iron','magnesium',
                            'gold','silver'])
ore2DF

Unnamed: 0,iron,magnesium,gold,silver
0,14,34,26,26
1,33,19,25,23


In [75]:
ore1DF+ore2DF

Unnamed: 0,copper,gold,iron,magnesium,silver
0,,,34,69,46
1,,,44,47,52


In [76]:
ore1DF + pd.Series([25,25,25,25],
                   index=['iron','magnesium','copper','silver'])

Unnamed: 0,iron,magnesium,copper,silver
0,45,60,50,45
1,36,53,57,54


<h2>Other mathematical operations</h2>

In [77]:
np.sqrt(ore1DF)

Unnamed: 0,iron,magnesium,copper,silver
0,4.472136,5.91608,5.0,4.472136
1,3.316625,5.291503,5.656854,5.385165


<h1>Panel</h1>
<h1>Panel Creation</h1>

<h2>Using 3D NumPy array with axis labels</h2>

In [78]:
stockData=np.array([[[63.03,61.48,75],
                     [62.05,62.75,46],
                     [62.74,62.19,53]],
                   [[411.90, 404.38, 2.9],
                    [405.45, 405.91, 2.6],
                    [403.15, 404.42, 2.4]]])
stockData

array([[[ 63.03,  61.48,  75.  ],
        [ 62.05,  62.75,  46.  ],
        [ 62.74,  62.19,  53.  ]],

       [[411.9 , 404.38,   2.9 ],
        [405.45, 405.91,   2.6 ],
        [403.15, 404.42,   2.4 ]]])

In [71]:
stockHistoricalPrices = pd.Panel(stockData,
                                 items=['FB', 'NFLX'],
                                 major_axis=pd.date_range('2/3/2014',periods=3),
                                 minor_axis=['open price', 'closing price', 'volume'])
stockHistoricalPrices


Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  exec(code_obj, self.user_global_ns, self.user_ns)


<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 3 (major_axis) x 3 (minor_axis)
Items axis: FB to NFLX
Major_axis axis: 2014-02-03 00:00:00 to 2014-02-05 00:00:00
Minor_axis axis: open price to volume

<h2>Using a Python dictionary of DataFrame objects</h2>

In [72]:
USData=pd.DataFrame(np.array([[249.62 , 8900],
                              [282.16,12680],
                              [309.35,14940]]),
                    columns=['Population(M)','GDP($B)'],
                    index=[1990,2000,2010])
USData

Unnamed: 0,Population(M),GDP($B)
1990,249.62,8900.0
2000,282.16,12680.0
2010,309.35,14940.0


In [243]:
ChinaData=pd.DataFrame(np.array([[1133.68, 390.28],
                                 [1266.83,1198.48],
                                 [1339.72, 6988.47]]),
                       columns=['Population(M)','GDP($B)'],
                       index=[1990,2000,2010])
ChinaData

Unnamed: 0,Population(M),GDP($B)
1990,1133.68,390.28
2000,1266.83,1198.48
2010,1339.72,6988.47


In [244]:
US_ChinaData={'US' : USData,
              'China': ChinaData}
pd.Panel(US_ChinaData)

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 3 (major_axis) x 2 (minor_axis)
Items axis: China to US
Major_axis axis: 1990 to 2010
Minor_axis axis: Population(M) to GDP($B)

<h2>Using the DataFrame.to_panel method</h2>

In [245]:
mIdx = pd.MultiIndex(levels=[['US', 'China'],
                             [1990,2000, 2010]],
                     labels=[[1,1,1,0,0,0],[0,1,2,0,1,2]])
mIdx

MultiIndex(levels=[['US', 'China'], [1990, 2000, 2010]],
           labels=[[1, 1, 1, 0, 0, 0], [0, 1, 2, 0, 1, 2]])

In [246]:
ChinaUSDF = pd.DataFrame({'Population(M)' : [1133.68, 1266.83, 1339.72, 
                                                        249.62, 282.16,309.35], 
                                     'GDB($B)': [390.28, 1198.48, 6988.47, 8900,12680,14940]}, 
                          index=mIdx)
ChinaUSDF


Unnamed: 0,Unnamed: 1,GDB($B),Population(M)
China,1990,390.28,1133.68
China,2000,1198.48,1266.83
China,2010,6988.47,1339.72
US,1990,8900.0,249.62
US,2000,12680.0,282.16
US,2010,14940.0,309.35


In [247]:
ChinaUSDF.to_panel()


Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  """Entry point for launching an IPython kernel.


<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 2 (major_axis) x 3 (minor_axis)
Items axis: GDB($B) to Population(M)
Major_axis axis: China to US
Minor_axis axis: 1990 to 2010