In [1]:
import numpy as np
import pandas as pd

In [3]:
# Example frame with deliberately missing (NaN) entries; the cells below use it
# to show how the reductions cope with missing data.
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)

In [4]:
# Show the frame; the blank cells in the rendered table are the NaN entries.
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [5]:
# Column-wise totals; NaN entries are skipped rather than propagated.
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [6]:
# Row-wise totals. NaNs inside a row are skipped; min_count=1 keeps the result
# NaN for a row with no valid values at all (row 'c'). Without it, pandas >= 0.22
# returns 0.0 for an all-NaN sum, which would not match the output shown here.
df.sum(axis=1, min_count=1)

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [7]:
# Column-wise mean; NaN entries are left out of both the sum and the count.
df.mean(axis=0)

one    3.083333
two   -2.900000
dtype: float64

In [8]:
# Row-wise mean over the available values; row 'c' has none, so it stays NaN.
df.mean(axis='columns')

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [9]:
# idxmax: for each column, the index label at which the maximum occurs.
df.idxmax(axis=0)

one    b
two    d
dtype: object

In [10]:
# idxmin: for each column, the index label at which the minimum occurs.
df.idxmin(axis=0)

one    d
two    b
dtype: object

In [11]:
# Summary statistics per column; `count` is the number of non-NaN values.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [12]:
# Non-numeric sample data for describe(): the letters a-d repeated four times.
obj = pd.Series(['a', 'b', 'c', 'd'] * 4)

In [13]:
# Show the 16 string values ('a'..'d' repeated four times).
obj

0     a
1     b
2     c
3     d
4     a
5     b
6     c
7     d
8     a
9     b
10    c
11    d
12    a
13    b
14    c
15    d
dtype: object

In [14]:
# For non-numeric data, describe() reports the count, the number of unique
# values, the most frequent value (top) and how often it occurs (freq).
obj.describe()

count     16
unique     4
top        a
freq       4
dtype: object

In [15]:
# Numeric sample data for describe(): the integers 0 through 4.
obj = pd.Series([0, 1, 2, 3, 4])

In [17]:
# For numeric data, describe() reports count, mean, std, min, quartiles and max.
obj.describe()

count    5.000000
mean     2.000000
std      1.581139
min      0.000000
25%      1.000000
50%      2.000000
75%      3.000000
max      4.000000
dtype: float64

### Correlation and Covariance

In [18]:
import pandas_datareader.data as web

In [19]:
# Download the daily price history for each ticker from Yahoo! Finance.
# The date range is pinned to the period shown in the outputs below
# (2009-12-31 .. 2018-01-11). Without explicit dates the range is whatever the
# library defaults to at run time, so the correlation/covariance figures would
# change from one run to the next.
all_data = {ticker: web.get_data_yahoo(ticker, start='2009-12-31', end='2018-01-11')
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [21]:
# One column of adjusted closing prices per ticker.
price = pd.DataFrame(
    {symbol: history['Adj Close'] for symbol, history in all_data.items()}
)

In [22]:
# One column of traded volume per ticker.
volume = pd.DataFrame(
    {symbol: history['Volume'] for symbol, history in all_data.items()}
)

In [23]:
# Day-over-day percentage change of each price series; the first row has no
# predecessor and is therefore NaN.
returns = price.pct_change(periods=1)

In [24]:
# Last five rows of the returns table.
returns.tail(5)

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-05,0.011385,0.014571,0.004886,0.012398
2018-01-08,-0.003714,0.004273,0.006031,0.00102
2018-01-09,-0.000115,-0.000614,0.002202,-0.00068
2018-01-10,-0.00023,-0.003299,0.002136,-0.004534
2018-01-11,0.006311,0.001575,-0.001523,0.001025


In [25]:
# First five rows of the returns table (row one is all NaN: no prior day).
returns.head(5)

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-31,,,,
2010-01-04,0.015565,0.01092,0.011841,0.01542
2010-01-05,0.001729,-0.004404,-0.01208,0.000323
2010-01-06,-0.015906,-0.025209,-0.006496,-0.006137
2010-01-07,-0.001849,-0.023279,-0.003461,-0.0104


In [26]:
# Correlation between the MSFT and IBM daily returns.
returns.MSFT.corr(returns.IBM)

0.46589421333714115

In [28]:
# Covariance between the MSFT and IBM daily returns.
returns.MSFT.cov(returns.IBM)

7.7012387135681504e-05

In [29]:
# Full pairwise correlation matrix of the daily returns (Pearson, the default).
returns.corr(method='pearson')

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.416482,0.354851,0.396547
GOOG,0.416482,1.0,0.380242,0.480274
IBM,0.354851,0.380242,1.0,0.465894
MSFT,0.396547,0.480274,0.465894,1.0


In [30]:
# Full pairwise covariance matrix of the daily returns, one row/column per ticker.
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000253,0.0001,6.7e-05,8.8e-05
GOOG,0.0001,0.000228,6.8e-05,0.000102
IBM,6.7e-05,6.8e-05,0.000139,7.7e-05
MSFT,8.8e-05,0.000102,7.7e-05,0.000196


### Unique Values, Value Counts, and Membership

In [34]:
# Unique values: extract the distinct values of a Series

In [31]:
# Sample Series with every letter appearing twice.
obj = pd.Series(['a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'])

In [32]:
# Distinct values of obj, with duplicates removed.
unique = obj.unique()

In [33]:
# The distinct values come back as a NumPy object array, not a Series.
unique

array(['a', 'b', 'c', 'd'], dtype=object)

In [35]:
# Frequency of each distinct value, most frequent first.
obj.value_counts(sort=True, ascending=False)

a    2
b    2
c    2
d    2
dtype: int64

In [37]:
# Note: an interactive IPython help lookup used to sit here; it is not valid
# Python and does not belong in a finished notebook. Series.value_counts accepts
# normalize, sort, ascending, bins and dropna -- see the pandas API reference.

In [38]:
# Membership: test whether each element is in a given set of values

In [39]:
# Element-wise membership test against the values 'b' and 'c'.
mask = obj.isin(values=['b', 'c'])

In [40]:
# Boolean Series: True at the positions of obj that hold 'b' or 'c'.
mask

0    False
1    False
2     True
3     True
4     True
5     True
6    False
7    False
dtype: bool

In [41]:
# Keep only the elements where the mask is True; original index labels are kept.
obj.loc[mask]

2    b
3    b
4    c
5    c
dtype: object