In [20]:
import numpy as np
import pandas as pd

# Record the library versions this notebook was executed with.
# The parenthesised form prints identically on Python 2 and Python 3.
print(np.__version__)
print(pd.__version__)

1.10.4
0.17.1


## other topics

### int and bool mask

* convert an integer index vector into a boolean mask
* convert a boolean mask back into an integer index vector

In [21]:
# A four-row, single-column frame plus the two selector flavours used below:
# an integer index vector and a boolean mask of the same length as the frame.
ix = np.array([1, 2])
mask = np.array([True, False, False, True])
df = pd.DataFrame(np.array([1, 2, 4, 5]))

In [22]:
# Element-wise negation of the boolean mask.
~mask

array([False,  True,  True, False], dtype=bool)

In [23]:
# Index vector -> boolean mask: flags every row whose index label appears in ix.
df.index.isin(ix)

array([False,  True,  True, False], dtype=bool)

In [24]:
# Start from a mask that selects every row of df.
mask = np.ones(df.shape[0], dtype=bool)
mask

array([ True,  True,  True,  True], dtype=bool)

In [25]:
# Switch off positions 0, 2 and 3 so that only position 1 remains selected.
mask[[0,2,3]] = False
mask

array([False,  True, False, False], dtype=bool)

In [26]:
# Keep only the rows where the boolean mask is True.
df.loc[mask]

Unnamed: 0,0
1,2


In [27]:
# Boolean mask -> index vector: the labels of the rows the mask selects.
df.index[mask].tolist()

[1]

### epoch time

In [28]:
from datetime import datetime

In [29]:
# Three naive timestamps on 2015-06-01; the first one carries 912 microseconds.
ts = pd.DataFrame([
    datetime(2015, 6, 1, hour=9, minute=41, second=12, microsecond=912),
    datetime(2015, 6, 1, hour=9, minute=41, second=18),
    datetime(2015, 6, 1, hour=9, minute=42, second=10),
])
ts

Unnamed: 0,0
0,2015-06-01 09:41:12.000912
1,2015-06-01 09:41:18.000000
2,2015-06-01 09:42:10.000000


In [30]:
# Casting the datetime column to int64 exposes the raw epoch count; floor-dividing
# by 10**9 turns nanoseconds into whole seconds (the sub-second part is dropped).
# The naive timestamps are counted as if they were UTC.
# NOTE(review): relies on nanosecond-resolution datetime storage -- confirm on newer pandas.
ts.astype(np.int64) // 10**9

Unnamed: 0,0
0,1433151672
1,1433151678
2,1433151730


### hash two columns

In [31]:
# Sample rows identified by a two-column composite key; rows 1 and 3 share a key.
multi_key = pd.DataFrame(dict(
    member_id=[4, 5, 4, 5],
    organization_id=[3, 3, 4, 3],
))

def create_hash_on(df, pk_columns):
    """Build a row hasher for the composite key made of ``pk_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column layout defines where the key columns sit.
    pk_columns : list
        Column labels that make up the key, in the order they are hashed.

    Returns
    -------
    callable
        ``hasher(row)`` returning ``hash(tuple(key values))``; intended for
        ``df.apply(hasher, axis=1)``.

    Raises
    ------
    KeyError
        If a label in ``pk_columns`` is not a column of ``df``.

    Notes
    -----
    The built-in ``hash`` of ``str`` values is salted per interpreter process on
    Python 3, so hashes of string keys are only comparable within a single run.
    """
    # Resolve labels to positions once, instead of on every row.
    pk_ix = [df.columns.get_loc(c) for c in pk_columns]

    def hasher(x):
        # The stored values are positions, so they must be looked up positionally.
        # Plain ``x[pk_ix]`` on a Series is label-based and would pick the wrong
        # entries (or fail) when the column labels are themselves integers.
        # Rows passed as bare ndarrays (``apply(..., raw=True)``) have no ``.iloc``
        # and are already positional.
        key_values = x.iloc[pk_ix] if hasattr(x, 'iloc') else x[pk_ix]
        return hash(tuple(key_values))

    return hasher

# Row hasher keyed on the (member_id, organization_id) pair.
f = create_hash_on(multi_key, pk_columns=['member_id', 'organization_id'])

In [32]:
# Evaluate the hasher once per row.
multi_key.apply(f, axis='columns')

0    3713084879524566006
1    3713085962046318431
2    3713084879516988331
3    3713085962046318431
dtype: int64

In [33]:
# get_loc maps a column label to its integer position within the frame's columns.
multi_key.columns.get_loc('member_id')

0

In [34]:
# Store the composite-key hash as a 'pk' column, computed inline per row.
# The key columns are addressed by label rather than by x[0] / x[1]: integer keys
# on a label-indexed row depend on column order and on a positional fallback that
# newer pandas deprecates.
multi_key['pk'] = multi_key.apply(
    lambda row: hash((row['member_id'], row['organization_id'])),
    axis=1,
)
multi_key

Unnamed: 0,member_id,organization_id,pk
0,4,3,3713084879524566006
1,5,3,3713085962046318431
2,4,4,3713084879516988331
3,5,3,3713085962046318431


### conditional update

In [35]:
# Overwrite 'score' with 0 on the rows whose 'offset' equals -1; a single .loc
# call with (row condition, column label) avoids chained indexing.
X = pd.DataFrame(dict(offset=[-1, 0], score=[5, 5]))
X.loc[X['offset'].eq(-1), 'score'] = 0
X

Unnamed: 0,offset,score
0,-1,0
1,0,5


### missing values

In [36]:
# Sample survey rows with one fully-missing record (row 1).
# np.nan is the canonical spelling; the np.NaN alias no longer exists in NumPy 2.x.
nps_data = {'score': [0, np.nan, 2, 10],
            'member_id': [1, np.nan, 4, 6]}

nps = pd.DataFrame(nps_data)
nps

Unnamed: 0,member_id,score
0,1.0,0.0
1,,
2,4.0,2.0
3,6.0,10.0


In [37]:
# Replace missing scores with the sentinel -1; member_id is left untouched.
# Assigning the result back is used instead of fillna(inplace=True) on the selected
# column, which is a chained in-place update and may operate on a temporary copy.
nps['score'] = nps['score'].fillna(-1)
nps

Unnamed: 0,member_id,score
0,1.0,0
1,,-1
2,4.0,2
3,6.0,10


### place pk columns at front

In [38]:
# Small frame with two key columns and one score column.
key_columns = ['member_id', 'org_id']
df = pd.DataFrame(dict(
    member_id=[1, 2, 3],
    org_id=[2, 3, 2],
    score=[5, 6, 7],
))
list(df.columns)

['member_id', 'org_id', 'score']

In [39]:
# Every column that is not part of the key, kept in the frame's own column order.
# (A set difference would return them in arbitrary order.)
score_columns = [col for col in df.columns if col not in key_columns]
score_columns

['score']

In [40]:
# Final column layout: key columns first (in their given order),
# followed by the remaining columns sorted alphabetically.
csv_columns = list(key_columns)
csv_columns.extend(sorted(score_columns))
csv_columns

['member_id', 'org_id', 'score']