### Example data

In [None]:
import pandas as pd

# Example frame that the groupby cells in this notebook operate on.
# NOTE(review): the path is relative to the current user's home directory, and
# unpickling can execute arbitrary code -- only load pickles from a trusted source.
df = pd.read_pickle(
    '~/data2016/oct28/es5858_72.pickle'
)

### What else is out there:
http://nbviewer.jupyter.org/github/rasbt/python_reference/blob/master/tutorials/things_in_pandas.ipynb


# Groupby

#### Groupby single column, return flat structure dataframe, renaming aggregate column

In [2]:
# Grouping key; later cells reuse `col`.
col = 'source_fingerprint'

# size() counts the rows in each group; reset_index(name=...) turns the result
# back into a flat frame whose count column is named "count_<col>".
(
    df.groupby(col)
    .size()
    .reset_index(name="count_" + col)
    .head(2)
)

Unnamed: 0,source_fingerprint,count_source_fingerprint
0,113458435,2012
1,130340976,1


#### Groupby single column, return flat structure dataframe, renaming aggregate column, sorting

In [6]:
# Per-group row counts as a flat frame, ordered from the largest group down.
# The original row labels are kept after sorting, so the displayed index is
# not 0, 1, 2, ...
disp = (
    df.groupby(col)
    .size()
    .reset_index(name="count_" + col)
    .sort_values(by="count_" + col, ascending=False)
)
disp.head(2)

Unnamed: 0,source_fingerprint,count_source_fingerprint
28,3234692311,2884
0,113458435,2012


#### Groupby multiple columns, return flat structure dataframe, renaming aggregate column

In [15]:
# Second grouping key, combined with `col` below.
col2 = 'expert_rule_hash'

# Row count for every (col, col2) pair, flattened so both keys are ordinary
# columns next to a "count" column.
(
    df.groupby([col, col2])
    .size()
    .reset_index(name="count")
    .head(5)
)

Unnamed: 0,source_fingerprint,expert_rule_hash,count
0,113458435,-1363508622,1925
1,113458435,-1228492193,87
2,130340976,-761585423,1
3,151981700,-1854639493,2
4,294460806,-180095330,7


#### Discussion
Simply calling `groupby(col).count()` returns a dataframe indexed by the group keys (a multi-index when grouping on several columns), which is annoying to manipulate in subsequent operations. That call also returns every other field as a column of counts in the resulting frame.

# iloc vs ix vs loc
* loc works on labels in the index.
* iloc works on the positions in the index (so it only takes integers).
* ix usually tries to behave like loc but falls back to behaving like iloc if the label is not in the index.

In general, avoid `ix`: its label-or-position fallback is confusing, and it was deprecated in pandas 0.20 and removed in pandas 1.0.

In [19]:
import numpy as np

In [26]:
# Materialize the range so the cell displays the ten integers themselves on
# Python 3 as well (a bare range object is shown as `range(10, 20)` there).
list(range(10, 20))

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [39]:
# Build a 5x5 demo frame whose index labels (11.0 .. 15.0) differ from the row
# positions (0 .. 4), so label-based and position-based lookups give visibly
# different answers. np.linspace yields floats, hence the float index.
# A private, seeded RandomState keeps the values identical across re-runs
# without touching NumPy's global random state.
id_df = pd.DataFrame(
    np.random.RandomState(0).rand(5, 5),
    index=np.linspace(11, 15, 5),
)
id_df

Unnamed: 0,0,1,2,3,4
11.0,0.202578,0.997552,0.802047,0.329609,0.276571
12.0,0.993908,0.596351,0.170369,0.779017,0.194998
13.0,0.344082,0.614996,0.280879,0.993496,0.451992
14.0,0.295794,0.281893,0.321557,0.403914,0.91544
15.0,0.771991,0.550304,0.267051,0.59913,0.647488


In [40]:
# Position-based lookup: rows at integer positions 0, 2 and 4, all columns.
id_df.iloc[[0, 2, 4], :]

Unnamed: 0,0,1,2,3,4
11.0,0.202578,0.997552,0.802047,0.329609,0.276571
13.0,0.344082,0.614996,0.280879,0.993496,0.451992
15.0,0.771991,0.550304,0.267051,0.59913,0.647488


In [41]:
# Label-based lookup: rows whose index labels are 11, 13 and 15, all columns.
id_df.loc[[11, 13, 15], :]

Unnamed: 0,0,1,2,3,4
11,0.202578,0.997552,0.802047,0.329609,0.276571
13,0.344082,0.614996,0.280879,0.993496,0.451992
15,0.771991,0.550304,0.267051,0.59913,0.647488


In [45]:
# Here `.ix` is given values that exist as index labels, and the recorded
# output matches a label-based lookup of rows 11 and 13.
# NOTE(review): `.ix` was deprecated in pandas 0.20.0 and removed in 1.0.0, so
# this cell only runs on older pandas versions.
id_df.ix[[11, 13]]

Unnamed: 0,0,1,2,3,4
11,0.202578,0.997552,0.802047,0.329609,0.276571
13,0.344082,0.614996,0.280879,0.993496,0.451992


In [52]:
# Label-based slice up to label 2; every index label is 11 or higher, so the
# result has no rows.
id_df.loc[:2, :]

Unnamed: 0,0,1,2,3,4


In [54]:
# With this index the recorded output of `.ix[:2]` is empty, the same as for
# `.loc[:2]` -- the slice bound appears to be treated as a label rather than
# as a position.
# NOTE(review): `.ix` was deprecated in pandas 0.20.0 and removed in 1.0.0, so
# this cell only runs on older pandas versions.
id_df.ix[:2]

Unnamed: 0,0,1,2,3,4


In [55]:
# Position-based on both axes: the first two rows and the first two columns.
id_df.iloc[0:2, 0:2]

Unnamed: 0,0,1
11.0,0.202578,0.997552
12.0,0.993908,0.596351
