In [1]:
%matplotlib inline
import os
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

# use seaborn plotting defaults
import seaborn as sns; sns.set()

In [2]:
# Small demo frame: two key columns that contain missing values (key2 uses the
# nullable "Int64" dtype) and two columns of standard-normal random draws.
df = pd.DataFrame(dict(
    key1=["a", "a", None, "b", "b", "a", None],
    key2=pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
    data1=np.random.standard_normal(size=7),
    data2=np.random.standard_normal(size=7),
))

df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.095877,1.897078
1,a,2.0,0.152642,0.809096
2,,1.0,-1.493412,1.831482
3,b,2.0,-0.883866,0.347696
4,b,1.0,-0.14196,0.090576
5,a,,-0.220403,-0.014901
6,,1.0,1.177311,-1.179757


In [3]:
# Mean of data1 for every (key1, key2) pair, pivoted so that key2 values
# become the columns.
df.groupby(["key1", "key2"])["data1"].mean().unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.095877,0.152642
b,-0.14196,-0.883866


In [4]:
# Number of rows in every (key1, key2) group, with key2 spread across columns.
df.groupby(by=["key1", "key2"]).size().unstack(level="key2")

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,1
b,1,1


In [5]:
# Walk over the key1 groups: print the group label, a marker line, the rows
# belonging to the group, and a closing marker line.
for name, group in df.groupby("key1"):
    print(name, '—0—0—', group, '—1—1—', sep='\n')

a
—0—0—
  key1  key2     data1     data2
0    a     1 -0.095877  1.897078
1    a     2  0.152642  0.809096
5    a  <NA> -0.220403 -0.014901
—1—1—
b
—0—0—
  key1  key2     data1     data2
3    b     2 -0.883866  0.347696
4    b     1 -0.141960  0.090576
—1—1—


In [6]:
# Per-row count of non-null values within the "key" columns and within the
# "data" columns.
#
# `DataFrame.groupby(..., axis="columns")` is deprecated and emits a
# FutureWarning on recent pandas. Grouping the rows of the transposed frame
# with the same column -> group mapping and transposing back yields the same
# table (rows are the original rows, columns are "data" and "key") and matches
# the `people.T.groupby(mapping)` idiom used later in this notebook.
(df.T
   .groupby({"key1": "key", "key2": "key",
             "data1": "data", "data2": "data"})
   .count()
   .T)

  df.groupby({"key1": "key", "key2": "key",


Unnamed: 0,data,key
0,2,2
1,2,2
2,2,1
3,2,2
4,2,2
5,2,1
6,2,1


In [7]:
# 5x5 frame of standard-normal draws, one row per person, columns "a".."e".
people = pd.DataFrame(
    data=np.random.standard_normal(size=(5, 5)),
    index=["Joe", "Steve", "Wanda", "Jill", "Trey"],
    columns=list("abcde"),
)

# Add a few NA values: blank out Wanda's entries in columns "b" and "c".
people.loc["Wanda", ["b", "c"]] = np.nan

people

Unnamed: 0,a,b,c,d,e
Joe,-0.672179,-1.41861,-0.583388,0.360547,-0.381004
Steve,1.539029,-1.161115,-0.65847,-1.627633,-0.237058
Wanda,-2.399011,,,-1.565,-0.791642
Jill,-2.173003,-0.359169,-0.284476,-0.728974,1.822831
Trey,-0.277356,-2.03216,-0.888443,-2.094701,-1.190346


In [12]:
# Column label -> group label.
mapping = dict(a='peppa', b='peppa', c='pig', d='pig', e='pop')

# After transposing, the former column labels are the row index, so the
# mapping can be used directly as the grouping key.
by_c = people.transpose().groupby(by=mapping)

# Sum of the member columns of each group, per person.
by_c.sum()

Unnamed: 0,Joe,Steve,Wanda,Jill,Trey
peppa,-2.090789,0.377914,-2.399011,-2.532172,-2.309516
pig,-0.22284,-2.286102,-1.565,-1.01345,-2.983144
pop,-0.381004,-0.237058,-0.791642,1.822831,-1.190346


In [14]:
# `len` is applied to every index label, so rows are grouped by the length of
# the person's name and summed within each length.
people.groupby(by=len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.672179,-1.41861,-0.583388,0.360547,-0.381004
4,-2.450359,-2.391329,-1.172919,-2.823675,0.632486
5,-0.859982,-1.161115,-0.65847,-3.192633,-1.0287


In [15]:
# Two-level column index: country ("cty") on the outer level, tenor on the
# inner level.
columns = pd.MultiIndex.from_tuples(
    [("US", 1), ("US", 3), ("US", 5), ("JP", 1), ("JP", 3)],
    names=["cty", "tenor"],
)

# 4 rows of standard-normal draws under the hierarchical columns.
hier_df = pd.DataFrame(data=np.random.standard_normal(size=(4, 5)),
                       columns=columns)

hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,1.910974,1.689141,0.336022,0.319427,-0.985295
1,-0.811466,-0.906339,0.330764,0.159338,-1.162582
2,-1.36862,0.622443,2.270966,0.914277,-0.275083
3,0.850189,2.004425,0.237512,-0.201385,0.67131


In [31]:
# Transposed view: the (cty, tenor) column levels become the row index.
hier_df.transpose()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
cty,tenor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
US,1,1.910974,-0.811466,-1.36862,0.850189
US,3,1.689141,-0.906339,0.622443,2.004425
US,5,0.336022,0.330764,2.270966,0.237512
JP,1,0.319427,0.159338,0.914277,-0.201385
JP,3,-0.985295,-1.162582,-0.275083,0.67131


In [34]:
# Group the transposed rows by the "cty" index level and check, per original
# row, whether every value in the group is truthy.
hier_df.transpose().groupby(level="cty").all()

Unnamed: 0_level_0,0,1,2,3
cty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JP,True,True,True,True
US,True,True,True,True


In [35]:
# Display the demo frame `df` again for reference.
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.095877,1.897078
1,a,2.0,0.152642,0.809096
2,,1.0,-1.493412,1.831482
3,b,2.0,-0.883866,0.347696
4,b,1.0,-0.14196,0.090576
5,a,,-0.220403,-0.014901
6,,1.0,1.177311,-1.179757


In [45]:
# Directory that holds the example CSV files. The default is the original
# author's local checkout; set the PY4DA_DATA_DIR environment variable to
# point the notebook at another location instead of editing this cell.
DATA_DIR = Path(os.environ.get(
    "PY4DA_DATA_DIR",
    r'C:\Users\matte\OneDrive\Desktop\GitHub\data\py_4_dataana',
))

# Load the tipping data set and preview the first rows.
tips = pd.read_csv(DATA_DIR / "tips.csv")
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [46]:
# Tip expressed as a fraction of the total bill, stored as a new column.
tips["tip_pct"] = tips["tip"].div(tips["total_bill"])

In [48]:
# Group the tips by weekday and smoker flag; reused by the aggregation below.
grouped = tips.groupby(by=["day", "smoker"])

In [54]:
# Mean (reported as "average") and standard deviation of tip_pct per group,
# with the group keys moved back into ordinary columns.
grouped['tip_pct'].agg(average='mean', std='std').reset_index()

Unnamed: 0,day,smoker,average,std
0,Fri,No,0.15165,0.028123
1,Fri,Yes,0.174783,0.051293
2,Sat,No,0.158048,0.039767
3,Sat,Yes,0.147906,0.061375
4,Sun,No,0.160113,0.042347
5,Sun,Yes,0.18725,0.154134
6,Thur,No,0.160298,0.038774
7,Thur,Yes,0.163863,0.039389


In [55]:
def top(df, n=5, column='tip_pct'):
    """Return the leading rows of ``df`` ranked by ``column``, largest first.

    Parameters
    ----------
    df : DataFrame to rank.
    n : number of rows to keep from the top of the ranking (default 5).
    column : name of the column to sort on (default 'tip_pct').

    Returns
    -------
    The first ``n`` rows of ``df`` after sorting by ``column`` in descending
    order; the original index labels are preserved.
    """
    ranked = df.sort_values(by=column, ascending=False)
    return ranked[:n]

# The five rows of `tips` with the highest tip_pct (defaults spelled out).
top(tips, n=5, column='tip_pct')

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535


In [62]:
# For each smoker group, keep the five rows with the largest total_bill.
tips.groupby('smoker').apply(
    lambda rows: top(rows, n=5, column='total_bill')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,59,48.27,6.73,No,Sat,Dinner,4,0.139424
No,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,142,41.19,5.0,No,Thur,Lunch,5,0.121389
No,23,39.42,7.58,No,Sat,Dinner,4,0.192288
Yes,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,102,44.3,2.5,Yes,Sat,Dinner,3,0.056433
Yes,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982
Yes,184,40.55,3.0,Yes,Sun,Dinner,2,0.073983


In [63]:
# Two independent columns of 1000 standard-normal draws each
# (data1 is drawn first, then data2).
frame = pd.DataFrame({
    label: np.random.standard_normal(size=1000)
    for label in ("data1", "data2")
})
frame.head()

Unnamed: 0,data1,data2
0,-0.751967,-0.151767
1,0.047954,-0.794129
2,0.290496,-0.942657
3,2.075461,-0.772079
4,1.021671,-1.078596


In [65]:
# Assign every data1 value to one of four equal-sized quantile bins.
quartiles = pd.qcut(frame["data1"], q=4)
quartiles.head()

0    (-2.741, -0.692]
1     (0.0104, 0.658]
2     (0.0104, 0.658]
3      (0.658, 2.929]
4      (0.658, 2.929]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-2.741, -0.692] < (-0.692, 0.0104] < (0.0104, 0.658] < (0.658, 2.929]]

In [77]:
# Group `frame` by the data1 quartile bin of each row. This rebinds `grouped`,
# which earlier in the notebook held the tips grouping.
grouped = frame.groupby(by=quartiles, observed=True)

In [84]:
def get_stats(x):
    """Summarise ``x`` with its min, max, count and mean.

    Each statistic is obtained by calling the method of the same name on
    ``x`` and the results are assembled into a DataFrame with the columns
    "min", "max", "count" and "mean" (in that order). When ``x`` is a
    DataFrame every reducer yields one value per column of ``x``, so the
    result has one row per column of ``x``.
    """
    reducers = ("min", "max", "count", "mean")
    return pd.DataFrame({stat: getattr(x, stat)() for stat in reducers})

# Compute the summary table for every quartile group; the result is indexed
# by (quartile bin, column name).
grouped.apply(func=get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-2.741, -0.692]",data1,-2.740314,-0.692272,250,-1.276004
"(-2.741, -0.692]",data2,-3.8083,2.96553,250,-0.125347
"(-0.692, 0.0104]",data1,-0.691927,0.008309,250,-0.332839
"(-0.692, 0.0104]",data2,-3.261547,3.65143,250,0.012843
"(0.0104, 0.658]",data1,0.01248,0.657291,250,0.303118
"(0.0104, 0.658]",data2,-2.534083,3.088962,250,0.049492
"(0.658, 2.929]",data1,0.660508,2.92864,250,1.230363
"(0.658, 2.929]",data2,-2.918878,4.743236,250,0.073401
