In [1]:
%matplotlib inline
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

In [2]:
# Small demo frame: two key columns that contain missing entries, plus two
# columns of standard-normal draws (new values on every run).
df = pd.DataFrame(
    {
        "key1": ["a", "a", None, "b", "b", "a", None],
        # Nullable integer dtype, so the missing entry stays an integer column.
        "key2": pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
        "data1": np.random.standard_normal(size=7),
        "data2": np.random.standard_normal(size=7),
    }
)

df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.141777,-0.295707
1,a,2.0,0.280018,-1.228986
2,,1.0,1.026452,-0.437585
3,b,2.0,0.156217,0.771456
4,b,1.0,-1.272687,-1.070423
5,a,,0.126895,0.817462
6,,1.0,-0.000522,1.893475


In [3]:
# Mean of data1 for each (key1, key2) pair, pivoted so the key2 values become
# columns. Rows with a missing key do not appear in the result.
df['data1'].groupby([df['key1'], df['key2']]).mean().unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.141777,0.280018
b,-1.272687,0.156217


In [4]:
# Number of rows in each (key1, key2) group, laid out as a key1-by-key2 table.
df.groupby(["key1", "key2"]).size().unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,1
b,1,1


In [5]:
# Walk the key1 groups: print each group label, then its rows, wrapped
# between two marker lines.
for name, group in df.groupby("key1"):
    print(name, '—0—0—', group, '—1—1—', sep='\n')

a
—0—0—
  key1  key2     data1     data2
0    a     1 -0.141777 -0.295707
1    a     2  0.280018 -1.228986
5    a  <NA>  0.126895  0.817462
—1—1—
b
—0—0—
  key1  key2     data1     data2
3    b     2  0.156217  0.771456
4    b     1 -1.272687 -1.070423
—1—1—


In [6]:
# Group the COLUMNS by a column-name -> label mapping and count the
# non-missing values under each label for every row.
# The grouping runs on the transposed frame because the `axis` keyword of
# DataFrame.groupby is deprecated; the trailing transpose puts the original
# rows back on the index and the group labels on the columns.
df.T.groupby({"key1": "key", "key2": "key",
              "data1": "data", "data2": "data"}).count().T

  df.groupby({"key1": "key", "key2": "key",


Unnamed: 0,data,key
0,2,2
1,2,2
2,2,1
3,2,2
4,2,2
5,2,1
6,2,1


In [7]:
# 5x5 frame of standard-normal draws with named rows and lettered columns.
people = pd.DataFrame(
    np.random.standard_normal(size=(5, 5)),
    index=["Joe", "Steve", "Wanda", "Jill", "Trey"],
    columns=["a", "b", "c", "d", "e"],
)

# Add a few NA values: blank out columns "b" and "c" of the third row (Wanda).
people.loc["Wanda", ["b", "c"]] = np.nan

people

Unnamed: 0,a,b,c,d,e
Joe,1.924145,-0.481762,-0.537608,0.043393,0.403544
Steve,0.10311,0.329785,-0.416183,1.067073,-1.062327
Wanda,1.674474,,,1.65486,1.2388
Jill,-0.004064,1.044533,0.477863,-0.956031,-1.576923
Trey,1.072698,-0.972453,0.950562,0.264563,-1.378681


In [8]:
# Column letter -> group label; columns that share a label are combined.
mapping = {
    'a': 'peppa', 'b': 'peppa',
    'c': 'pig', 'd': 'pig',
    'e': 'pop',
}

# Transpose so the column letters sit on the index, then group them by label.
by_c = people.transpose().groupby(mapping)

by_c.sum()

Unnamed: 0,Joe,Steve,Wanda,Jill,Trey
peppa,1.442383,0.432895,1.674474,1.040469,0.100245
pig,-0.494215,0.65089,1.65486,-0.478168,1.215125
pop,0.403544,-1.062327,1.2388,-1.576923,-1.378681


In [9]:
# A function passed to groupby is called on each index label; here the rows
# are grouped by the length of the person's name and summed per length.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,1.924145,-0.481762,-0.537608,0.043393,0.403544
4,1.068633,0.07208,1.428425,-0.691468,-2.955604
5,1.777584,0.329785,-0.416183,2.721933,0.176473


In [10]:
# Two-level column index: "cty" on the outer level, "tenor" on the inner one.
columns = pd.MultiIndex.from_tuples(
    [("US", 1), ("US", 3), ("US", 5), ("JP", 1), ("JP", 3)],
    names=["cty", "tenor"],
)

# Four rows of standard-normal values laid out under those five columns.
hier_df = pd.DataFrame(np.random.standard_normal(size=(4, 5)), columns=columns)

hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.372583,0.421875,-0.390346,-1.019517,0.745466
1,-0.01905,-0.31466,-0.748539,-1.938388,-0.939908
2,-0.294525,1.302488,0.514003,0.71484,0.881447
3,-1.909369,0.762093,1.372494,-0.467751,-1.190743


In [11]:
# Transposed view: the (cty, tenor) pairs become a two-level row index.
hier_df.T

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
cty,tenor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
US,1,0.372583,-0.01905,-0.294525,-1.909369
US,3,0.421875,-0.31466,1.302488,0.762093
US,5,-0.390346,-0.748539,0.514003,1.372494
JP,1,-1.019517,-1.938388,0.71484,-0.467751
JP,3,0.745466,-0.939908,0.881447,-1.190743


In [12]:
# Group the transposed rows by the "cty" index level. all() reports, for each
# group and column, whether every value is truthy (for floats: non-zero).
hier_df.T.groupby(level='cty').all()

Unnamed: 0_level_0,0,1,2,3
cty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JP,True,True,True,True
US,True,True,True,True


In [13]:
# Display the current contents of df again for reference.
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.141777,-0.295707
1,a,2.0,0.280018,-1.228986
2,,1.0,1.026452,-0.437585
3,b,2.0,0.156217,0.771456
4,b,1.0,-1.272687,-1.070423
5,a,,0.126895,0.817462
6,,1.0,-0.000522,1.893475


In [14]:
# Location of tips.csv. Set the TIPS_CSV environment variable to point at the
# file on machines that do not share this directory layout; when it is unset
# the original author-local path is used.
tips_path = Path(os.environ.get(
    "TIPS_CSV",
    r'C:\Users\matte\OneDrive\Desktop\GitHub\data\py_4_dataana\tips.csv',
))
tips = pd.read_csv(tips_path)
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [15]:
# Tip as a fraction of the total bill, stored as a new column on tips.
tips["tip_pct"] = tips["tip"] / tips["total_bill"]

In [16]:
# Group the tips rows by day and smoker status.
grouped = tips.groupby(["day", "smoker"])

In [17]:
# Mean and standard deviation of tip_pct per (day, smoker) group. The
# ('average', 'mean') tuple labels the mean column "average"; reset_index
# turns the group keys back into ordinary columns.
grouped['tip_pct'].agg([('average', 'mean'), 'std']).reset_index()

Unnamed: 0,day,smoker,average,std
0,Fri,No,0.15165,0.028123
1,Fri,Yes,0.174783,0.051293
2,Sat,No,0.158048,0.039767
3,Sat,Yes,0.147906,0.061375
4,Sun,No,0.160113,0.042347
5,Sun,Yes,0.18725,0.154134
6,Thur,No,0.160298,0.038774
7,Thur,Yes,0.163863,0.039389


In [18]:
def top(df, n=5, column='tip_pct'):
    """Return the leading ``n`` rows of ``df`` once sorted by ``column``, largest first.

    Parameters
    ----------
    df : DataFrame to rank.
    n : how many rows to keep from the top of the sorted frame (default 5).
    column : name of the column to sort on (default ``'tip_pct'``).
    """
    ranked = df.sort_values(by=column, ascending=False)
    return ranked[:n]

# With the defaults: the five rows of tips with the highest tip_pct.
top(tips)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535


In [19]:
# Run top on each smoker group separately, keeping the five largest
# total_bill rows per group. The keyword arguments after the function are
# forwarded to top.
tips.groupby('smoker').apply(top, n=5, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,59,48.27,6.73,No,Sat,Dinner,4,0.139424
No,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,142,41.19,5.0,No,Thur,Lunch,5,0.121389
No,23,39.42,7.58,No,Sat,Dinner,4,0.192288
Yes,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,102,44.3,2.5,Yes,Sat,Dinner,3,0.056433
Yes,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982
Yes,184,40.55,3.0,Yes,Sun,Dinner,2,0.073983


In [20]:
# 1000 rows with two independently drawn standard-normal columns.
frame = pd.DataFrame(
    {
        "data1": np.random.standard_normal(size=1000),
        "data2": np.random.standard_normal(size=1000),
    }
)
frame.head()

Unnamed: 0,data1,data2
0,0.318356,-0.281773
1,0.150244,1.154319
2,1.18978,0.3663
3,-0.147802,-0.25069
4,0.303794,-1.311797


In [21]:
# Cut data1 into four quantile-based buckets; every row is labelled with the
# interval its data1 value falls into.
quartiles = pd.qcut(frame['data1'], 4)
quartiles.head()

0     (0.0103, 0.699]
1     (0.0103, 0.699]
2      (0.699, 3.092]
3    (-0.699, 0.0103]
4     (0.0103, 0.699]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.138, -0.699] < (-0.699, 0.0103] < (0.0103, 0.699] < (0.699, 3.092]]

In [22]:
# Group the rows of frame by their data1 quartile; observed=True restricts the
# result to categories that actually occur in the data.
# NOTE: this rebinds `grouped`, which earlier held the tips grouping.
grouped = frame.groupby(quartiles, observed=True)

In [23]:
def get_stats(x):
    """Summarise ``x`` by its minimum, maximum, non-null count and mean.

    Each statistic comes from calling the matching method on ``x``; the four
    results become the "min", "max", "count" and "mean" columns of the
    returned DataFrame.
    """
    lowest = x.min()
    highest = x.max()
    n_valid = x.count()
    average = x.mean()
    summary = {"min": lowest, "max": highest, "count": n_valid, "mean": average}
    return pd.DataFrame(summary)

# Build the min/max/count/mean summary separately for every quartile group.
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-3.138, -0.699]",data1,-3.136843,-0.702632,250,-1.316765
"(-3.138, -0.699]",data2,-2.604631,2.115691,250,0.071501
"(-0.699, 0.0103]",data1,-0.697238,0.009538,250,-0.330123
"(-0.699, 0.0103]",data2,-2.738821,2.154957,250,-0.006684
"(0.0103, 0.699]",data1,0.011106,0.698803,250,0.337158
"(0.0103, 0.699]",data2,-2.383047,2.884812,250,-0.059634
"(0.699, 3.092]",data1,0.700584,3.092457,250,1.25826
"(0.699, 3.092]",data2,-2.381668,2.530547,250,0.012023


In [24]:
# Six standard-normal values, with every other entry (positions 0, 2, 4)
# replaced by NaN.
s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan
s

0         NaN
1   -1.024997
2         NaN
3    0.568012
4         NaN
5    0.182681
dtype: float64

In [25]:
# Replace the missing entries with the mean of the non-missing values.
s.fillna(s.mean())

0   -0.091434
1   -1.024997
2   -0.091434
3    0.568012
4   -0.091434
5    0.182681
dtype: float64

In [27]:
# Eight states; the first four are labelled "East" and the last four "West".
states = [
    "Ohio", "New York", "Vermont", "Florida",
    "Oregon", "Nevada", "California", "Idaho",
]

group_key = ["East"] * 4 + ["West"] * 4

# One standard-normal value per state, indexed by state name.
data = pd.Series(np.random.standard_normal(size=8), index=states)

# Mark three of the states as missing.
data.loc[["Vermont", "Nevada", "Idaho"]] = np.nan
data

Ohio         -1.630047
New York      0.713762
Vermont            NaN
Florida       0.466735
Oregon       -0.494354
Nevada             NaN
California    1.629700
Idaho              NaN
dtype: float64

In [30]:
# Mean per region; group_key supplies one region label per entry of data.
data.groupby(group_key).mean()

East   -0.149850
West    0.567673
dtype: float64

In [35]:
def fill_mean(x):
    """Return ``x`` with its missing values replaced by ``x.mean()``."""
    replacement = x.mean()
    return x.fillna(replacement)

# Fill each region's missing values with that region's own mean.
data.groupby(group_key).apply(fill_mean)

East  Ohio         -1.630047
      New York      0.713762
      Vermont      -0.149850
      Florida       0.466735
West  Oregon       -0.494354
      Nevada        0.567673
      California    1.629700
      Idaho         0.567673
dtype: float64

In [38]:
suits = ["H", "S", "C", "D"]  # Hearts, Spades, Clubs, Diamonds
# Values for one suit: 1 through 10, then 10 for each of the last three
# cards; the 13-value run is repeated once per suit.
card_val = ([*range(1, 11)] + 3 * [10]) * 4
base_names = ["A", *range(2, 11), "J", "K", "Q"]
# Card labels are the base name followed by the suit letter, suit by suit.
cards = []
for suit in suits:
    cards += [f"{num}{suit}" for num in base_names]

deck = pd.Series(card_val, index=cards)
deck.head()

AH    1
2H    2
3H    3
4H    4
5H    5
dtype: int64

In [39]:
def draw(deck, n=5):
    """Return ``n`` randomly sampled entries of ``deck`` (5 by default)."""
    hand = deck.sample(n=n)
    return hand

# Draw a five-card hand (draw's default size) from the deck.
draw(deck)

6D      6
AS      1
10C    10
JC     10
KS     10
dtype: int64

In [52]:
def get_suit(card):
    """Return the final character of ``card`` (the suit letter in labels such as "10C")."""
    suit = card[-1]
    return suit

# Group the cards by suit (get_suit is called on each index label) and draw
# two cards from every suit.
deck.groupby(get_suit).apply(draw, n=2)

C  2C      2
   QC     10
D  10D    10
   QD     10
H  8H      8
   9H      9
S  7S      7
   AS      1
dtype: int64

In [55]:
# Two categories of four rows each, with standard-normal data and uniform
# random weights. This rebinds `df` to a new frame.
df = pd.DataFrame(
    {
        "category": ["a"] * 4 + ["b"] * 4,
        "data": np.random.standard_normal(size=8),
        "weights": np.random.uniform(size=8),
    }
)

df

Unnamed: 0,category,data,weights
0,a,1.23657,0.048143
1,a,-0.709285,0.312576
2,a,-2.034179,0.801457
3,a,0.326016,0.195848
4,b,0.45539,0.490896
5,b,0.740763,0.371433
6,b,-0.645262,0.198376
7,b,-0.524921,0.286953


In [56]:
# Group the rows of df by their category label.
grouped = df.groupby("category")

def get_wavg(group):
    """Return the average of ``group['data']`` weighted by ``group['weights']``."""
    values = group['data']
    weights = group['weights']
    return np.average(values, weights=weights)

# Weighted average of data for each category.
grouped.apply(get_wavg)

category
a   -1.272903
b    0.163291
dtype: float64

In [57]:
# Twelve rows cycling through the keys a, b, c with the values 0.0 to 11.0.
df = pd.DataFrame(
    {
        'key': ['a', 'b', 'c'] * 4,
        'value': np.arange(12, dtype=float),
    }
)

In [58]:
# Display the key/value frame.
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [59]:
# Select the "value" column within each key group, then take the per-key mean.
g = df.groupby('key')['value']
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [64]:
def get_mean(group):
    """Return ``group.mean()``."""
    average = group.mean()
    return average

# Broadcast each group's mean back onto every row of that group; the result
# has one entry per original row.
g.transform(get_mean)

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [70]:
def normalize(x):
    """Standardise ``x``: subtract its mean, then divide by its standard deviation."""
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

# Rebuild the grouped "value" selection and standardise every value within
# its own key group.
g = df.groupby('key')['value']
g.transform(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64