In [1]:
%matplotlib inline
import numpy as np
import numpy.ma as ma
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import timeit
import line_profiler

import datetime

from io import StringIO
from pandas.api.types import CategoricalDtype

import pandas._testing as tm

## Return a DataFrame parsed from CSV-formatted string data
def g(data):
    """Parse CSV-formatted text into a DataFrame.

    Parameters
    ----------
    data : str
        CSV content held in memory as a string.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` reading ``data`` through a ``StringIO``.
    """
    csv_buffer = StringIO(data)
    parsed = pd.read_csv(csv_buffer)
    return parsed

# 2.16.1 Statistical functions

In [5]:
# Fractional change between consecutive elements of a random Series; the first
# element has no predecessor, so its change is NaN.
ser = pd.Series(data=np.random.randn(8))
ser.pct_change(periods=1)

0         NaN
1   -1.923373
2   -2.931038
3   -0.935503
4   -0.123427
5    7.136023
6   -1.639340
7   -2.450775
dtype: float64

In [7]:
# Column-wise fractional change relative to the row three positions earlier;
# the first three rows therefore come out as NaN.
df = pd.DataFrame(data=np.random.rand(10, 4))
df.pct_change(periods=3)

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,-0.020591,12.475995,0.967325,-0.553468
4,1.522533,-0.759493,4.268283,-0.128637
5,-0.839266,4.969317,-0.877771,-0.342039
6,-0.230832,0.994627,-0.320743,1.306429
7,0.511824,3.978803,2.44223,-0.767423
8,1.038695,14.97271,17.444836,0.235789
9,-0.085083,0.054706,0.009563,-0.911786


In [14]:
# Deterministic example: with a lag of 3, element i is compared with element i-3.
s = pd.Series(range(1, 6))
print(s)
s.pct_change(periods=3)

0    1
1    2
2    3
3    4
4    5
dtype: int64


0    NaN
1    NaN
2    NaN
3    3.0
4    1.5
dtype: float64

In [16]:
# Covariance between 1..10 and the even numbers 2..20.
s1 = pd.Series(range(1, 11))
s2 = pd.Series(range(2, 21, 2))
s1.cov(other=s2)

18.333333333333332

In [18]:
# Pairwise covariance matrix of five random columns.
frame = pd.DataFrame(data=np.random.randn(1000, 5), columns=list("abcde"))
frame.cov()

Unnamed: 0,a,b,c,d,e
a,1.050728,0.02159,-0.033379,0.031233,0.03594
b,0.02159,1.030669,0.02236,-0.035406,0.055048
c,-0.033379,0.02236,1.059442,0.056878,-0.038169
d,0.031233,-0.035406,0.056878,0.933382,0.052404
e,0.03594,0.055048,-0.038169,0.052404,0.895523


In [20]:
# Blank out non-overlapping stretches of "a" and "b" so that only ten rows
# have both values present.
frame = pd.DataFrame(data=np.random.randn(20, 3), columns=list("abc"))
frame.loc[frame.index[:5], "a"] = np.nan
frame.loc[frame.index[5:10], "b"] = np.nan
# Without a threshold every pair with shared observations gets a value ...
frame.cov()
# ... while requiring 12 shared observations leaves the a/b entry as NaN.
frame.cov(min_periods=12)

Unnamed: 0,a,b,c
a,0.969831,,0.318122
b,,0.937038,-0.03723
c,0.318122,-0.03723,0.880216


In [24]:
s1 = pd.Series(range(1, 6))
# Build s2 as float64 from the start: NaN cannot be stored in an int64 Series,
# and relying on the implicit int -> float upcast during assignment is
# deprecated in recent pandas releases.
s2 = pd.Series(range(6, 11), dtype="float64")
# .iloc makes the positional intent of the slice explicit.
s2.iloc[:3] = np.nan
# Only the last two positions have values in both Series, which satisfies
# min_periods=2.
s1.cov(s2, min_periods=2)

0.5

In [26]:
# Same missing-data layout as for the covariance example: "a" and "b" share
# only ten complete rows.
frame = pd.DataFrame(data=np.random.randn(20, 3), columns=list("abc"))
frame.loc[frame.index[:5], "a"] = np.nan
frame.loc[frame.index[5:10], "b"] = np.nan
# Correlation over whatever pairs are available ...
frame.corr()
# ... versus demanding at least 12 paired observations per entry.
frame.corr(min_periods=12)

Unnamed: 0,a,b,c
a,1.0,,-0.268127
b,,1.0,-0.058104
c,-0.268127,-0.058104,1.0


In [27]:
def histogram_intersection(a, b):
    """Return the summed element-wise minimum of two sum-normalised inputs.

    Each input is divided by its own ``.sum()``; the smaller of the two
    resulting values is taken at every position and those minima are added up.
    """
    a_share = np.true_divide(a, a.sum())
    b_share = np.true_divide(b, b.sum())
    overlap = np.minimum(a_share, b_share)
    return overlap.sum()

# Use the custom callable as the pairwise "correlation" measure. `frame` holds
# signed random values, so the sum-normalised columns are not proper
# histograms and results outside [0, 1] (as in the output below) can occur.
frame.corr(method=histogram_intersection)

Unnamed: 0,a,b,c
a,1.0,-0.449048,-6.072194
b,-0.449048,1.0,-0.541214
c,-6.072194,-0.541214,1.0


In [28]:
# Two frames with the same columns; df2 lacks the last row label of df1.
index = list("abcde")
columns = "one two three four".split()
df1 = pd.DataFrame(data=np.random.randn(5, 4), index=index, columns=columns)
df2 = pd.DataFrame(data=np.random.randn(4, 4), index=index[:4], columns=columns)

# Correlate like-named columns of the two frames.
df1.corrwith(df2)

one      0.517229
two     -0.014314
three   -0.362604
four    -0.563880
dtype: float64

In [30]:
s = pd.Series(np.random.randn(5), index=list("abcde"))
s["d"] = s["b"]  # so there's a tie
# Display the raw values (this is what the recorded output of this cell
# shows); the ranking of these values is computed separately.
s

a    1.688946
b    1.489742
c   -0.845329
d    1.489742
e    0.695434
dtype: float64

In [31]:
# Rank the values from smallest to largest; the tied entries "b" and "d" each
# receive the mean of the two rank positions they occupy (3.5 in the output).
s.rank()

a    5.0
b    3.5
c    1.0
d    3.5
e    2.0
dtype: float64

In [36]:
df = pd.DataFrame(data=np.random.randn(10, 6))
# Copy the first five values of column 2 into column 4 to create ties; the
# remaining rows of column 4 become NaN through index alignment.
df[4] = df[2].iloc[:5]
# Rank across the columns of each row, largest value first.
df.rank(axis=1, ascending=False)

Unnamed: 0,0,1,2,3,4,5
0,6.0,1.0,2.5,4.0,2.5,5.0
1,6.0,3.0,1.5,4.0,1.5,5.0
2,5.0,4.0,2.5,1.0,2.5,6.0
3,3.0,1.0,4.5,2.0,4.5,6.0
4,2.0,4.0,5.5,1.0,5.5,3.0
5,3.0,1.0,4.0,5.0,,2.0
6,4.0,1.0,3.0,5.0,,2.0
7,3.0,1.0,4.0,2.0,,5.0
8,5.0,4.0,3.0,2.0,,1.0
9,4.0,2.0,3.0,1.0,,5.0


# Group by

## 2.17.1 Splitting an object into groups

In [8]:
# Example frame: one row per animal (row label), with its taxonomic class,
# order and a maximum speed (missing for "monkey").
df = pd.DataFrame([
    ("bird", "Falconiformes", 389.0),
    ("bird", "Psittaciformes", 24.0),
    ("mammal", "Carnivora", 80.2),
    ("mammal", "Primates", np.nan),
    ("mammal", "Carnivora", 58),
],
   index=["falcon", "parrot", "lion", "monkey", "leopard"],
   columns=("class", "order", "max_speed"), 
)
df


# Three ways of building a GroupBy object. Each assignment rebinds `grouped`,
# so only the last one (grouping on both "class" and "order") is left bound.
# default is axis=0
grouped = df.groupby('class')
# NOTE(review): the `axis` keyword of groupby is deprecated in recent pandas
# releases — confirm this line against the installed version.
grouped = df.groupby('order', axis='columns')
grouped = df.groupby(["class", "order"])

In [9]:
# Two string key columns ("A", "B") plus two columns of random values.
df = pd.DataFrame(
    {
        "A": "foo bar foo bar foo bar foo foo".split(),
        "B": "one one two three two two one three".split(),
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.410294,0.89723
1,bar,one,-1.665443,0.618558
2,foo,two,1.287168,0.623793
3,bar,three,0.572997,-1.969315
4,foo,two,0.505494,2.211682
5,bar,two,0.808721,-0.10076
6,foo,one,1.955959,1.619905
7,foo,three,-1.10208,-2.951034


In [15]:
# Move the key columns into a two-level index, then group on every index
# level name except "B".
df2 = df.set_index(keys=["A", "B"])
grouped = df2.groupby(by=df2.index.names.difference(["B"]))
grouped.sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.283725,-1.451516
foo,2.236245,2.401575


In [19]:
    def get_letter_type(letter):
        """Classify a single letter as ``'vowel'`` or ``'consonant'``.

        The comparison is case-insensitive. Only the exact one-character
        strings a, e, i, o and u count as vowels; every other input
        (including the empty string and multi-character strings) is
        reported as ``'consonant'``.
        """
        # A tuple of individual characters is used instead of the string
        # 'aeiou': `x in 'aeiou'` is a substring test, which would wrongly
        # accept '' and fragments such as 'ei'.
        vowels = ('a', 'e', 'i', 'o', 'u')
        if letter.lower() in vowels:
            return 'vowel'
        return 'consonant'
    # Group along axis=1: get_letter_type is called with each column label and
    # its return value becomes the group key; then show every group's values.
    df.groupby(get_letter_type, axis=1).apply(lambda x: x.values)

consonant    [[one, -0.4102942797204193, 0.8972297170043543...
vowel        [[foo], [bar], [foo], [bar], [foo], [bar], [fo...
dtype: object

In [24]:
# A Series whose index contains each label twice; grouping on index level 0
# collects the two entries that share a label.
lst = [1, 2, 3, 1, 2, 3]
s = pd.Series(data=[1, 2, 3, 10, 20, 30], index=lst)
grouped = s.groupby(level=0)
grouped.first()
grouped.last()
grouped.sum()

1    11
2    22
3    33
dtype: int64

In [27]:
df2 = pd.DataFrame(data={"X": list("BBAA"), "Y": [1, 2, 3, 4]})
# sort=False keeps the groups in order of first appearance ...
df2.groupby(by="X", sort=False).sum()
# ... whereas the default orders the result by group key.
df2.groupby(by="X").sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
A,7
B,3


In [30]:
df3 = pd.DataFrame(data={"X": list("ABAB"), "Y": [1, 4, 3, 2]})
# get_group returns the rows belonging to a single key.
df3.groupby(by="X").get_group("A")
df3.groupby(by="X").get_group("B")

Unnamed: 0,X,Y
1,B,4
3,B,2


In [34]:
df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
df_dna = pd.DataFrame(df_list, columns=list("abc"))
df_dna

# By default (dropna=True) rows whose group key is NaN are left out ...
df_dna.groupby("b").sum()
# ... and dropna=False keeps them as a group of their own.
df_dna.groupby("b", dropna=False).sum()

Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5
,1,4


In [41]:
# Mapping from each distinct value in column "A" to the row labels that fall
# into that group.
df.groupby('A').groups

{'bar': [1, 3, 5], 'foo': [0, 2, 4, 6, 7]}

In [42]:
 # Grouping along axis=1 with a function: maps each key returned by
 # get_letter_type to the column labels assigned to it.
 df.groupby(get_letter_type, axis=1).groups

{'consonant': ['B', 'C', 'D'], 'vowel': ['A']}

In [46]:
df
# Group on the combination of both key columns.
grouped = df.groupby(by=["A", "B"])
grouped.groups
# Number of distinct (A, B) groups.
len(grouped)

6

In [55]:
# Inline CSV fixture: ten dated rows of height, weight and gender.
d = """date,height,weight,gender
2000-01-01,42.849980,157.500553,male
2000-01-02,49.607315,177.340407,male
2000-01-03,56.293531,171.524640,male
2000-01-04,48.421077,144.251986,female
2000-01-05,46.556882,152.526206,male
2000-01-06,68.448851,168.272968,female
2000-01-07,70.757698,136.431469,male
2000-01-08,58.909500,176.499753,female
2000-01-09,76.435631,174.094104,female
2000-01-10,45.306120,177.540920,male"""
df = g(d)
# Parse the date strings and use them as the row index.
df["date"] = pd.to_datetime(df["date"])
df = df.set_index("date")
gb = df.groupby("gender")
# `gb.<TAB>` is not Python: it only marks where to press <TAB> in an
# interactive session to list the GroupBy object's attributes. Evaluate `gb`
# itself so the cell runs and displays the object repr recorded as its output.
gb

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1294b5e50>

In [59]:
# Two parallel label lists: each first-level name paired with "one" and "two".
arrays = [
    [name for name in ("bar", "baz", "foo", "qux") for _ in range(2)],
    ["one", "two"] * 4,
]
index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])
s = pd.Series(data=np.random.rand(8), index=index)
s

# Group on an index level of s by its position ...
grouped = s.groupby(level=1)
grouped.sum()

# ... or, since the MultiIndex levels are named, by the level's name.
s.groupby(level="second").sum()


second
one    0.820280
two    1.880629
dtype: float64

In [60]:
# The `level` argument of Series.sum was deprecated and later removed from
# pandas; grouping on the index level and summing gives the same per-level
# totals and works across versions.
s.groupby(level='second').sum()

second
one    0.820280
two    1.880629
dtype: float64