# General functions

In [22]:
import pandas as pd
import numpy as np

## pandas.melt

In [3]:
# Wide-format sample for the melt example.
# Each column is written as an {index_label: value} mapping, so the row labels
# 0, 1, 2 are spelled out explicitly.
df = pd.DataFrame(
    {
        "A": {0: "a", 1: "b", 2: "c"},
        "B": {0: 1, 1: 2, 2: 3},
        "C": {0: 2, 1: 4, 2: 6},
    }
)

df

Unnamed: 0,A,B,C
0,a,1,2
1,b,2,4
2,c,3,6


In [4]:
# Unpivot wide -> long: 'A' stays as the identifier column, while columns 'B'
# and 'C' are stacked into (variable, value) pairs.
df_melt = pd.melt(
    df,
    id_vars=["A"],
    value_vars=["B", "C"],
)

In [5]:
# Display the melted frame: one row per ('A', variable) pair, with the former
# column name in `variable` and its cell content in `value`.
df_melt

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,2
2,c,B,3
3,a,C,2
4,b,C,4
5,c,C,6


## pandas.pivot

In [6]:
# Long-format sample for the pivot examples: every ('foo', 'bar') pair occurs
# exactly once, and 'baz' / 'zoo' carry the values to spread out.
df = pd.DataFrame(
    {
        "foo": ["one", "one", "one", "two", "two", "two"],
        "bar": ["A", "B", "C", "A", "B", "C"],
        "baz": [1, 2, 3, 4, 5, 6],
        "zoo": ["x", "y", "z", "q", "w", "t"],
    }
)

In [7]:
# Show the long-format frame that the pivot calls below reshape.
df

Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w
5,two,C,6,t


In [17]:
# Reshape long -> wide: one row per 'foo', one column per 'bar' value.
# With two value columns the result gets an outer column level (baz / zoo).
df.pivot(
    index="foo",
    columns="bar",
    values=["baz", "zoo"],
)

Unnamed: 0_level_0,baz,baz,baz,zoo,zoo,zoo
bar,A,B,C,A,B,C
foo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
one,1,2,3,x,y,z
two,4,5,6,q,w,t


In [18]:
# A list for `columns` nests the column header: zoo -> bar -> baz.
# (foo, bar, baz) combinations that never occur in the data come out as
# missing values.
df.pivot(
    index="foo",
    columns=["bar", "baz"],
    values=["zoo"],
)

Unnamed: 0_level_0,zoo,zoo,zoo,zoo,zoo,zoo
bar,A,B,C,A,B,C
baz,1,2,3,4,5,6
foo,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
one,x,y,z,,,
two,,,,q,w,t


In [19]:
# A list for `index` gives a two-level row index (foo, baz); each row then has
# a single populated 'bar' column and missing values elsewhere.
df.pivot(
    index=["foo", "baz"],
    columns=["bar"],
    values=["zoo"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,zoo,zoo,zoo
Unnamed: 0_level_1,bar,A,B,C
foo,baz,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,x,,
one,2,,y,
one,3,,,z
two,4,q,,
two,5,,w,
two,6,,,t


## pandas.pivot_table

In [25]:
# Sample data for the pivot_table examples: three categorical keys (A, B, C)
# and two numeric measures (D, E). Some (A, B, C) combinations repeat and some
# never occur, so aggregation and missing-value handling both show up.
df = pd.DataFrame(
    {
        "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
        "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
        "C": [
            "small", "large", "large",
            "small", "small", "large",
            "small", "small", "large",
        ],
        "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
        "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
    }
)
df

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [23]:
# Sum column D for every (A, B) row group, split into one column per C value.
# The aggregation is named by its string alias "sum" rather than passed as
# np.sum: recent pandas releases (2.1+) emit a FutureWarning when a NumPy
# reduction is handed to aggfunc and recommend the string form, which yields
# the same table.
table = pd.pivot_table(
    df,
    values="D",
    index=["A", "B"],
    columns=["C"],
    aggfunc="sum",
)

In [24]:
# Display the pivot table; (A, B, C) groups with no rows in the data show up
# as missing values.
table

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,5.0
bar,two,7.0,6.0
foo,one,4.0,1.0
foo,two,,6.0


In [29]:
# Handling NaN values: fill_value=0 replaces the missing entries that would
# otherwise appear for (A, B, C) groups with no rows (e.g. foo / two / large).
# "sum" is used instead of np.sum because recent pandas releases (2.1+) warn
# about NumPy callables in aggfunc; the string alias gives the same result.
table = pd.pivot_table(
    df,
    values=["D", "E"],
    index=["A", "B"],
    columns=["C"],
    aggfunc="sum",
    fill_value=0,
)
table

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,E,E
Unnamed: 0_level_1,C,large,small,large,small
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,4,5,6,8
bar,two,7,6,9,9
foo,one,4,1,9,2
foo,two,0,6,0,11


In [28]:
# Different aggregation per value column: mean of D, sum of E, grouped by
# (A, C). String aliases replace np.mean / np.sum because recent pandas
# releases (2.1+) warn about NumPy callables in aggfunc; the resulting table
# is the same.
table = pd.pivot_table(
    df,
    values=["D", "E"],
    index=["A", "C"],
    aggfunc={"D": "mean", "E": "sum"},
)
table

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E
A,C,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,large,5.5,15
bar,small,5.5,17
foo,large,2.0,9
foo,small,2.333333,13


In [30]:
# Check the result type: pivot_table hands back a regular DataFrame.
type(table)

pandas.core.frame.DataFrame

## pandas.crosstab

In [31]:
# Three parallel object arrays: position i in a, b and c together describes
# one observation.
a = np.array(
    [
        "foo", "foo", "foo", "foo",
        "bar", "bar", "bar", "bar",
        "foo", "foo", "foo",
    ],
    dtype=object,
)
b = np.array(
    [
        "one", "one", "one", "two",
        "one", "one", "one", "two",
        "two", "two", "one",
    ],
    dtype=object,
)
c = np.array(
    [
        "dull", "dull", "shiny", "dull",
        "dull", "shiny", "shiny", "dull",
        "shiny", "shiny", "shiny",
    ],
    dtype=object,
)

# Frequency table: rows keyed by a, columns keyed by the (b, c) pair.
pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])


b,one,one,two,two
c,dull,shiny,dull,shiny
a,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,1,2,1,0
foo,2,2,1,2


In [32]:
# Categoricals that declare one more category ('c', 'f') than they contain.
foo = pd.Categorical(
    ["a", "b"],
    categories=["a", "b", "c"],
)
bar = pd.Categorical(
    ["d", "e"],
    categories=["d", "e", "f"],
)

# With the default arguments, the categories that never occur are left out of
# the resulting table.
pd.crosstab(index=foo, columns=bar)

col_0,d,e
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,0
b,0,1


In [39]:
# Print the class documentation through the public alias pd.Categorical rather
# than reaching into the internal pandas.core.arrays.categorical module path;
# both names refer to the same class, so the help text is unchanged.
help(pd.Categorical)

Help on class Categorical in module pandas.core.arrays.categorical:

class Categorical(pandas.core.arrays._mixins.NDArrayBackedExtensionArray, pandas.core.base.PandasObject, pandas.core.strings.object_array.ObjectStringArrayMixin)
 |  Categorical(values, categories=None, ordered=None, dtype: 'Dtype | None' = None, fastpath: 'bool' = False, copy: 'bool' = True)
 |  
 |  Represent a categorical variable in classic R / S-plus fashion.
 |  
 |  `Categoricals` can only take on only a limited, and usually fixed, number
 |  of possible values (`categories`). In contrast to statistical categorical
 |  variables, a `Categorical` might have an order, but numerical operations
 |  (additions, divisions, ...) are not possible.
 |  
 |  All values of the `Categorical` are either in `categories` or `np.nan`.
 |  Assigning values outside of `categories` will raise a `ValueError`. Order
 |  is defined by the order of the `categories`, not lexical order of the
 |  values.
 |  
 |  Parameters
 |  -------

In [40]:
# Print the built-in documentation for pd.crosstab (signature and parameters).
help(pd.crosstab)

Help on function crosstab in module pandas.core.reshape.pivot:

crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins: 'bool' = False, margins_name: 'str' = 'All', dropna: 'bool' = True, normalize=False) -> 'DataFrame'
    Compute a simple cross tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed.
    
    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows.
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns.
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    rownames : sequence, default None
        If passed, must match number of row arrays passed.
    colnames : sequence, default None
        If passed, must match number of column 

## pandas.cut

In [41]:
# Bin the six values into 3 equal-width, right-closed intervals.
# The lowest edge is placed slightly below the minimum (0.994 rather than 1)
# so that the minimum itself falls inside the first interval.
pd.cut(
    np.array([1, 7, 5, 4, 6, 3]),
    bins=3,
)

[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]

In [42]:
# pd.cut on a plain array returns a Categorical whose categories are the bins.
type(
    pd.cut(np.array([1, 7, 5, 4, 6, 3]), bins=3)
)

pandas.core.arrays.categorical.Categorical