In [1]:
import pandas as pd
import numpy as np

In [3]:
s = pd.Series(range(10), index = ['a','b','c','e','a','b','g','h','c','k'])

In [4]:
s

a    0
b    1
c    2
e    3
a    4
b    5
g    6
h    7
c    8
k    9
dtype: int64

In [6]:
s.index.is_unique

False

In [37]:
s3 = pd.Series(['c','a','b','a','c','r','b','k'])

In [38]:
uniques = s3.unique()

In [39]:
uniques

array(['c', 'a', 'b', 'r', 'k'], dtype=object)

In [46]:
### You can sort it too
uniques.sort()

In [47]:
uniques

array(['a', 'b', 'c', 'k', 'r'], dtype=object)

In [None]:
### The isin method is responsible for vectorized set membership and can be very useful in filtering a data set down
### to a subset of values in a Series or column in a DF

In [48]:
foo = s3.isin(['b','c'])

In [49]:
foo

0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
dtype: bool

In [50]:
s3

0    c
1    a
2    b
3    a
4    c
5    r
6    b
7    k
dtype: object

In [None]:
### As you can see, wherever b or c is encountered, True is returned and False otherwise

In [51]:
s3[foo]

0    c
2    b
4    c
6    b
dtype: object

In [53]:
### The above technique will help us in retreiving the positions of the labels we are looking for

In [55]:
foo1 = s3.isin(['k'])

In [56]:
foo1

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
dtype: bool

In [57]:
s3[foo1]

7    k
dtype: object

In [None]:
################################################################################################################################

In [41]:
s3.value_counts()

b    2
c    2
a    2
k    1
r    1
dtype: int64

In [7]:
### Data selection is one of the main things that behaves differently with duplicates

### Indexing a value with multiple entries returns a series

s['a']

a    0
a    4
dtype: int64

In [8]:
### While single entries returns a scalar value

s['k']

9

In [10]:
### The same logic extends to indexing rows in a DataFrame

df = pd.DataFrame(np.random.randn(4,3), index = ['a','a','b','b'])

In [11]:
df

Unnamed: 0,0,1,2
a,-1.349261,0.481525,0.31441
a,-0.660846,-0.870718,-1.688254
b,0.256054,-0.517757,1.092442
b,-1.133664,2.782009,1.136657


In [12]:
df.ix['b']

Unnamed: 0,0,1,2
b,0.256054,-0.517757,1.092442
b,-1.133664,2.782009,1.136657


In [None]:
### Pandas objects are equipped with a set of common mathematical and statistical methods.
### Most of these fall into the category of reductions or summary statistics, methods that extract a single value (like sum or mean)
### from a Series or a series of values from the rows or columns of a DF
### They are all built up from ground up to exclude missing data.


In [13]:
### example

df.sum()

0   -2.887716
1    1.875060
2    0.855255
dtype: float64

In [14]:
df

Unnamed: 0,0,1,2
a,-1.349261,0.481525,0.31441
a,-0.660846,-0.870718,-1.688254
b,0.256054,-0.517757,1.092442
b,-1.133664,2.782009,1.136657


In [16]:
### The sum method adds up the elements of a column together cos axis 0 is the default
### If you wanna add over the rows, this is how you do it

df.sum(axis=1)

a   -0.553326
a   -3.219818
b    0.830739
b    2.785002
dtype: float64

In [None]:
### NA values are excluded unless the entire slice(row or column) is NA. This can be disabled using the skipna function.

In [18]:
df.idxmax()

0    b
1    b
2    b
dtype: object

In [19]:
### the above method returns the index value where the maximum value is found

### Wanna see where the minimum value is found?

df.idxmin()

0    a
1    a
2    a
dtype: object

In [None]:
### The above methods are used to return indirect statistics

In [20]:
### There are other methods for accumulations

In [21]:
df.cumsum()

Unnamed: 0,0,1,2
a,-1.349261,0.481525,0.31441
a,-2.010107,-0.389192,-1.373844
b,-1.754053,-0.906949,-0.281402
b,-2.887716,1.87506,0.855255


In [22]:
df

Unnamed: 0,0,1,2
a,-1.349261,0.481525,0.31441
a,-0.660846,-0.870718,-1.688254
b,0.256054,-0.517757,1.092442
b,-1.133664,2.782009,1.136657


In [23]:
### As you can see, cummulative sum works in this way. x, x1 = x+x+1, x2= x1+x+2, x3 = x2+x+3

In [24]:
### The methods that return a single value, (cumsum returns accumulated results) are called reduction methods
### cumsum is an example of accumulation

In [25]:
### There are methods that can neither be categorized as reduction or accumulation. "Describe" is one such example

In [26]:
df.describe()

Unnamed: 0,0,1,2
count,4.0,4.0,4.0
mean,-0.721929,0.468765,0.213814
std,0.712567,1.645063,1.323079
min,-1.349261,-0.870718,-1.688254
25%,-1.187563,-0.605997,-0.186256
50%,-0.897255,-0.018116,0.703426
75%,-0.431621,1.056646,1.103496
max,0.256054,2.782009,1.136657


In [29]:
### On non-numeric data, Describe produces alternate summary stats

s1 = pd.Series(['a','a','b','c'] * 4)

In [30]:
s1

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [31]:
s1.describe()

count     16
unique     3
top        a
freq       8
dtype: object