In [1]:
import numpy as np
import pandas as pd


In [2]:
### Boolean Comparisons

## Series and DataFrame have the binary comparison methods eq, ne, le, lt, ge, and gt whose behavior is vectorized:

# - eq (equivalent to ==) — equals to
# - ne (equivalent to !=) — not equals to
# - le (equivalent to <=) — less than or equals to
# - lt (equivalent to <) — less than
# - ge (equivalent to >=) — greater than or equals to
# - gt (equivalent to >) — greater than

In [3]:
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})


In [4]:
df2 = df.copy()

In [5]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [6]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [7]:
np.nan == np.nan

False

In [8]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [9]:
(df > 0).any()

one      False
two       True
three    False
dtype: bool

In [10]:
(df > 0).any().any()

True

In [11]:
## To evaluate single-element pandas objects in a boolean context, use the method bool():

In [12]:
pd.Series([True]).bool()

True

In [13]:
pd.Series([False]).bool()

False

In [14]:
pd.DataFrame([[True]]).bool()

True

In [15]:
pd.DataFrame([[False]]).bool()

False

In [16]:
### Objects comparison

## You can conveniently perform element-wise comparisons when comparing a pandas data structure with a scalar value:

In [17]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [18]:
## Pandas also handles element-wise comparisons between different array-like objects of the same length:

In [19]:
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [25]:
## Often you may find that there is more than one way to compute the same result. For example, consider df + df and df * 2. 
## To test that these two computations produce the same result, given the tools shown above, you might imagine using (df + df == df * 2).all().all().

In [23]:
(df + df == df * 2).all().all()

False

In [24]:
(df * 2 == df * 2).all().all()

False

In [26]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [27]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [28]:
## This happens because of the problem mentioned above that

In [29]:
np.nan == np.nan

False

In [30]:
## So, Pandas objects (such as Series and DataFrames) have an equals() method for testing equality, with NaNs in corresponding locations treated as equal.

In [31]:
(df + df).equals(df * 2)

True

In [32]:
### Descriptive Statistics


## There exists a large number of methods for computing descriptive statistics and other related operations on Series, DataFrame. 
## All of them are vectorized. Most of them are aggregations and produce a lower-dimensional result.

## Generally speaking, these methods take an axis as an argument and the axis can be specified by name or integer:

In [33]:
# Aggregation for each column
df.mean(0)

one     -0.477452
two      0.302817
three   -1.408413
dtype: float64

In [34]:
# Aggregation for each index
df.mean(1)

a   -0.175278
b   -0.247193
c   -0.529726
d   -0.882506
dtype: float64

In [35]:
## By applying vectorized operations, we can describe various statistical procedures, like standardization (rendering data zero mean and standard deviation 1), very concisely:

In [36]:
ts_stand = (df - df.mean()) / df.std()

In [37]:
ts_stand.std()


one      1.0
two      1.0
three    1.0
dtype: float64

In [38]:
### Describe

## There is a convenient describe() function which computes a variety of summary statistics about a Series or the columns of a DataFrame:

In [39]:
series = pd.Series(np.random.randn(1000))

In [40]:
series[::2] = np.nan

In [41]:
series.describe()

count    500.000000
mean      -0.075327
std        1.042239
min       -3.269558
25%       -0.770996
50%       -0.128995
75%        0.686795
max        2.821360
dtype: float64

In [42]:
frame = pd.DataFrame(np.random.randn(1000, 5),
   ....:                      columns=['a', 'b', 'c', 'd', 'e'])

In [43]:
frame.iloc[::2] = np.nan

In [44]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.083752,-0.033958,-0.072402,-0.000468,0.028471
std,0.986182,0.971275,1.017486,0.946903,0.965605
min,-2.722843,-2.847506,-3.089472,-3.333071,-3.275872
25%,-0.609133,-0.701838,-0.764203,-0.617643,-0.606521
50%,0.090559,-0.03734,-0.117968,0.053157,-0.022916
75%,0.766593,0.643289,0.579022,0.657906,0.672616
max,3.189406,3.115448,2.529109,2.649671,3.428582


In [45]:
## For a non-numerical Series object, describe() will give a simple summary of the number of unique values and the most frequently occurring values:

In [46]:
s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])

In [47]:
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [48]:
### Index of Min/Max Values


## The idxmin() and idxmax() functions on Series and DataFrame compute the index labels with the minimum and maximum corresponding values:

In [49]:
s1 = pd.Series(np.random.randn(5))

In [50]:
s1

0    0.843218
1    0.896581
2   -0.616874
3   -0.398317
4    0.735110
dtype: float64

In [51]:
s1.idxmin(), s1.idxmax()

(2, 1)

In [52]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])

In [53]:
df1

Unnamed: 0,A,B,C
0,0.006393,0.763998,-0.468657
1,-0.6996,1.470513,-0.249117
2,0.517087,0.590093,0.174123
3,-0.637921,0.457735,0.536992
4,-0.008791,-0.321607,-2.019351


In [54]:
df1.idxmin(axis=0)

A    1
B    4
C    4
dtype: int64

In [55]:
df1.idxmax(axis=1)

0    B
1    B
2    B
3    C
4    A
dtype: object

In [56]:
### Iterations


## The behaviour of basic iterations over pandas objects depends on the type. When iterating over a Series, it is regarded as array-like, and basic iterations produces the values. 
## DataFrames follow the dict-like convention of iterating over the keys of the objects.

## In short, basic iteration (for i in object) produces:

# - Series: values
# - DataFrame: column labels

In [57]:
df = pd.DataFrame({'col1': np.random.randn(3),
                     'col2': np.random.randn(3)}, index=['a', 'b', 'c'])

In [58]:
for col in df:
        print(col)

col1
col2


In [59]:
## To iterate over the rows of a DataFrame, you can use the following methods:

# - items(): to iterate over the (key, value) pairs.

# - iterrows(): Iterate over the rows of a DataFrame as (index, Series) pairs. 
# This converts the rows to Series objects, which can change the dtypes and has some performance implications.

# - itertuples(): Iterate over the rows of a DataFrame as namedtuples of the values. 
# This is a lot faster than iterrows() and is in most cases preferable to use to iterate over the values of a DataFrame.

In [60]:
### items

## Consistent with the dict-like interface, items() iterates through key-value pairs:

# - Series: (index, scalar value) pairs
# - DataFrame: (column, Series) pairs

## For example:

In [61]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})

In [62]:
for label, ser in df.items():
        print(label)
        print(ser) 

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


In [63]:
### iterrows

## iterrows() allows you to iterate through the rows of a DataFrame as Series objects. 
## It returns an iterator yielding each index value along with a Series containing the data in each row:

In [64]:
for row_index, row in df.iterrows():
        print(row_index, row, sep='\n')

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


In [65]:
### itertuple

## The itertuples() method will return an iterator yielding a namedtuple for each row in the DataFrame. 
## The first element of the tuple will be the row’s corresponding index value, while the remaining values are the row values.

## For example:

In [66]:
for row in df.itertuples():
        print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')
