In [1]:
import pandas as pd

In [2]:
df = pd.DataFrame(
    {
        "Name": [
            "Braund, Mr. Owen Harris",
            "Allen, Mr. William Henry",
            "Bonnell, Miss. Elizabeth",
        ],
        "Age": [22, 34, 45],
        "Sex": ["male", "male", "female"],
    }
)

In [3]:
df

Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",22,male
1,"Allen, Mr. William Henry",34,male
2,"Bonnell, Miss. Elizabeth",45,female


### Each column in a df is a Series

### let's work with the data in column "Age"

In [4]:
df["Age"]

0    22
1    34
2    45
Name: Age, dtype: int64

### how to create Series from scratch

In [5]:
ages = pd.Series([22, 34, 45], name="Age")

In [6]:
ages #no column labels for series only row labels for series

0    22
1    34
2    45
Name: Age, dtype: int64

### max age of passengers

In [7]:
df['Age'].max() #by selecting the Age column from dataframe

45

In [8]:
ages.max()  # from pandas Series

45

### basic statisctics with describe() method

In [9]:
df.describe()

Unnamed: 0,Age
count,3.0
mean,33.666667
std,11.503623
min,22.0
25%,28.0
50%,34.0
75%,39.5
max,45.0


### aggregations with describe

In [10]:
import numpy as np

In [11]:
series = pd.Series(np.random.randn(1000))

In [12]:
series

0     -0.354683
1      0.801164
2     -0.661934
3     -0.197221
4      0.660566
         ...   
995    1.069035
996   -2.022516
997    1.683397
998   -0.824121
999    0.128534
Length: 1000, dtype: float64

In [13]:
series[::2] = np.nan

In [14]:
series

0           NaN
1      0.801164
2           NaN
3     -0.197221
4           NaN
         ...   
995    1.069035
996         NaN
997    1.683397
998         NaN
999    0.128534
Length: 1000, dtype: float64

In [15]:
series.describe()

count    500.000000
mean       0.103984
std        0.999584
min       -3.017099
25%       -0.552061
50%        0.123486
75%        0.841661
max        3.026988
dtype: float64

In [21]:
# np.random.randn(1000, 5) will create 5 columns of random numbers with 1000 rows
# pd.DataFrame(x, columns=['a', 'b'...]) will give the column name
frame = pd.DataFrame(np.random.randn(1000, 5), columns = ['a', 'b', 'c', 'd', 'e'])

In [22]:
frame

Unnamed: 0,a,b,c,d,e
0,1.169935,-0.405144,-0.066089,-0.358985,1.675503
1,-2.193696,0.176616,0.972226,-0.035445,1.038014
2,-2.141097,0.417685,-0.194033,1.442898,-0.269936
3,0.757818,-1.174309,-0.439817,1.495973,-0.682287
4,-0.738937,1.559066,0.455216,0.281538,-2.396321
...,...,...,...,...,...
995,-0.846564,-0.159712,2.027616,0.047650,0.404581
996,0.216871,-0.620505,0.088555,-0.905627,1.742363
997,1.418658,0.232178,1.057874,0.333730,0.249942
998,-1.023643,0.614648,0.289276,-0.132214,-0.782078


In [25]:
frame.iloc[::2] = np.nan #iloc used for integer indexing

In [20]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.035978,-0.037498,0.025229,0.00213,-0.067092
std,0.967399,1.032769,0.980106,1.029893,1.036419
min,-2.611786,-3.492245,-2.881768,-2.846208,-3.534457
25%,-0.597332,-0.726989,-0.616135,-0.707894,-0.75738
50%,0.124804,-0.053992,0.028515,-0.038079,-0.083551
75%,0.667989,0.660791,0.648947,0.707988,0.657989
max,3.067677,3.604923,2.846161,3.208011,3.16197


### we can select specific percentiles to include in the output

In [31]:
series.describe(percentiles=[0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

count    500.000000
mean       0.103984
std        0.999584
min       -3.017099
5%        -1.537426
10%       -1.233941
20%       -0.752495
30%       -0.363522
40%       -0.098143
50%        0.123486
60%        0.403755
70%        0.674694
80%        1.002943
90%        1.359714
100%       3.026988
max        3.026988
dtype: float64

### for non numerical series, describe() will give a simple summary

In [32]:
s = pd.Series(['a', 'b', 'c', 'b', 'a', 'a', 'c', 'd', 'e', 'a', 'c', 'a', 'c', np.nan, 'a', 'b', 'c'])

In [33]:
s.describe()

count     16
unique     5
top        a
freq       6
dtype: object

### On mixed type DataFrame object, describe() will restrict the summary to include only numerical columns, if none are only categorical columns will be included

In [36]:
frame = pd.DataFrame({'a':["Y", "N", "N", "Y"], 'b': range(4)})

In [37]:
frame

Unnamed: 0,a,b
0,Y,0
1,N,1
2,N,2
3,Y,3


In [38]:
frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


### However we can include non numerical columns too in describe, with paramters include/exclude

#### with include/exclude we can pass the datatypes only and 'all' 

In [40]:
frame.describe(include=['object'])

Unnamed: 0,a
count,4
unique,2
top,Y
freq,2


In [41]:
frame.describe(exclude=['number'])

Unnamed: 0,a
count,4
unique,2
top,Y
freq,2


In [42]:
frame.describe(include='all')

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Y,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


## Index of min/max values

* idxmin() will return the index of min value of series/dataframe
* idxmax() will return the index of max value of series/dataframe

In [44]:
s1 = pd.Series(np.random.randn(10))

In [45]:
s1

0    1.433208
1   -0.478271
2    0.543056
3   -0.432771
4   -0.931697
5   -1.321292
6   -1.019279
7   -1.300000
8   -0.946787
9   -2.011817
dtype: float64

In [47]:
s1.idxmin(), s1.idxmax()

(9, 0)

In [50]:
s1[9], s1[0]

(-2.0118166936234254, 1.4332078041689515)

In [51]:
s1.describe()

count    10.000000
mean     -0.646565
std       0.992817
min      -2.011817
25%      -1.229820
50%      -0.939242
75%      -0.444146
max       1.433208
dtype: float64

### now check with multiple series or dataframe columns

In [53]:
df1 = pd.DataFrame(np.random.randn(7, 5), columns=['a', 'b', 'c', 'd', 'e'])

In [54]:
df1

Unnamed: 0,a,b,c,d,e
0,-0.338929,-0.412811,0.242496,-1.838294,0.588975
1,-0.826719,2.067993,0.580856,0.366991,-0.033012
2,-0.15202,-1.029013,0.573091,0.610472,0.015568
3,0.088874,-0.62965,0.58675,-0.134877,1.330945
4,-1.748392,-0.372358,1.320572,-0.6191,0.02367
5,-0.831083,0.131637,-0.237758,-0.828414,-0.742476
6,-1.039771,-1.277665,-0.187859,-0.849789,0.655105


In [58]:
df1.idxmin(), df1.idxmax()

(a    4
 b    6
 c    5
 d    0
 e    5
 dtype: int64,
 a    3
 b    1
 c    4
 d    2
 e    3
 dtype: int64)

In [63]:
df1.idxmin(axis=0) #it returns index or rown number of each column which has min value

a    4
b    6
c    5
d    0
e    5
dtype: int64

In [64]:
df1

Unnamed: 0,a,b,c,d,e
0,-0.338929,-0.412811,0.242496,-1.838294,0.588975
1,-0.826719,2.067993,0.580856,0.366991,-0.033012
2,-0.15202,-1.029013,0.573091,0.610472,0.015568
3,0.088874,-0.62965,0.58675,-0.134877,1.330945
4,-1.748392,-0.372358,1.320572,-0.6191,0.02367
5,-0.831083,0.131637,-0.237758,-0.828414,-0.742476
6,-1.039771,-1.277665,-0.187859,-0.849789,0.655105


In [62]:
df1.idxmin(axis=1) # it returns the column name of each row which has min value

0    d
1    a
2    b
3    b
4    a
5    a
6    b
dtype: object

In [67]:
df1

Unnamed: 0,a,b,c,d,e
0,-0.338929,-0.412811,0.242496,-1.838294,0.588975
1,-0.826719,2.067993,0.580856,0.366991,-0.033012
2,-0.15202,-1.029013,0.573091,0.610472,0.015568
3,0.088874,-0.62965,0.58675,-0.134877,1.330945
4,-1.748392,-0.372358,1.320572,-0.6191,0.02367
5,-0.831083,0.131637,-0.237758,-0.828414,-0.742476
6,-1.039771,-1.277665,-0.187859,-0.849789,0.655105


In [65]:
df1.idxmax(axis=0)

a    3
b    1
c    4
d    2
e    3
dtype: int64

In [66]:
df1.idxmax(axis=1)

0    e
1    b
2    d
3    e
4    c
5    b
6    e
dtype: object

### when there are multiple rows or columns matching the max or min value, the 1st matching index will be returned

In [68]:
df3 = pd.DataFrame([2,1,1,2,np.nan], columns=['A'], index=list("edcba"))

In [69]:
df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,2.0
a,


In [72]:
df3.idxmin()

A    d
dtype: object

In [73]:
df3.idxmax()

A    e
dtype: object

## Note: In numpy idxmin and idxmax is argmin and argmax

# Value counts(histogramming)/mode

## value_counts() is a series method which returns number of counts of unique values in a series

In [86]:
data = np.random.randint(1, 7, size = 50)

In [87]:
data

array([5, 3, 3, 6, 2, 4, 3, 2, 2, 5, 4, 5, 6, 4, 4, 1, 3, 4, 1, 6, 3, 4,
       5, 2, 6, 3, 1, 3, 4, 2, 4, 3, 4, 1, 5, 4, 3, 4, 1, 6, 3, 3, 1, 5,
       2, 5, 5, 2, 5, 6])

In [88]:
s = pd.Series(data)

In [89]:
s

0     5
1     3
2     3
3     6
4     2
5     4
6     3
7     2
8     2
9     5
10    4
11    5
12    6
13    4
14    4
15    1
16    3
17    4
18    1
19    6
20    3
21    4
22    5
23    2
24    6
25    3
26    1
27    3
28    4
29    2
30    4
31    3
32    4
33    1
34    5
35    4
36    3
37    4
38    1
39    6
40    3
41    3
42    1
43    5
44    2
45    5
46    5
47    2
48    5
49    6
dtype: int32

In [90]:
s.value_counts()

3    11
4    11
5     9
2     7
6     6
1     6
dtype: int64

In [92]:
pd.value_counts(data) #array is passed as an argument in value_counts method

3    11
4    11
5     9
2     7
6     6
1     6
dtype: int64

### value_counts() can be used on multiple columns

In [94]:
data = {'a': [1, 2, 3, 3, 2, 2, 3], 'b': ['a', 'b', 'a', 'c', 'a', 'b', 'a']}
data
                                          

{'a': [1, 2, 3, 3, 2, 2, 3], 'b': ['a', 'b', 'a', 'c', 'a', 'b', 'a']}

In [95]:
pd.Series(data)

a    [1, 2, 3, 3, 2, 2, 3]
b    [a, b, a, c, a, b, a]
dtype: object

In [100]:
x = pd.DataFrame(data)
x

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,a
3,3,c
4,2,a
5,2,b
6,3,a


In [101]:
x.value_counts()

a  b
2  b    2
3  a    2
1  a    1
2  a    1
3  c    1
dtype: int64

### with mode() method, we can get the most frequent values

In [105]:
s5 = pd.Series([1,2,1,3,1,3,4,2,2,4,2,1,1,7,9,2])

In [106]:
s5.mode()

0    1
1    2
dtype: int64

### now let's work with mode on multiple columns

In [108]:
df5 = pd.DataFrame(
    {
        'A': np.random.randint(1, 7, size=50),
        'B': np.random.randint(7, 13, size=50)
    }
)

In [109]:
df5

Unnamed: 0,A,B
0,2,7
1,1,9
2,5,11
3,5,12
4,3,7
5,1,8
6,2,9
7,5,11
8,5,11
9,4,11


In [111]:
df5.mode() # returns 2 and 7 are the most frequent values together

Unnamed: 0,A,B
0,2,7


In [112]:
df5['A'].mode()

0    2
Name: A, dtype: int32

In [113]:
df5['B'].mode()

0    7
Name: B, dtype: int32