In [1]:
# Uncomment to install the packages this notebook needs. Prefer `%pip` over
# `!pip` so the install targets the environment of the running kernel.
# %pip install pandas hvplot

In [3]:
import pandas as pd
import hvplot.pandas

# Exploratory Data Analysis with Pandas and hvPlot

Exploratory Data Analysis (EDA) is a crucial step in the data analysis process. The purpose of EDA is mainly to summarize the data into a few key statistical measures and visualizations that help us understand the underlying patterns in the data. During EDA, we are still in the discovery phase of the overall analysis process, which means that we benefit from a tool offering these three main features:

1. Accommodate rapid experimentation during data analysis
2. Give immediate results in the form of tables, numbers, or visualizations
3. Integrate text-based explanations for future use

Notebooks are excellent tools for this, as we can run multiple "experimental" analyses on our data in different cells or sections, get immediate results, and also document the thought process behind each "experiment".

In this session, we will learn two Python libraries that integrate well with notebooks for EDA: pandas for tabular data analysis and hvPlot for visualization.

## Quick Data Description

In [5]:
# Load the Titanic passenger dataset (a CSV hosted in the seaborn-data GitHub
# repository) straight from its URL; running this cell needs network access.
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv'
df = pd.read_csv(url)
# A bare DataFrame as the last expression is rendered in truncated form
# (leading and trailing rows with "..." in between), not in full.
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [6]:
# Peek at the first 5 rows (5 is also the default when no argument is given).
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [7]:
# Pass a larger number to see more rows from the top -- here the first 10.
df.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [8]:
# Look at the last 5 rows; this also reveals the final index label (890).
df.tail(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [9]:
# Summarize column names, dtypes and non-null counts. Any column whose
# non-null count is below the 891 total entries (age, embarked, deck,
# embark_town) contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


## Categorical and Continuous Variables

In [13]:
# Count the distinct values in `pclass`; a small count relative to the number
# of rows suggests the column is categorical rather than continuous.
df['pclass'].nunique()

3

In [14]:
# Number of distinct values in `sex`.
df['sex'].nunique()

2

In [15]:
# Number of distinct values in `alive`.
df['alive'].nunique()

2

In [25]:
# nunique() ignores missing values by default, so the rows without an
# `embark_town` do not add to this count.
df['embark_town'].nunique()

3

In [20]:
# List the distinct values themselves; they come back in order of first
# appearance in the column, not sorted.
df['pclass'].unique()

array([3, 1, 2])

In [21]:
# The distinct labels stored in `sex`.
df['sex'].unique()

array(['male', 'female'], dtype=object)

In [22]:
# The distinct labels stored in `alive`.
df['alive'].unique()

array(['no', 'yes'], dtype=object)

In [26]:
# Unlike nunique(), unique() includes NaN when the column has missing values,
# so the result has one more entry than the nunique() count.
df['embark_town'].unique()

array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)

In [10]:
# Frequency of each passenger class, most common first.
df['pclass'].value_counts()

pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [11]:
# How many passengers fall under each `alive` label ('no' / 'yes').
df['alive'].value_counts()

alive
no     549
yes    342
Name: count, dtype: int64

In [12]:
# Passenger count per `sex` label.
df['sex'].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [27]:
# Passenger count per embarkation town. value_counts() leaves out missing
# values by default, so these counts total 889 rather than 891.
df['embark_town'].value_counts()

embark_town
Southampton    644
Cherbourg      168
Queenstown      77
Name: count, dtype: int64

In [18]:
# Count every (sex, alive) combination. sort_index() orders the rows by group
# label instead of by frequency, which keeps related rows next to each other.
df[['sex', 'alive']].value_counts().sort_index()

sex     alive
female  no        81
        yes      233
male    no       468
        yes      109
Name: count, dtype: int64

In [28]:
# Count every (embark_town, alive) combination, sorted by group label. Rows
# with a missing `embark_town` are dropped by value_counts() by default, so
# the counts total 889 rather than 891.
df[['embark_town', 'alive']].value_counts().sort_index()

embark_town  alive
Cherbourg    no        75
             yes       93
Queenstown   no        47
             yes       30
Southampton  no       427
             yes      217
Name: count, dtype: int64

In [29]:
# Count every observed combination of the four categorical columns, sorted by
# group label. Combinations that never occur in the data are simply absent
# from the result rather than listed with a count of 0.
df[['pclass', 'sex', 'alive', 'embark_town']].value_counts().sort_index()

pclass  sex     alive  embark_town
1       female  no     Cherbourg        1
                       Southampton      2
                yes    Cherbourg       42
                       Queenstown       1
                       Southampton     46
        male    no     Cherbourg       25
                       Queenstown       1
                       Southampton     51
                yes    Cherbourg       17
                       Southampton     28
2       female  no     Southampton      6
                yes    Cherbourg        7
                       Queenstown       2
                       Southampton     61
        male    no     Cherbourg        8
                       Queenstown       1
                       Southampton     82
                yes    Cherbourg        2
                       Southampton     15
3       female  no     Cherbourg        8
                       Queenstown       9
                       Southampton     55
                yes    Cherbourg       15

## Group-by For Statistics on Groups of Data

## Visualizing Data 