# 1. Import the modules

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
from sklearn.model_selection import (
    train_test_split
)
import os
%matplotlib inline

# 2. Summary of the data set

### 2.1 Read the data and split it into a 70% training set and a 30% test set

In [108]:
# Load the raw contraceptive data set, then hold out 30% of the rows as a
# test set. The fixed random_state makes the split reproducible.
df = pd.read_csv("../data/raw/contraceptive.csv")
split_frames = train_test_split(df, test_size=0.3, random_state=123)
train_df, test_df = split_frames
train_df.head()

Unnamed: 0,Wife Age,Wife education,Husband education,Number of children ever born,Wife religion,Wife now working?,Husband occupation,Standard-of-living index,Media exposure,Contraceptive method used
834,41,1,4,9,1,1,2,1,0,3
491,40,4,4,6,0,0,1,4,0,2
376,40,3,3,3,1,1,2,2,1,1
250,34,4,4,3,0,1,1,2,0,1
228,47,1,4,8,1,1,3,3,0,1


In [109]:
# Print the column dtypes and non-null counts of the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031 entries, 834 to 1389
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   Wife Age                      1031 non-null   int64
 1   Wife education                1031 non-null   int64
 2   Husband education             1031 non-null   int64
 3   Number of children ever born  1031 non-null   int64
 4   Wife religion                 1031 non-null   int64
 5   Wife now working?             1031 non-null   int64
 6   Husband occupation            1031 non-null   int64
 7   Standard-of-living index      1031 non-null   int64
 8   Media exposure                1031 non-null   int64
 9   Contraceptive method used     1031 non-null   int64
dtypes: int64(10)
memory usage: 88.6 KB


In [110]:
# Summary statistics (count, mean, std, min, quartiles, max) per training column.
train_df.describe()

Unnamed: 0,Wife Age,Wife education,Husband education,Number of children ever born,Wife religion,Wife now working?,Husband occupation,Standard-of-living index,Media exposure,Contraceptive method used
count,1031.0,1031.0,1031.0,1031.0,1031.0,1031.0,1031.0,1031.0,1031.0,1031.0
mean,32.730359,2.941804,3.441319,3.335597,0.85936,0.756547,2.115422,3.132881,0.078565,1.909796
std,8.349353,1.013744,0.804236,2.412323,0.347819,0.429374,0.872039,0.969834,0.269188,0.87501
min,16.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
25%,26.0,2.0,3.0,2.0,1.0,1.0,1.0,3.0,0.0,1.0
50%,32.0,3.0,4.0,3.0,1.0,1.0,2.0,3.0,0.0,2.0
75%,39.0,4.0,4.0,5.0,1.0,1.0,3.0,4.0,0.0,3.0
max,49.0,4.0,4.0,16.0,1.0,1.0,4.0,4.0,1.0,3.0


# 3. Exploratory Data Analysis

### 3.1 Plot histograms of the features

In [128]:
# Continuous features; each one gets its own binned histogram below.
numerical_features = ["Wife Age", "Number of children ever born"]

# alt.repeat() is a placeholder that .repeat(...) fills with each feature.
binned_feature_axis = alt.X(
    alt.repeat(), type="quantitative", bin=alt.Bin(maxbins=40)
)
numeric_histogram = (
    alt.Chart(train_df)
    .mark_bar()
    .encode(x=binned_feature_axis, y="count()")
    .properties(width=300, height=200)
)
numeric_histogram.repeat(numerical_features)

In [175]:
# Every column except the target and the two continuous features is treated
# as categorical.
categorical_features = list(
    train_df.drop(
        columns=[
            "Contraceptive method used",
            "Wife Age",
            "Number of children ever born",
        ]
    ).columns
)

# One bar chart per categorical feature, split into a column per contraceptive
# method. Previously only "Wife education" was drawn and
# `categorical_features` was never used.
categorical_charts = [
    alt.Chart(train_df)
    .mark_bar()
    .encode(
        x=alt.X(feature, type="nominal", title=feature),
        y=alt.Y("count()", title="Count"),
        column=alt.Column("Contraceptive method used:O"),
    )
    for feature in categorical_features
]
# Stack the per-feature charts vertically so they render as a single output.
alt.vconcat(*categorical_charts)

### 3.2 Plot the Scatter Plot and Correlation Plot

In [111]:
# Pairwise scatter-plot matrix of the numeric columns.
# The alt.repeat("row") / alt.repeat("column") placeholders only resolve when
# the chart is wrapped in .repeat(row=..., column=...); that call was missing,
# so the chart could not be rendered.
scatter_columns = train_df.select_dtypes("number").columns.tolist()
alt.Chart(train_df).mark_point(opacity=0.3, size=10).encode(
    # x follows the grid column and y the grid row, so every panel in a
    # column shares its x field and every panel in a row shares its y field.
    x=alt.X(alt.repeat("column"), type="quantitative"),
    y=alt.Y(alt.repeat("row"), type="quantitative"),
).properties(
    width=50,
    height=50
).repeat(
    row=scatter_columns,
    column=scatter_columns,
)

In [113]:
# Long-format Spearman correlation table: one row per (level_0, level_1)
# column pair with its coefficient in "corr".
corr_df = (
    train_df.select_dtypes("number").corr("spearman").stack().reset_index(name="corr")
)
# Remove the diagonal by position rather than by value: comparing against 1
# would also erase a genuinely perfect correlation between two different
# columns, and depends on exact float equality.
corr_df.loc[corr_df["level_0"] == corr_df["level_1"], "corr"] = 0
# The absolute value drives the circle size, so strong negative and strong
# positive correlations are equally prominent.
corr_df["abs"] = corr_df["corr"].abs()
alt.Chart(corr_df).mark_circle().encode(
    x="level_0",
    y="level_1",
    size="abs",
    color=alt.Color("corr", scale=alt.Scale(scheme="blueorange", domain=(-1, 1))),
)

In [114]:
# Histogram of wife age, coloured by contraceptive method and drawn as one
# facet row per method.
wife_age_histogram = (
    alt.Chart(train_df)
    .mark_bar()
    .encode(
        alt.X("Wife Age:Q", bin=alt.Bin(maxbins=40), title="Wife Age"),
        alt.Y("count()", title="Count"),
        alt.Color("Contraceptive method used:O"),
    )
    .properties(width=300, height=200)
)
wife_age_histogram.facet(row="Contraceptive method used")

In [115]:
# Histogram of the number of children ever born, coloured by contraceptive
# method and drawn as one facet row per method.
# NOTE(review): this cell was an exact copy of the Wife Age histogram; it now
# plots the other numeric feature instead - confirm this was the intent.
alt.Chart(train_df).mark_bar().encode(
    x=alt.X(
        "Number of children ever born:Q",
        bin=alt.Bin(maxbins=40),
        title="Number of children ever born",
    ),
    y=alt.Y("count()", title="Count"),
    color="Contraceptive method used:O",
).properties(width=300, height=200).facet(row="Contraceptive method used")

In [116]:
# Box plot of wife age for each contraceptive method, one colour per method.
# The y axis is pinned to the 15-50 range.
method_field = "Contraceptive method used:O"
wife_age_axis = alt.Y("Wife Age", scale=alt.Scale(domain=(15, 50)))
wife_age_boxplot = alt.Chart(train_df).mark_boxplot().encode(
    x=method_field,
    y=wife_age_axis,
    color=method_field,
)
wife_age_boxplot.properties(width=200)