In [None]:
import os
import sys

import pandas as pd
import altair as alt

# Notebook-wide Altair configuration: select the 'data_server' data transformer
# and the 'mimetype' renderer for every chart created below.
# NOTE(review): presumably 'data_server' is used so large frames are served to
# the chart rather than embedded in the notebook - confirm.
alt.data_transformers.enable('data_server')
alt.renderers.enable('mimetype')

In [None]:
# Raw complaints export, resolved relative to the notebook's working directory
# (one level up, then data/raw/).
data_path = os.path.join(os.pardir, "data", "raw", "complaints.csv")

# `parse_dates=True` only asks pandas to parse the *index* as dates; since no
# index column is set, it left "Date received" as plain strings. Naming the
# column explicitly yields a real datetime64 column.
complaints_df = pd.read_csv(data_path, parse_dates=["Date received"])

In [None]:
# Look up the names of the columns at positions 9 and 16.
# NOTE(review): presumably the positions reported by a read_csv mixed-dtype
# warning - confirm.
inspected_positions = [9, 16]
complaints_df.columns[inspected_positions]

In [None]:
# Preview the first five rows (the pandas default for head()).
complaints_df.head(n=5)

In [None]:
# Print a concise summary of the frame (column names, non-null counts, dtypes).
complaints_df.info()

In [None]:
# Descriptive statistics for the columns pandas selects by default.
complaints_df.describe()

# Missing Values

In [None]:
# Render the first ten rows with missing cells highlighted.
complaints_df.head(10).style.highlight_null()

In [None]:
num_complaints = 2000

# Boolean "is missing" mask for the first `num_complaints` rows, reshaped to
# long form: one record per (row index, column name) cell.
complaints_head = complaints_df.head(num_complaints)
null_mask_long = complaints_head.isna().reset_index().melt(id_vars='index')

# Heatmap with one rectangle per cell, coloured by whether the value is missing.
missing_vals = (
    alt.Chart(
        null_mask_long,
        title=f"Missing Values of Most Recent {num_complaints} Complaints",
    )
    .mark_rect()
    .encode(
        alt.X('index:O', axis=None),
        alt.Y('variable', title=None),
        alt.Color('value', title='Missing Value'),
        # Stroke is the outline of each rectangle; it is encoded by the same field.
        alt.Stroke('value'),
    )
    # One pixel per row, capped at 1000 px wide.
    .properties(width=min(1000, len(complaints_head)))
)
display(missing_vals)

# Distribution of Categorical Variables

In [None]:
num_bars = 10

# Loop-invariant inputs, computed once instead of on every iteration.
# `num_complaints` is defined in the missing-values cell above.
complaints_sample = complaints_df.head(num_complaints)

# Every column except the identifier, free-text and date columns. Dropping the
# labels from the column Index avoids copying the whole frame just to list names.
categorical_cols = complaints_df.columns.drop(
    ["Complaint ID", "Consumer complaint narrative", "Date received"]
)

# One horizontal bar chart per column showing its most frequent values.
for col in categorical_cols:
    # Occurrences of each distinct value of `col` within the sampled rows.
    counts = complaints_sample.groupby(col).size().reset_index(name='counts')

    count_bar = alt.Chart(
        counts,
        title = f"{num_bars} Most Common {col} Reported"
    ).mark_bar().encode(
        x='counts',
        y=alt.Y(col, type = "nominal", sort = "x"),
    ).transform_window(
        # Rank values by descending count so the filter keeps the top ones.
        # NOTE(review): tied counts share a rank, so more than `num_bars` bars
        # may be shown when there is a tie at the cutoff - confirm acceptable.
        rank='rank(counts)',
        sort=[alt.SortField('counts', order='descending')]
    ).transform_filter(
        (alt.datum.rank <= num_bars)
    )
    display(count_bar)
    


In [None]:
# Peek at the first ten values of the "Consumer disputed?" column.
complaints_df.loc[:, "Consumer disputed?"].head(10)