In [None]:
# Notebook plotting setup: select the inline matplotlib backend and apply the
# 'ggplot' stylesheet before any figure is drawn.
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
import json
import pandas as pd

def read_jl(path):
    """Lazily yield one decoded JSON value per non-blank line of a file.

    The file is opened as UTF-8 text and stays open while the generator is
    being consumed; lines containing only whitespace are skipped.
    """
    with open(path, 'rt', encoding='utf8') as stream:
        non_blank = (raw for raw in stream if raw.strip())
        yield from map(json.loads, non_blank)
                
def read_df(path):
    """Load a crawl-status JSON Lines file into a typed DataFrame.

    Every record is expected to provide the ``size``, ``status_code``,
    ``crawl``, ``crawled_at``, ``url`` and ``crawled_url`` fields.

    Args:
        path: Path of the JSON Lines file to read.

    Returns:
        pandas.DataFrame: the raw records with ``size`` and ``status_code``
        cast to int (missing values become 0), ``crawl`` and ``crawled_at``
        parsed as Unix timestamps in seconds, plus the derived boolean columns
        ``err4xx``, ``err5xx``, ``soft404``, ``ok`` and ``redirect`` and the
        timedelta column ``time`` (``crawled_at - crawl``).
    """
    df = pd.DataFrame(list(read_jl(path)))

    # Missing status codes become 0 so the column can be cast to int; the
    # cleaned series is reused for every flag derived from it.
    status_code = df['status_code'].fillna(0).astype(int)

    df = df.assign(
        # Key access is required here: ``df.size`` is the DataFrame attribute
        # holding the total element count, not the ``size`` column.
        # Missing sizes are treated as 0, the same way as status codes.
        size=df['size'].fillna(0).astype(int),
        status_code=status_code,
        crawl=pd.to_datetime(df['crawl'], unit='s'),
        crawled_at=pd.to_datetime(df['crawled_at'], unit='s'),
        err4xx=(status_code > 399) & (status_code < 500),
        err5xx=(status_code > 499) & (status_code < 600),

        # adjust it for your task:
        # a 200 response whose final URL mentions "notfound" or "404";
        # na=False makes rows without a crawled_url count as not matching.
        soft404=(
            df['crawled_url'].str.lower().str.contains('notfound|404', na=False)
            & (status_code == 200)
        ),
    )
    # Second pass: these columns depend on the ones assigned above.
    df = df.assign(
        ok=(df['status_code'] == 200) & ~df['soft404'],
        time=df['crawled_at'] - df['crawl'],
        redirect=df['crawled_url'].notnull() & (df['url'] != df['crawled_url']),
    )
    return df

In [None]:
# df_full keeps every column of the crawl log; df is the narrow selection
# of columns used for the per-crawl summary.
df_full = read_df('../status.jl')
df = df_full[[
    'crawl', 'url',
    'ok', 'err4xx', 'err5xx', 'soft404',
    'time', 'redirect',
]]

Duplicate crawled URLs — final URLs that occur more often than there are crawls (likely soft404):

In [None]:
# Count how often each final (crawled) URL occurs over all records, then show
# only the URLs that occur more often than there are distinct crawl values,
# i.e. URLs that several records of the same crawl ended up at.
vc = df_full['crawled_url'].value_counts()
vc[vc.gt(len(df_full['crawl'].unique()))]

Crawl results:

In [None]:
# One summary row per crawl: share of ok records in percent, number of
# 4xx / 5xx / soft-404 records, number of records, and the largest
# crawled_at - crawl difference.
g = df.groupby('crawl')
res = pd.DataFrame({
    'ok': 100 * g['ok'].mean(),
    'err4xx': g['err4xx'].sum().astype(int),
    'err5xx': g['err5xx'].sum().astype(int),
    'soft404': g['soft404'].sum().astype(int),
    'count': g['ok'].count(),
    'time': pd.to_timedelta(g['time'].max()),
})
res

In [None]:
# Percentage of ok records for each crawl.
res['ok'].plot(style='o-');

In [None]:
# Draw the three error counts with a single DataFrame.plot call so that the
# y-limit is applied once all lines exist. Passing ylim=(0, None) only to the
# first of several Series.plot calls turns y autoscaling off after that first
# line, which clips the later series whenever their counts are higher.
res[['err4xx', 'soft404', 'err5xx']].plot(style='o-', ylim=(0, None));

In [None]:
# Largest crawled_at - crawl difference of each crawl, in seconds.
res['time'].dt.total_seconds().plot(legend=True);

In [None]:
# Number of records in each crawl.
res.loc[:, 'count'].plot(legend=True);