# CGUs Statistics

In [1]:
import glob
import json
import os
import pandas as pd
pd.set_option('display.max_rows', None)
import numpy as np

In [2]:
def _get_latest_file(pattern='../reports/stats_*.json'):
    """Return the path of the most recently changed file matching ``pattern``.

    Args:
        pattern: Glob pattern selecting the candidate report files. Defaults
            to the stats reports under ``../reports``.

    Returns:
        The matching path with the greatest ``os.path.getctime`` value.

    Raises:
        ValueError: If no file matches ``pattern``.
    """
    candidates = glob.glob(pattern)
    if not candidates:
        # max() on an empty list fails with an opaque "max() arg is an empty
        # sequence"; keep the exception type but say what is actually missing.
        raise ValueError(
            "No stats report found matching {!r}".format(pattern)
        )
    # NOTE: getctime is the creation time on Windows but the time of the last
    # metadata change on Unix.
    return max(candidates, key=os.path.getctime)

In [3]:
# Parse the newest stats report. The encoding is given explicitly because
# open() otherwise falls back to the platform locale, which can mis-decode
# non-ASCII text in the report on systems whose default is not UTF-8.
with open(_get_latest_file(), encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# orient="index" turns each top-level key of `data` into a row label and the
# keys of its nested mapping into columns.
df = pd.DataFrame.from_dict(
    data,
    orient="index",
)
df

Unnamed: 0,document_type,num_words,readability,readability_grade_level
Cdiscount - Commercial Terms,Commercial Terms,27935,75.795005,10.794728
Cdiscount - Trackers Policy,Trackers Policy,7715,63.765318,14.72546
Cdiscount - Terms of Service,Terms of Service,4112,62.336995,16.042495
Apple - Human Rights Policy,Human Rights Policy,1495,30.919436,16.088988
Apple - Privacy Policy,Privacy Policy,4638,28.074519,16.326577
We Heart It - Privacy Policy,Privacy Policy,6315,24.666919,18.032515
We Heart It - Terms of Service,Terms of Service,10220,31.788863,17.260412
Twitter - Privacy Policy,Privacy Policy,7388,39.033121,15.306359
Twitter - Copyright Claims Policy,Copyright Claims Policy,2359,43.987624,12.652629
Twitter - Trackers Policy,Trackers Policy,1772,43.365237,13.590663


## Stats by document type

In [6]:
# Per-document-type summary, sorted by mean readability, highest first.
#
# Aggregations are named by string instead of passing np.mean / np.std:
# pandas substitutes its own mean/std (sample standard deviation, ddof=1) for
# those NumPy callables, and that substitution is deprecated. Once it is
# removed, np.std would run as-is (population standard deviation, ddof=0) and
# silently change `ecart_type`. The string names pin the current results.
#
# `ecart_type` (French "écart type") is the standard deviation of num_words;
# it is NaN for document types that have a single document.
df.groupby("document_type").agg(
    num_docs=("num_words", "size"),
    mean_num_words=("num_words", "mean"),
    ecart_type=("num_words", "std"),
    mean_readability=("readability", "mean"),
).sort_values("mean_readability", ascending=False)

Unnamed: 0_level_0,num_docs,mean_num_words,ecart_type,mean_readability
document_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quality Guidelines,1,1479.0,,55.220054
Commercial Terms,4,10650.75,11792.555289,50.742262
Trackers Policy,9,2342.444444,2038.524473,50.448958
Brand Guidelines,2,1631.0,439.820418,49.676747
In-App Purchases Policy,1,2509.0,,42.289617
Community Guidelines,3,2149.0,1082.020333,40.769397
Review Guidelines,2,17374.5,4688.825066,39.50169
Parent Organization Privacy Policy,3,8900.0,0.0,39.012685
Developer Terms,9,4359.333333,2004.770062,38.182087
Program Policies,2,2114.5,943.987553,37.498229


### Readability Index
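[Flesch–Kincaid readability tests](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests)

The table below maps Flesch reading-ease scores (higher means easier to read) to approximate U.S. school levels.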


| Score | School level | Notes |
|:---|:---|:---|
| 100.0–90.0 | 5th grade | Very easy to read. Easily understood by an average 11-year-old student. |
| 90.0–80.0 | 6th grade | Easy to read. Conversational English for consumers. |
| 80.0–70.0 | 7th grade | Fairly easy to read. |
| 70.0–60.0 | 8th & 9th grade | Plain English. Easily understood by 13- to 15-year-old students. |
| 60.0–50.0 | 10th to 12th grade | Fairly difficult to read. |
| 50.0–30.0 | College | Difficult to read. |
| 30.0–10.0 | College graduate | Very difficult to read. Best understood by university graduates. |
| 10.0–0.0 | Professional | Extremely difficult to read. Best understood by university graduates. |