In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the third-party packages imported below (geocoder and bs4).
# NOTE(review): versions are not pinned, so reruns may pick up different releases.
!pip install geocoder
!pip install bs4

# Data Extraction Protocol for Iron Age Archaeological Sites of Turkey Dataset

In [None]:
# packages
from bs4 import BeautifulSoup
import json
import pandas as pd
import geocoder

In [None]:
def read_html(p: str):
    """Parse the HTML file at path ``p`` and return the BeautifulSoup tree.

    The file is opened as UTF-8 text and passed to ``BeautifulSoup`` with the
    feature string "html".

    NOTE(review): "html" looks like a feature name rather than a specific
    parser, so the parser actually used may depend on what is installed --
    confirm.
    """
    with open(p, mode="r", encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html")
    return soup
    
def save_json(p: str, o):
    """Write ``o`` to path ``p`` as indented, non-ASCII-escaped UTF-8 JSON.

    Parameters:
        p: destination file path; an existing file is overwritten.
        o: any object ``json.dumps`` can serialize.

    Raises:
        TypeError / ValueError: propagated from ``json.dumps`` when ``o`` is
            not serializable. In that case the destination file is untouched.
    """
    # Serialize before opening: mode "w" truncates the target immediately, so
    # encoding inside the ``with`` block would wipe an existing file whenever
    # serialization fails.
    payload = json.dumps(o, ensure_ascii=False, indent=2)
    with open(p, "w", encoding="utf-8") as f:
        f.write(payload)

In [None]:
# Parse the locally saved HTML file; the extraction step below reads from this tree.
page = read_html("taydata.html")

In [None]:
# Provenance dates, stored later in the output metadata under
# "acquisition-date" and "tay-update-date" respectively.
retrieval_date = "2021-03-21"
update_date = "2021-01-01"

In [None]:
def get_section(p, section_id: str):
    """Return the first ``section`` element of ``p`` whose ``id`` is ``section_id``.

    The lookup is delegated to ``p.find``; whatever that call returns is
    passed back unchanged.
    """
    matching_section = p.find("section", id=section_id)
    return matching_section

def get_date_range(el):
    """Read the date range and period name attached to element ``el``.

    The start and end are taken from the ``span`` elements with classes
    "date-start" and "date-end" inside the child of class "metadata". Empty
    text yields ``None``, otherwise the text is converted with ``int``. The
    period name is the element's ``id`` with "-data" removed.

    Returns:
        dict with keys "date-start", "date-end" and "period-name".
    """
    meta = el.find(class_="metadata")

    def _span_text(css_class):
        # Whitespace-stripped text of the metadata span carrying this class.
        return meta.find("span", class_=css_class).get_text(strip=True)

    start_txt = _span_text("date-start")
    end_txt = _span_text("date-end")
    start = int(start_txt) if start_txt else None
    end = int(end_txt) if end_txt else None
    period = el["id"].replace("-data", "")
    return {"date-start": start, "date-end": end, "period-name": period}

def get_section_date_range(page, sid):
    """Return the date-range dict of the section of ``page`` whose id is ``sid``."""
    return get_date_range(get_section(page, sid))

In [None]:
def get_data_section(el):
    """Return the first ``section`` with class "data" found under ``el``."""
    data_node = el.find("section", class_="data")
    return data_node

def extract_id_no(el) -> str:
    """Extract the ``CagNo`` value from the first link in ``el`` that carries one.

    Parameters:
        el: element exposing ``find``; its first ``a`` whose ``href`` contains
            "CagNo" is used.

    Returns:
        The text between "CagNo=" and the next "&", or the end of the href
        when "CagNo" is the last parameter.

    Raises:
        ValueError: when no link with "CagNo" in its href exists under ``el``.
    """
    # Attribute filters are also invoked for anchors that have no href at all
    # (the value is then None), so guard before the substring test.
    link = el.find("a", href=lambda x: x is not None and "CagNo" in x)
    if link is None:
        raise ValueError("no <a> element with 'CagNo' in its href was found")
    href = link["href"]
    tail = href[href.find("CagNo"):]
    # str.find returns -1 when there is no "&"; slicing with -1 would silently
    # drop the last character of the id, so only cut when a separator exists.
    end = tail.find("&")
    if end != -1:
        tail = tail[:end]
    return tail.replace("CagNo=", "")

def extract_info(el) -> dict:
    """Extract the descriptive fields of a site from element ``el``.

    The text of the ``font`` element with ``size=1`` is split on "-". Empty
    pieces are dropped and the rest are stripped. The fields are then read
    from the END of that list: last is "region", then "city",
    "administrative-division", "research-status" and "site-type".

    Raises:
        IndexError: when fewer than five pieces are available.
    """
    raw_text = el.find("font", size=1).get_text(strip=True)
    parts = []
    for piece in raw_text.split("-"):
        if piece:
            parts.append(piece.strip())
    # Field names listed in the order they are counted from the end.
    field_names = (
        "region",
        "city",
        "administrative-division",
        "research-status",
        "site-type",
    )
    return {name: parts[-pos] for pos, name in enumerate(field_names, start=1)}

def info_tpl(info):
    """Convert an info dict into a tuple of ``(key, value)`` pairs.

    The pairs always appear in the order region, city,
    administrative-division, research-status, site-type.
    """
    ordered_keys = (
        "region",
        "city",
        "administrative-division",
        "research-status",
        "site-type",
    )
    return tuple((key, info[key]) for key in ordered_keys)

def date_range_tpl(date_range):
    """Convert a date-range dict into a tuple of ``(key, value)`` pairs.

    The pairs appear in the order date-start, date-end, period-name.
    """
    ordered_keys = ("date-start", "date-end", "period-name")
    return tuple((key, date_range[key]) for key in ordered_keys)

def take_subrows(trows: list, counter: int) -> list:
    """Return up to three consecutive items of ``trows`` starting at index ``counter``."""
    window = slice(counter, counter + 3)
    return trows[window]

def rearrange_subrow(subrow: list) -> tuple:
    """Turn one group of table rows into an ``(id, info)`` pair.

    The id is read from the first row of the group and the info dict from
    the second row.
    """
    site_id = extract_id_no(subrow[0])
    site_info = extract_info(subrow[1])
    return (site_id, site_info)

def get_table_rows(data_section):
    """Collect every ``tr`` under ``data_section`` and chunk them in groups of three.

    Returns a list of row groups; the last group is shorter when the number
    of rows is not a multiple of three.
    """
    table_rows = data_section.find_all("tr")
    return [
        take_subrows(table_rows, start)
        for start in range(0, len(table_rows), 3)
    ]

def arrange_data(page, sid: str, data):
    """Merge the sites listed in section ``sid`` of ``page`` into ``data``.

    For every row group of the section's data table, the site id and info
    are extracted and recorded in ``data[site_id]`` under:

    * "active-dates": list of distinct date-range tuples the site appears in
    * "infos": list of distinct info tuples seen for the site

    Both lists keep first-seen order. Previously they were rebuilt through a
    ``set`` on every update, which made their order (and therefore the saved
    output and whichever info is picked later) vary between runs.

    Parameters:
        page: parsed document passed on to ``get_section``.
        sid: id of the section to process.
        data: dict updated in place.

    Returns:
        None.
    """
    section = get_section(page, sid)
    # The date range is a property of the section, so build its tuple once.
    date_tpl = date_range_tpl(get_date_range(section))
    rows = get_table_rows(get_data_section(section))
    for subrow in rows:
        data_id, info = rearrange_subrow(subrow)
        infotpl = info_tpl(info)
        entry = data.setdefault(data_id, {"active-dates": [], "infos": []})
        # Order-preserving de-duplication.
        if date_tpl not in entry["active-dates"]:
            entry["active-dates"].append(date_tpl)
        if infotpl not in entry["infos"]:
            entry["infos"].append(infotpl)

def extract_data(page):
    """Build the full dataset from every ``section`` of ``page`` that has an id.

    Each section is merged into one dict of sites through ``arrange_data``.
    Afterwards each site's "infos" list is replaced by its last element.

    Returns:
        dict with two keys: "data" (site id -> record) and "metadata", which
        holds "date-range-assumptions" mapping each section id, minus
        "-data", to that section's date range.
    """
    section_ids = [sec["id"] for sec in page.find_all("section", id=True)]

    sites = {}
    for section_id in section_ids:
        arrange_data(page, section_id, sites)

    # Keep a single info tuple per site.
    for record in sites.values():
        record["infos"] = record["infos"].pop()

    assumptions = {}
    for section_id in section_ids:
        period = section_id.replace("-data", "")
        assumptions[period] = get_section_date_range(page, section_id)

    return {
        "data": sites,
        "metadata": {"date-range-assumptions": assumptions},
    }

In [None]:
# Run the extraction, then attach the provenance dates to the metadata block.
data = extract_data(page)
data["metadata"]["acquisition-date"] = retrieval_date
data["metadata"]["tay-update-date"] = update_date

In [None]:
# Persist the extracted data together with its metadata.
save_json("tayraw.json",data)

In [None]:
def get_info_key(key, data):
    """Return the distinct values of info field ``key`` across all records.

    Parameters:
        key: info field name, e.g. "city".
        data: dict mapping record id -> record; each record's "infos" must
            be convertible with ``dict``.

    Returns:
        list of distinct values in first-seen order. The order is
        deterministic, unlike the earlier ``list(set(...))`` result.

    Raises:
        ValueError: when a record has no "infos" entry; the message names the
            offending record.
        KeyError: when a record's infos lack ``key``.
    """
    # A dict is used as an insertion-ordered set.
    seen = {}
    for record_id, record in data.items():
        if "infos" not in record:
            raise ValueError(
                f"record {record_id!r} has no 'infos' entry: {record!r}"
            )
        seen[dict(record["infos"])[key]] = None
    return list(seen)

def group_by_info_key(key, data):
    """Group record ids by the value of info field ``key``.

    Parameters:
        key: info field name, e.g. "city".
        data: dict mapping record id -> record with an "infos" entry
            convertible with ``dict``.

    Returns:
        dict mapping each distinct value of ``key`` to the list of record ids
        carrying it. Keys appear in first-seen order and ids in iteration
        order of ``data``, so the result is reproducible.

    Raises:
        ValueError: when a record has no "infos" entry.
        KeyError: when a record's infos lack ``key``.
    """
    grouped = {}
    for record_id, record in data.items():
        if "infos" not in record:
            raise ValueError(
                f"record {record_id!r} has no 'infos' entry: {record!r}"
            )
        value = dict(record["infos"])[key]
        # One pass: create the bucket on first sight, then append.
        grouped.setdefault(value, []).append(record_id)
    return grouped

def get_city(data) -> dict:
    """Group record ids by their "city" info value.

    Returns the dict produced by ``group_by_info_key``: value -> list of ids.
    """
    return group_by_info_key("city", data)

def get_region(data) -> dict:
    """Group record ids by their "region" info value.

    Returns the dict produced by ``group_by_info_key``: value -> list of ids.
    """
    return group_by_info_key("region", data)

def get_research_status(data) -> dict:
    """Group record ids by their "research-status" info value.

    Returns the dict produced by ``group_by_info_key``: value -> list of ids.
    """
    return group_by_info_key("research-status", data)

def get_site_type(data) -> dict:
    """Group record ids by their "site-type" info value.

    Returns the dict produced by ``group_by_info_key``: value -> list of ids.
    """
    return group_by_info_key("site-type", data)

def get_administrative_division(data) -> dict:
    """Group record ids by their "administrative-division" info value.

    Returns the dict produced by ``group_by_info_key``: value -> list of ids.
    """
    return group_by_info_key("administrative-division", data)

In [None]:
# Write one JSON file per info field, each mapping a field value to the list
# of site ids that carry it.
save_json("tay_by_city.json", get_city(data["data"]))
save_json("tay_by_region.json", get_region(data["data"]))
save_json("tay_by_research_status.json", get_research_status(data["data"]))
save_json("tay_by_site_type.json", get_site_type(data["data"]))
save_json("tay_by_administrative_division.json", get_administrative_division(data["data"]))

In [None]:
# geocoding helpers (OpenStreetMap lookups through the geocoder package)
def addr_from_info(info: dict):
    """Build the address string used for geocoding a site.

    The result is "<administrative-division>,<city>,Türkiye", joined with
    commas and no spaces.
    """
    components = (info["administrative-division"], info["city"], "Türkiye")
    return ",".join(components)

def get_geoinfo(info: dict, ads: dict):
    """Geocode the address of ``info`` unless it is already cached in ``ads``.

    Parameters:
        info: site info dict, turned into an address by ``addr_from_info``.
        ads: cache mapping address -> geojson; updated in place on a miss.

    Returns:
        ``(False, address)`` when the address was already in ``ads``, so the
        caller reads the cached value itself. Otherwise ``(True, geojson)``,
        where the geojson comes from ``geocoder.osm(address, maxRows=10)``.
    """
    address = addr_from_info(info)
    if address not in ads:
        lookup = geocoder.osm(address, maxRows=10)
        result = lookup.geojson
        ads[address] = result
        return True, result
    return False, address

def get_geodata(data):
    """Geocode every record of ``data`` and return a dict id -> geojson.

    Lookups are cached per address, so records that share an address receive
    the very same geojson object (not a copy).
    """
    lookup_cache = {}
    per_site = {}
    for site_id, record in data.items():
        site_info = dict(record["infos"])
        fresh, payload = get_geoinfo(site_info, lookup_cache)
        if fresh is False:
            # On a cache hit the payload is the address key, not the geojson.
            per_site[site_id] = lookup_cache[payload]
        else:
            per_site[site_id] = payload
    return per_site

In [None]:
# Geocode every site address, then save the result keyed by site id.
geodata = get_geodata(data["data"])  # takes a lot of time
save_json("tay_by_geodata.json", geodata)

In [None]:
def add_info_to_dfdict(dfdict, info, k):
    """Append one record's info values and its id to the column lists of ``dfdict``.

    Every key of ``info`` must already be a column of ``dfdict``; ``k`` is
    appended to the "id" column. ``dfdict`` is modified in place and returned.
    """
    for column in info:
        dfdict[column].append(info[column])
    dfdict["id"].append(k)
    return dfdict
    

def info_dataframe(data):
    """Flatten ``data`` into a DataFrame with one row per (site, active date range).

    Columns: region, city, administrative-division, research-status,
    site-type, id, date-start, date-end, period-name. A site's info values
    are repeated for each of its active date ranges.
    """
    column_names = (
        "region",
        "city",
        "administrative-division",
        "research-status",
        "site-type",
        "id",
        "date-start",
        "date-end",
        "period-name",
    )
    columns = {name: [] for name in column_names}
    for site_id, record in data.items():
        site_info = dict(record["infos"])
        for active in record["active-dates"]:
            date_fields = dict(active)
            add_info_to_dfdict(columns, site_info, site_id)
            for field, value in date_fields.items():
                columns[field].append(value)
    return pd.DataFrame(columns)

In [None]:
# Flatten the site records into a table and export it as both JSON and CSV.
dfi = info_dataframe(data["data"])
dfi.to_json("tay_by_info.json", force_ascii=False, indent=2)
dfi.to_csv("tay_by_info.csv")

In [None]:
def get_group_serie_df(df: pd.DataFrame, gnames: list, condition=None):
    """Count "id" entries per group of ``gnames``.

    When ``condition`` is given it is used to filter ``df`` (``df[condition]``)
    before grouping. Returns the Series produced by
    ``groupby(gnames)["id"].count()``.
    """
    subset = df if condition is None else df[condition]
    return subset.groupby(gnames)["id"].count()

def get_group_dict(df: pd.DataFrame, gnames=["city",
                                             "region",
                                             "research-status",
                                             "site-type",
                                             "administrative-division",
                                             "date-start",
                                             "date-end",
                                             "period-name"],
                   condition=None) -> dict:
    """Compute one id-count Series per column name in ``gnames``.

    Each name is grouped on its own (not jointly) through
    ``get_group_serie_df``, with ``condition`` passed along unchanged.
    The default ``gnames`` list is only read, never modified.

    Returns:
        dict mapping column name -> count Series.
    """
    grouped = {}
    for gname in gnames:
        grouped[gname] = get_group_serie_df(df, gname, condition)
    return grouped

def mk_condition(df, s, e, sk="date-start", ek="date-end"):
    """Build a boolean mask for rows that lie inside the interval ``[s, e]``.

    A row is selected when its ``sk`` value is >= ``s`` AND its ``ek`` value
    is <= ``e``.
    """
    starts_in_range = df[sk] >= s
    ends_in_range = df[ek] <= e
    return starts_in_range & ends_in_range

In [None]:
# all tay places
# Per-column id counts, first unfiltered and then restricted to rows whose
# date-start is >= the interval start and whose date-end is <= the interval end.
# all tay places
series = get_group_dict(dfi)

# interval -1200 to -700
series_1200_700 = get_group_dict(dfi, condition=mk_condition(dfi, s=-1200, e=-700))

# interval -800 to -600
series_800_600 = get_group_dict(dfi, condition=mk_condition(dfi, s=-800, e=-600))

# interval -600 to -300
series_600_300 = get_group_dict(dfi, condition=mk_condition(dfi, s=-600, e=-300))

In [None]:
def serie_stats(s: pd.Series):
    """Summarize a count Series and rank each entry against the mean.

    Returns a 3-tuple:
        * dict with "descriptive-stats" (``s.describe()`` as a dict),
          "below-mean" and "above-mean" (entries strictly below / above the
          mean, as dicts)
        * DataFrame with column "counts" (the Series) and column "rank"
          holding "below-mean", "above-mean", or None for entries equal to
          the mean
        * the ``s.describe()`` Series itself
    """
    summary = s.describe()
    mean_value = s.mean()
    # The bounds against max/min are kept so the masks match the original
    # expressions exactly.
    above = (s <= s.max()) & (s > mean_value)
    below = (s >= s.min()) & (s < mean_value)

    ranked = pd.DataFrame({"counts": s})
    ranked["rank"] = None
    # The two masks cannot overlap, so assignment order does not matter.
    ranked.loc[below, "rank"] = "below-mean"
    ranked.loc[above, "rank"] = "above-mean"

    report = {
        "descriptive-stats": summary.to_dict(),
        "below-mean": s.loc[below].to_dict(),
        "above-mean": s.loc[above].to_dict(),
    }
    return report, ranked, summary

def save_serie_stats(ss: dict, prefix="tay_stats_by_", suffix="", ext="json"):
    """Write the statistics of every Series in ``ss`` to disk.

    The base file name is ``prefix`` + key (with "-" replaced by "_") +
    ``suffix``.

    * ``ext == "json"``: one file ``<base>.json`` holding the stats dict.
    * any other ``ext``: two CSV-formatted files, ``<base>.<ext>`` (counts
      with rank) and ``<base>_descriptive.<ext>`` (describe() output).
    """
    for sk, serie in ss.items():
        base_name = prefix + sk.replace("-", "_") + suffix
        statd, df, sstat = serie_stats(serie)
        if ext != "json":
            ranked_path = base_name + "." + ext
            descriptive_path = base_name + "_descriptive" + "." + ext
            df.to_csv(ranked_path)
            sstat.to_csv(descriptive_path)
        else:
            save_json(base_name + "." + ext, statd)

In [None]:
# Export the statistics for each grouping; with ext="csv" every grouping
# produces a ranked-counts file and a "_descriptive" file.
save_serie_stats(series, suffix="-forall-periods", ext="csv")
save_serie_stats(series_1200_700, suffix="-between-1200-700-bce", ext="csv")
save_serie_stats(series_800_600, suffix="-between-800-600-bce", ext="csv")
save_serie_stats(series_600_300, suffix="-between-600-300-bce", ext="csv")

In [None]:
def combine_data(data, geodata):
    """Attach each record's geocoding result under the key "geodata".

    Parameters:
        data: dict id -> record; modified in place.
        geodata: dict id -> geojson.

    Returns:
        The same ``data`` object. Every record gets a "geodata" key; records
        without an entry in ``geodata`` get ``None``. Ids present only in
        ``geodata`` are ignored.
    """
    # Iterate over the records rather than over geodata, so that the
    # ``.get`` default is meaningful and no record is left without the key.
    for record_id, record in data.items():
        record["geodata"] = geodata.get(record_id)
    return data

In [None]:
# combine_data adds the "geodata" key to the records of data["data"] in place
# and returns that same dict; the cells below read that key.
all_data = combine_data(data["data"], geodata)
save_json("tay_data.json", all_data)

In [None]:
# now let's make some historical maps
def groupby_active_dates(data, metadata):
    """Group record ids by each active date range they belong to.

    Parameters:
        data: dict id -> record; each record's "active-dates" is an iterable
            of hashable date-range tuples.
        metadata: accepted for interface compatibility and not used. The
            earlier lookup tables built from it were never read.

    Returns:
        dict mapping date-range tuple -> list of record ids, in iteration
        order of ``data``.
    """
    date_ranges = {}
    for record_id, record in data.items():
        for adate in record["active-dates"]:
            date_ranges.setdefault(adate, []).append(record_id)
    return date_ranges


def get_date_geodata(data):
    """Build one geojson FeatureCollection per period.

    Sites are grouped by active date range; the period name is the value of
    the last pair of the date-range tuple. For every site the feature with
    the highest ``properties["accuracy"]`` is kept (none when the site has
    no features).

    Returns:
        dict period name -> {"place-ids": [...], "date-range": {...},
        "geodata": FeatureCollection}.
    """
    by_range = groupby_active_dates(data["data"], data["metadata"])
    result = {}
    for date_range, site_ids in by_range.items():
        period = date_range[-1][1]
        bucket = result.setdefault(period, {})
        bucket["place-ids"] = site_ids
        bucket["date-range"] = dict(date_range)
        collection = {"type": "FeatureCollection", "features": []}
        for site_id in site_ids:
            features = data["data"][site_id]["geodata"]["features"]
            # Ascending sort: the last element is the most accurate feature.
            ranked = sorted(
                features, key=lambda feat: feat["properties"]["accuracy"]
            )
            if ranked:
                collection["features"].append(ranked[-1])
        bucket["geodata"] = collection
    return result

In [None]:
# Build and save the per-period feature collections; this reads the "geodata"
# key of each record in data["data"].
date_geodata = get_date_geodata(data)
save_json("tay_by_date_geodata.json", date_geodata)

In [None]:
def mk_geodata_with_info(data):
    """Build a geojson FeatureCollection with one annotated feature per site.

    For each record the feature with the highest ``properties["accuracy"]``
    is selected and annotated with:

    * "tay-infos": the record's infos as a dict
    * "tay-active-dates": list of the record's date ranges as dicts
    * "tay-place-id": the record id

    Records without any feature are skipped.

    Parameters:
        data: dict id -> record with "geodata", "infos" and "active-dates".

    Returns:
        dict ``{"type": "FeatureCollection", "features": [...]}``. Every
        feature is an independent copy; the input geodata is left untouched.
    """
    import copy

    geo = {"type": "FeatureCollection", "features": []}
    for key, vdata in data.items():
        key_features = vdata["geodata"]["features"]
        # Ascending sort: the last element is the most accurate feature.
        ranked = sorted(key_features, key=lambda x: x["properties"]["accuracy"])
        if not ranked:
            continue
        # Records may share one geojson object. Annotating the shared dict in
        # place would overwrite the previous site's properties and append the
        # same object again, so work on a private copy.
        feature = copy.deepcopy(ranked[-1])
        feature["properties"]["tay-infos"] = dict(vdata["infos"])
        feature["properties"]["tay-active-dates"] = [
            dict(a) for a in vdata["active-dates"]
        ]
        feature["properties"]["tay-place-id"] = key
        geo["features"].append(feature)
    return geo

In [None]:
# Build the final feature collection (one annotated feature per site) and save it.
geo_data_with_info = mk_geodata_with_info(data["data"])
save_json("tay_as_geojson.json", geo_data_with_info)