In [None]:
# Install pdfkit with the %pip magic so the package lands in the environment of
# the running kernel (a `!pip` shell escape may target a different interpreter).
%pip install pdfkit




In [None]:
import os
import json


def create_platforms_json():
    """Write ./data/json/platforms.json describing the PDF directory tree.

    The tree is read as ./data/files/pdf/<company>/<policy>/<entry>. For each
    company directory one object is emitted with the keys "name", "slug"
    (lower-cased name) and "policies". Each policy object carries "name",
    "slug" ('ts', 'cg', 'pp' or '' for unknown policy names), "dates" (the
    part before the first '_' of at most 8 entry names) and "diffchecks"
    (whatever get_diffchecks(company, policy) returns).

    Directory listings are sorted so that the generated file, and in
    particular which 8 dates are kept, is reproducible between runs.
    Entries that are not directories are skipped instead of crashing
    os.listdir.
    """
    pdf_root = './data/files/pdf/'
    policy_slugs = {
        'Terms of Service': 'ts',
        'Community Guidelines': 'cg',
        'Privacy Policy': 'pp',
    }
    max_dates = 8  # upper bound on the number of dates listed per policy

    platforms = []
    for company in sorted(os.listdir(pdf_root)):
        company_dir = os.path.join(pdf_root, company)
        # Stray files (PDFs, .DS_Store, ...) are not company folders.
        if '.pdf' in company or not os.path.isdir(company_dir):
            continue

        policies = []
        for policy in sorted(os.listdir(company_dir)):
            policy_dir = os.path.join(company_dir, policy)
            if not os.path.isdir(policy_dir):
                continue
            # Per the original author's note: list of json objects {date1, date2, html}.
            diffchecks = get_diffchecks(company, policy)
            entries = sorted(os.listdir(policy_dir))[:max_dates]
            dates = [entry.split('_')[0] for entry in entries]
            policies.append({
                'name': policy,
                'slug': policy_slugs.get(policy, ''),
                'dates': dates,
                'diffchecks': diffchecks,
            })

        platforms.append({"name": company, "slug": company.lower(), "policies": policies})

    with open('./data/json/platforms.json', 'w') as fi:
        json.dump(platforms, fi)
    print("Platform Overview Created...")

In [None]:
import difflib
import os
import sys
from datetime import date
from datetime import datetime
from datetime import timedelta

import pandas as pd
import pdfkit
import requests

def get_url_list(url):
    """Query the Wayback Machine CDX API for successful captures of `url`.

    The request asks for JSON output, collapses consecutive captures with the
    same digest and keeps only captures with status code 200.

    Args:
        url: The page URL to look up.

    Returns:
        The rows of the JSON response without its first (header) row, or an
        empty list when the request fails or the response is not a JSON list.
    """
    # Passing the query as `params` lets requests URL-encode the target URL,
    # which matters when it contains characters such as '&' or '?'.
    params = {
        'url': url,
        'output': 'json',
        'collapse': 'digest',
        'filter': 'statuscode:200',
    }
    try:
        response = requests.get('http://web.archive.org/cdx/search/cdx', params=params, timeout=60)
        rows = response.json()
    except (requests.RequestException, ValueError) as err:
        # Network failure, timeout, or a body that is not valid JSON.
        print("Could not fetch capture list: " + str(err))
        return []
    if not isinstance(rows, list):
        return []
    return rows[1:]


def filter_list_by_timeframe(time_in_days, timestamp1, timestamp2):
    """Tell whether two timestamps lie at least `time_in_days` days apart.

    Only the leading YYYYMMDD part of each timestamp is used; the order of the
    two timestamps does not matter.

    Returns:
        False when the absolute day difference is smaller than
        `time_in_days`, True otherwise.
    """
    first_parts = (timestamp1[:4], timestamp1[4:6], timestamp1[6:8])
    second_parts = (timestamp2[:4], timestamp2[4:6], timestamp2[6:8])

    first_day = date(*[int(part) for part in first_parts])
    second_day = date(*[int(part) for part in second_parts])

    gap = second_day - first_day
    return not (abs(gap.days) < time_in_days)


def get_file_diff(file1, file2):
    """Return the difflib.Differ comparison of two sequences as a list.

    Each item of the result is one element of the inputs prefixed with the
    two-character Differ code ('  ', '- ', '+ ' or '? ').
    """
    return list(difflib.Differ().compare(file1, file2))


def get_text(doc):
    """Concatenate the text of every page of `doc`, in page order.

    `doc` must offer a pages() iterable whose items have a getText() method.
    """
    return "".join(page.getText() for page in doc.pages())


def read_date(date_string):
    """Parse the leading YYYYMMDD of `date_string` into a datetime.date.

    The argument is converted with str() first, so integers are accepted;
    anything after the eighth character is ignored.
    """
    stamp = str(date_string)
    year, month, day = [int(stamp[lo:hi]) for lo, hi in ((0, 4), (4, 6), (6, 8))]
    return date(year, month, day)


def is_earlier_than(fix_date, item_date):
    """Return True when `item_date` lies at least one day before `fix_date`.

    Note the argument order: the result is about `item_date` being the
    earlier of the two.
    """
    return (fix_date - item_date).days > 0


def adjust_timeframe(doc_list, url_start_date, url_end_date):
    """Keep the entries of `doc_list` whose date falls inside the window.

    The date of an entry is read from entry[1]. An entry is kept when its date
    is on or after `url_start_date` and strictly before `url_end_date`.

    NOTE(review): the end bound is exclusive, so entries dated exactly on
    `url_end_date` are dropped - confirm this is intended.
    """
    on_or_after_start = [
        entry for entry in doc_list
        if not is_earlier_than(read_date(url_start_date), read_date(entry[1]))
    ]
    before_end = [
        entry for entry in on_or_after_start
        if is_earlier_than(read_date(url_end_date), read_date(entry[1]))
    ]
    return before_end


def filter_list(doc_list, url_start_date, url_end_date, min_gap_days=7):
    """Thin out archive entries to those inside a window and spaced apart.

    The list is first restricted with adjust_timeframe(). The first remaining
    entry is always kept; every later entry is kept only when its timestamp
    (entry[1]) is at least `min_gap_days` days away from the last kept one.

    Args:
        doc_list: Entries whose timestamp is at index 1.
        url_start_date: Start of the window, passed to adjust_timeframe().
        url_end_date: End of the window, passed to adjust_timeframe().
        min_gap_days: Minimum distance in days between two kept entries.

    Returns:
        The thinned-out list; empty when no entry falls inside the window.
    """
    doc_list = adjust_timeframe(doc_list, url_start_date, url_end_date)
    # Without this guard an empty window raised IndexError on doc_list[0].
    if not doc_list:
        return []

    kept = [doc_list[0]]
    last_kept_time = doc_list[0][1]
    for item in doc_list[1:]:
        item_time = item[1]
        if filter_list_by_timeframe(min_gap_days, last_kept_time, item_time):
            kept.append(item)
            last_kept_time = item_time
    return kept


def pdf_from_url(option, url, path, wkhtmltopdf_path="/content/usr/bin/wkhtmltopdf"):
    """Render `url` to a PDF file at `path` using pdfkit.

    Args:
        option: Options dictionary forwarded to pdfkit.from_url.
        url: Address of the page to render.
        path: Destination of the PDF file.
        wkhtmltopdf_path: Location of the wkhtmltopdf binary.

    Returns:
        True when pdfkit finished without raising, False otherwise.
    """
    try:
        config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
        pdfkit.from_url(url, path, options=option, configuration=config)
        return True
    except Exception as err:
        # Catch Exception rather than everything so Ctrl-C still interrupts,
        # and report the reason instead of failing silently.
        print("PDF conversion failed: " + str(err))
        return False


def download_pdf_from_url(timestamp, url, path):
    """Save the Wayback Machine capture of `url` at `timestamp` as a PDF.

    Errors raised by pdfkit are not caught here; they propagate to the caller.

    Returns:
        True once pdfkit has finished.
    """
    pdf_options = {'quiet': ''}
    snapshot_url = "http://web.archive.org/web/" + timestamp + "/" + url
    wkhtml_config = pdfkit.configuration(wkhtmltopdf="/content/usr/bin/wkhtmltopdf")
    pdfkit.from_url(snapshot_url, path, options=pdf_options, configuration=wkhtml_config)
    return True


def convert_entry_to_date(entry):
    """Turn a table entry into a datetime.date.

    Entries whose text contains 'today' map to the current date; all others
    are parsed from their leading YYYYMMDD characters.
    """
    text = str(entry)
    if 'today' in text:
        return date.today()
    pieces = [text[:4], text[4:6], text[6:8]]
    return date(*[int(piece) for piece in pieces])


def create_url_doc_timeline():
    """Derive url_doc_timeline.csv from url_doc_list.csv.

    Rows are grouped by Platform and Doctype and processed in file order. For
    every pair of consecutive rows one timeline row is written for the earlier
    row: its "To" is cut back to the day before the later row's start date
    when the two periods overlap, otherwise the earlier row's own end date is
    used. The last row of each group is written with its own start and end
    dates.

    Columns of the input are addressed by position (2, 3, 4).
    NOTE(review): presumably the input columns are Platform, Doctype, URL,
    From, To in that order - confirm against the CSV.
    """
    url_col, from_col, to_col = 2, 3, 4

    df = pd.read_csv('/content/drive/MyDrive/Code/HIIG/pgArchive/data/utils/url_doc_list.csv')
    # Rows are collected in a list and turned into a frame once;
    # DataFrame.append no longer exists in pandas 2.x.
    records = []
    # unique() keeps the order of first appearance, so the output row order is
    # the same on every run (iterating a set of strings is not).
    for company in df["Platform"].dropna().unique():
        df_comp = df[df["Platform"] == company]
        for doctype in df_comp["Doctype"].dropna().unique():
            df_comp_doc = df_comp[df_comp["Doctype"] == doctype]
            row_count = len(df_comp_doc.index)
            if row_count == 0:
                continue

            end_date_before = convert_entry_to_date(df_comp_doc.iloc[0, to_col])
            url = df_comp_doc.iloc[0, url_col]
            if row_count == 1:
                # The loop below does not run for a single-row group; without
                # this, start_date was undefined or left over from the
                # previously processed group.
                start_date = convert_entry_to_date(df_comp_doc.iloc[0, from_col])

            for i in range(1, row_count):
                start_date = convert_entry_to_date(df_comp_doc.iloc[i, from_col])
                if (end_date_before - start_date).days > 0:
                    # Previous period overlaps this one: end it the day before.
                    to_value = (start_date - timedelta(days=1)).strftime("%Y%m%d")
                else:
                    to_value = end_date_before.strftime("%Y%m%d")
                records.append(
                    {"Platform": company, "Doctype": doctype, "URL": url,
                     "From": df_comp_doc.iloc[i - 1, from_col], "To": to_value})
                end_date_before = convert_entry_to_date(df_comp_doc.iloc[i, to_col])
                url = df_comp_doc.iloc[i, url_col]

            records.append(
                {"Platform": company, "Doctype": doctype, "URL": url,
                 "From": start_date.strftime("%Y%m%d"),
                 "To": end_date_before.strftime("%Y%m%d")})

    df_final = pd.DataFrame(records, columns=["Platform", "Doctype", "URL", "From", "To"])
    df_final.to_csv('/content/drive/MyDrive/Code/HIIG/pgArchive/data/utils/url_doc_timeline.csv', index = False)


def doc_already_crawled(df_memory, identifier):
    """Return True when at least one row of `df_memory` has "ID" == identifier."""
    id_matches = df_memory["ID"] == identifier
    return bool(id_matches.any())


def create_file_suffix(doctype, company):
    """Build "<company>_<doctype>" with all spaces removed from both parts."""
    compact_doctype = doctype.replace(' ', '')
    compact_company = company.replace(' ', '')
    return "_".join((compact_company, compact_doctype))


def download_url_list_as_pdf(url_list, doctype, df_memory_comp, company):
    """Download every not-yet-crawled archive entry as a PDF.

    Args:
        url_list: Archive entries; entry[1] is the timestamp, entry[2] the URL.
        doctype: Document type, e.g. "Privacy Policy".
        df_memory_comp: Crawling memory with at least the columns "Doctype"
            and "ID".
        company: Platform name.

    Returns:
        A list with one record (dict) per successful download, using the keys
        "Platform", "Doctype", "doc_date", "filename", "crawling_date", "ID".
        Failed downloads are printed and left out.
    """
    doctype_compact = doctype.replace(' ', '')
    company_compact = company.replace(' ', '')

    # The memory holds the doctype both with spaces (records created below)
    # and without (records rebuilt from file names), so spaces are stripped
    # from the stored values before comparing. Comparing against the raw
    # column never matched the records written by this function, which made
    # already downloaded documents download again.
    stored_doctype = df_memory_comp["Doctype"].astype(str).str.replace(' ', '', regex=False)
    df_memory_comp = df_memory_comp[stored_doctype == doctype_compact]

    new_downloads = []
    file_suffix = create_file_suffix(doctype, company)
    skip_next = False
    for item in url_list:
        if skip_next:
            skip_next = False
            continue
        url = item[2]
        timestamp = item[1]
        # Special case kept from the original: the entry following these two
        # Soundcloud captures is skipped.
        # NOTE(review): "20140131114029C" carries a trailing "C", unlike the
        # other timestamp in this list - confirm it is not a typo.
        if company == "Soundcloud" and doctype in ["Community Guidelines"] and timestamp in ["20111010221822","20140131114029C"]:
            skip_next = True
        identifier = timestamp + doctype_compact + company_compact
        file_name = timestamp + "_" + file_suffix + ".pdf"
        print(identifier)
        path = "/content/drive/MyDrive/Code/HIIG/pgArchive/data/files/" + file_name
        if doc_already_crawled(df_memory_comp, identifier):
            print("Arleady in memory")
            continue
        started_at = datetime.now().strftime("%Y%m%d %H:%M:%S")
        try:
            print("Downloading...." + str(started_at))
            download_pdf_from_url(timestamp, url, path)
            print("Download finished")
            new_downloads.append({"Platform": company, "Doctype": doctype, "doc_date": timestamp, "filename": file_name,
                                  "crawling_date": date.today().strftime("%Y%m%d"), "ID": identifier})
        except Exception as e:
            # Best effort: report the failure and move on to the next entry.
            print(e)
            continue
    return new_downloads


def crawl_documents(company, doctype):
    """Download all missing archived versions of one document type of a platform.

    For every matching row of url_doc_timeline.csv the Wayback Machine capture
    list of the row's URL is fetched, narrowed with filter_list() to the row's
    From/To window and downloaded. Records of new downloads are added to
    doc_crawling_memory.csv, which is written once at the end.

    Args:
        company: Platform name as used in the "Platform" column.
        doctype: Document type as used in the "Doctype" column.
    """
    memory_path = '/content/drive/MyDrive/Code/HIIG/pgArchive/data/utils/doc_crawling_memory.csv'
    timeline_path = '/content/drive/MyDrive/Code/HIIG/pgArchive/data/utils/url_doc_timeline.csv'

    df_memory = pd.read_csv(memory_path)
    # Earlier saves wrote the index as an extra column, which comes back as
    # "Unnamed: N" columns on load; discard those artefacts.
    df_memory = df_memory.loc[:, ~df_memory.columns.str.startswith("Unnamed:")]
    # The memory may hold platform names with or without spaces (records
    # rebuilt from file names have none), so compare the space-free form.
    stored_platform = df_memory["Platform"].astype(str).str.replace(' ', '', regex=False)
    df_memory_comp = df_memory[stored_platform == company.replace(' ', '')]

    df_timeline = pd.read_csv(timeline_path)
    df_timeline_comp = df_timeline[(df_timeline["Platform"] == company)]
    df_timeline_comp = df_timeline_comp[(df_timeline_comp["Doctype"] == doctype)]

    all_new_downloads = []
    for i in range(len(df_timeline_comp.index)):
        timeline_row = df_timeline_comp.iloc[i]
        url = timeline_row["URL"]
        print(url)
        url_start_date = timeline_row["From"]
        url_end_date = timeline_row["To"]

        web_archive_list = get_url_list(url)
        if len(web_archive_list) == 0:
            continue
        web_archive_list = filter_list(web_archive_list, url_start_date, url_end_date)
        all_new_downloads.extend(
            download_url_list_as_pdf(web_archive_list, doctype, df_memory_comp, company))

    if all_new_downloads:
        # DataFrame.append no longer exists in pandas 2.x; concatenate once.
        df_memory = pd.concat([df_memory, pd.DataFrame(all_new_downloads)], ignore_index=True)
    # index=False keeps the file from gaining an index column on every run.
    df_memory.to_csv(memory_path, index=False)


def recover_memory_from_pdfs():
    """Add a memory record for every downloaded PDF that has none yet.

    File names are expected to look like <timestamp>_<company>_<doctype>.pdf.
    The record ID is timestamp + doctype + company; files whose ID is already
    present in doc_crawling_memory.csv are left alone, so running this
    function repeatedly does not create duplicate rows.
    """
    files_dir = '/content/drive/MyDrive/Code/HIIG/pgArchive/data/files/'
    memory_path = '/content/drive/MyDrive/Code/HIIG/pgArchive/data/utils/doc_crawling_memory.csv'

    df_memory = pd.read_csv(memory_path)
    known_ids = set(df_memory["ID"].astype(str))
    crawling_date = date.today().strftime("%Y%m%d")

    recovered = []
    for file in os.listdir(files_dir):
        filestring = str(file)
        if ".pdf" not in filestring:
            continue
        file_arr = filestring.split('_')
        if len(file_arr) < 3:
            # Not named <timestamp>_<company>_<doctype>.pdf; nothing to derive.
            print("Skipping unexpected file name: " + filestring)
            continue
        timestamp = file_arr[0]
        company = file_arr[1]
        doctype = file_arr[2].split('.')[0]
        identifier = timestamp + doctype + company
        if identifier in known_ids:
            continue
        known_ids.add(identifier)
        recovered.append(
            {"Platform": company, "Doctype": doctype, "doc_date": timestamp, "filename": filestring,
             "crawling_date": crawling_date, "ID": identifier})

    if recovered:
        # DataFrame.append no longer exists in pandas 2.x; concatenate once.
        df_memory = pd.concat([df_memory, pd.DataFrame(recovered)], ignore_index=True)
    df_memory.to_csv(memory_path, index=False)

In [None]:
# Crawl the archived versions of Instagram's "Privacy Policy".
# crawl_documents is defined in the function-definition cell above, which must be run first.
crawl_documents("Instagram", "Privacy Policy")

http://instagram.com/legal/privacy
20110718102340PrivacyPolicyInstagram
Downloading....20211221 15:58:18
wkhtmltopdf exited with non-zero code 1. error:
/content/usr/bin/wkhtmltopdf: /lib/x86_64-linux-gnu/libm.so.6: version `GLIBC_2.29' not found (required by /content/usr/bin/wkhtmltopdf)
/content/usr/bin/wkhtmltopdf: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.28' not found (required by /content/usr/bin/wkhtmltopdf)

20110818092924PrivacyPolicyInstagram
Downloading....20211221 15:58:18
wkhtmltopdf exited with non-zero code 1. error:
/content/usr/bin/wkhtmltopdf: /lib/x86_64-linux-gnu/libm.so.6: version `GLIBC_2.29' not found (required by /content/usr/bin/wkhtmltopdf)
/content/usr/bin/wkhtmltopdf: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.28' not found (required by /content/usr/bin/wkhtmltopdf)

20111204033633PrivacyPolicyInstagram
Downloading....20211221 15:58:18
wkhtmltopdf exited with non-zero code 1. error:
/content/usr/bin/wkhtmltopdf: /lib/x86_64-linux-gnu/libm.so.6

KeyboardInterrupt: ignored

In [None]:
# Download the wkhtmltox 0.12.5-3 package and unpack it in the working directory.
# NOTE(review): the recorded output of this cell shows the download failing with
# HTTP 404, in which case tar never runs; the asset name also targets Arch Linux -
# confirm the correct release asset for this runtime.
! wget "https://github.com/wkhtmltopdf/packaging/releases/download/0.12.5-3/wkhtmltox-0.12.5-3.archlinux-x86_64.pkg.tar.xz" && tar vxfJ "wkhtmltox-0.12.5-3.archlinux-x86_64.pkg.tar.xz"

--2021-12-21 15:53:26--  https://github.com/wkhtmltopdf/packaging/releases/download/0.12.5-3/wkhtmltox-0.12.5-3.archlinux-x86_64.pkg.tar.xz
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2021-12-21 15:53:26 ERROR 404: Not Found.



In [None]:
# Install the PyPI package named "glibc".
# NOTE(review): the recorded output shows a pure-Python wheel (py2.py3-none-any), so
# this presumably does not provide the GLIBC_2.28/2.29 symbols that the wkhtmltopdf
# binary reported as missing - confirm whether this cell is needed.
! pip install glibc

Collecting glibc
  Downloading glibc-0.6.1-py2.py3-none-any.whl (35 kB)
Installing collected packages: glibc
Successfully installed glibc-0.6.1
