In [1]:
%load_ext autoreload
%autoreload 2

In [70]:
def process_list_with_progress_bar(items, outdir, func, callback, error_callback):
    from tqdm import tqdm
    from pathlib import Path
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    with tqdm(total=len(items)) as pbar:
        for x in items:
            i, item = x
            # pbar.set_description(f"Processing {str(item)}")
            try:
                result = func(i,item)
                callback(i,item,result, outdir)
            except Exception as e:
                error_callback(i,item,e, outdir)
            pbar.update(1)

In [14]:
def dump(i, item, result, outdir):
    import json
    data = {
        "item": item,
        "result": result,
    }
    with open(outdir / f"{i}.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [66]:
def dump_err(i, item, e, outdir):
    import json
    data = {
        "item": item,
        "error": {
            "type": str(type(e)),
            "args": [str(arg) for arg in e.args],
            "message": str(e)
        },
    }
    with open(outdir / f"{i}.error.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [53]:
def input_phase():
    def process_item(i,item):
        url,notes = item.split(",",1)
        notes = notes.strip()
        if notes == url or notes == "":
            notes = "no notes provided"
        return url,notes
    process_list_with_progress_bar(
        items=enumerate(open("input.txt","r",encoding="utf-8").readlines()),
        outdir="outputs/1-read",
        func=process_item,
        callback=dump,
        error_callback=dump_err,
    )
input_phase()

100%|██████████| 129/129 [00:00<00:00, 2434.01it/s]


In [82]:
def process_indir_with_progress_bar(indir, outdir, func, callback, error_callback):
    from pathlib import Path
    indir = Path(indir)
    assert indir.exists()
    items = list(indir.glob("*"))
    items = [item for item in items if not item.name.endswith(".error.json")]
    items = [(int(item.name.split(".",1)[0]),item) for item in items]
    items = sorted(items, key=lambda x: x[0])
    import json
    items = [(i,json.load(open(item,"r",encoding="utf-8"))["result"]) for i,item in items]

    outdir = Path(outdir)
    if outdir.exists():
        done = [x.name for x in outdir.glob("*.json")]
        done_ids = [int(x.split(".",1)[0]) for x in done]
        items = [(i,item) for i,item in items if not i in done_ids]
        print(f"Skipping {len(done)} items, processing {len(items)} items")
    return process_list_with_progress_bar(items, outdir, func, callback, error_callback)

In [80]:
def fetch_phase():
    def process_item(i,item):
        url, notes = item
        import requests
        res = requests.get(url)
        html = res.text
        from time import sleep
        sleep(0.1)

        if res.status_code != 200:
            raise Exception(f"HTTP status code {res.status_code}")
        return (url, notes, html)
    
    process_indir_with_progress_bar(
        indir="outputs/1-read/",
        outdir="outputs/2-fetch/",
        func=process_item,
        callback=dump,
        error_callback=dump_err,
    )
fetch_phase()

Skipping 60 items, processing 69 items


100%|██████████| 69/69 [02:48<00:00,  2.44s/it]


In [92]:
def innertext_phase():
    def process_item(i, item):
        url, notes, page = item
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(page, 'html.parser')
        
        # Remove undesired tags
        for tag in soup.find_all(class_=['header', 'footer', 'sidebar']):
            tag.decompose()

        # Get title
        title = soup.title.string if soup.title else ""
        
        # Get text from <main> tag
        main = soup.find('main')
        if main:
            main_text = main.get_text()
        else:
            main_text = soup.get_text()
        
        # Consolidate texts
        text = f"Title: {title}\n\nMain: {main_text}"

        assert len(text) > 100, f"Text too short ({len(text)}): {text}"
        
        return (url, notes, text)
    
    process_indir_with_progress_bar(
        indir="outputs/2-fetch/",
        outdir="outputs/3-innertext/",
        func=process_item,
        callback=dump,
        error_callback=dump_err,
    )
innertext_phase()

100%|██████████| 112/112 [00:06<00:00, 17.90it/s]


In [None]:
def summarize_text(text, num_sentences=5):
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return " ".join(str(sentence) for sentence in summary)
summarize_text("""The design of experiments (DOE or DOX), also known as experiment design or experimental design, is the design of any task that aims to describe and explain the variation of information under conditions that are hypothesized to reflect the variation. The term is generally associated with experiments in which the design introduces conditions that directly affect the variation, but may also refer to the design of quasi-experiments, in which natural conditions that influence the variation are selected for observation.

In its simplest form, an experiment aims at predicting the outcome by introducing a change of the preconditions, which is represented by one or more independent variables, also referred to as "input variables" or "predictor variables." The change in one or more independent variables is generally hypothesized to result in a change in one or more dependent variables, also referred to as "output variables" or "response variables." The experimental design may also identify control variables that must be held constant to prevent external factors from affecting the results. Experimental design involves not only the selection of suitable independent, dependent, and control variables, but planning the delivery of the experiment under statistically optimal conditions given the constraints of available resources. There are multiple approaches for determining the set of design points (unique combinations of the settings of the independent variables) to be used in the experiment.

Main concerns in experimental design include the establishment of validity, reliability, and replicability. For example, these concerns can be partially addressed by carefully choosing the independent variable, reducing the risk of measurement error, and ensuring that the documentation of the method is sufficiently detailed. Related concerns include achieving appropriate levels of statistical power and sensitivity.

Correctly designed experiments advance knowledge in the natural and social sciences and engineering, with design of experiments methodology recognised as a key tool in the successful implementation of a Quality by Design (QbD) framework.[1] Other applications include marketing and policy making. The study of the design of experiments is an important topic in metascience."""
               ,num_sentences=4)

'In its simplest form, an experiment aims at predicting the outcome by introducing a change of the preconditions, which is represented by one or more independent variables, also referred to as "input variables" or "predictor variables." Main concerns in experimental design include the establishment of validity, reliability, and replicability. Related concerns include achieving appropriate levels of statistical power and sensitivity. Correctly designed experiments advance knowledge in the natural and social sciences and engineering, with design of experiments methodology recognised as a key tool in the successful implementation of a Quality by Design (QbD) framework.'

In [None]:
def summarize_phase():
    def process_item(i,item):
        url,notes,text = item
        summarized_text = summarize_text(text, num_sentences=5)
        return (url, notes, summarized_text)
    
    process_indir_with_progress_bar(
        indir="outputs/3-innertext/",
        outdir="outputs/4-summary/",
        func=process_item,
        callback=dump,
        error_callback=dump_err,
    )
summarize_phase()

  warn(message % (words_count, sentences_count))
100%|██████████| 112/112 [00:49<00:00,  2.25it/s]


In [47]:
def tagging_phase():
    def process_item(i,item):
        url, notes, summary=item
        import requests
        json_body = {
            "prompt": f"""
### Instruction:
Provide a list of hashtags for the following.
Output should be parse-able with python's ast.literal_eval() and nothing else.

{summary}

### Response:
['
            """.strip(),
            "max_new_tokens": 50
        }

        # Forward request
        response = requests.post(
            "http://127.0.0.1:5000/api/v1/generate",
            headers={"Content-Type": "application/json"},
            json=json_body
        )
        response = response.json()["results"][0]["text"]
        return (url, notes, response)
    
    process_indir_with_progress_bar(
        indir="outputs/4-summary/",
        outdir="outputs/5-tagging/",
        func=process_item,
        callback=dump,
        error_callback=dump_err,
    )
innertext_phase()

100%|██████████| 127/127 [01:52<00:00,  1.13it/s]
