### Objective: 
1. Create folders based on the unique rating values, and create empty files inside the folders with filename = article_pageid.
2. Download the articles from Wikipedia and save each one as a file, using article_pageid as the page ID to look up in the Wikipedia API.

#### RUNNING IMPORTS

In [None]:
import pandas as pd
import os
import threading
import wikipedia
import queue
import pickle

### EDIT THIS TAB TO CONFIGURE NOTEBOOK SETTINGS

In [None]:
# ---- CONFIG: output location and input table ----
# Root directory under which the per-group folders are created.
BASE_FOLDER = "TEST"
# Tab-separated input file that is loaded into the DataFrame.
TSV_FILE = "test_set.tsv"

# ---- CONFIG: column selection ----
# Column whose values become the folder names.
FOLDER_BY = "rating"
# Column holding the page ids that are looked up on Wikipedia.
FILE_BY = "article_pageid"
# Extension appended to every file that is written.
FILE_FORMAT = ".txt"

# ---- CONFIG: concurrency ----
# Number of worker threads started for each download run.
THREAD_COUNT = 20
# ---- END CONFIG ----

##### READING THE TSV FILE AND CREATING INFERENCES

In [None]:
# Load the tab-separated table (``sep`` is the canonical name of ``delimiter``).
df = pd.read_csv(TSV_FILE, sep="\t")
# Not the last expression of the cell, so the preview is evaluated but not shown.
df.head()

file_column = df[FILE_BY]
folder_column = df[FOLDER_BY]

# page id -> folder value; used to decide where each downloaded page is saved.
file_dict = dict(zip(file_column, folder_column))
# folder value -> a page id (later rows overwrite earlier ones); its keys are
# the distinct folder names.
folder_dict = dict(zip(folder_column, file_column))
# De-duplicated page ids as a plain Python list: the download work items.
urllist = file_column.unique().tolist()

#### CREATING FOLDERS

In [None]:
# Create the output root. ``makedirs(..., exist_ok=True)`` replaces the
# check-then-create pair: it cannot race with another process creating the
# directory, and it also works when BASE_FOLDER is a nested path whose parent
# directories do not exist yet.
os.makedirs(BASE_FOLDER, exist_ok=True)

# One sub-folder per distinct FOLDER_BY value (the keys of folder_dict).
for folder in folder_dict:
    folder_path = os.path.join(BASE_FOLDER, folder)
    os.makedirs(folder_path, exist_ok=True)
    print(f"folder created: {folder_path}")


### MAIN THREADED WORKER CLASS AND RUNNER FUNCTION

In [None]:
# Page ids whose download failed during the current run; every Worker appends
# to it and RunWorkers rebinds it to a fresh list at the start of each run.
ERROR_LIST = []


class Worker(threading.Thread):
    """Thread that downloads the Wikipedia pages whose ids it takes from a queue.

    Each queue item is a page id. The page is fetched with
    ``wikipedia.page(pageid=...)`` and its content is written, UTF-8 encoded,
    to ``BASE_FOLDER/<file_dict[page id]>/<sanitized title><FILE_FORMAT>``.
    A page id that fails for any reason is appended to the module-level
    ``ERROR_LIST``. The thread exits once the queue has stayed empty for
    ``QUEUE_TIMEOUT`` seconds.
    """

    # Non-alphanumeric characters allowed to survive in a generated file name.
    KEEP_CHARACTERS = (".", "_")
    # Seconds to wait for a new work item before the thread gives up and exits.
    QUEUE_TIMEOUT = 3

    def __init__(self, q, *args, **kwargs):
        """Store the work queue *q*; other arguments go to ``threading.Thread``."""
        self.q = q
        super().__init__(*args, **kwargs)

    def run(self):
        """Process queue items until the queue stays empty for ``QUEUE_TIMEOUT``."""
        while True:
            # Fetch on its own so the handler below always has a bound ``work``
            # and only download failures are recorded as errors.
            try:
                work = self.q.get(timeout=self.QUEUE_TIMEOUT)
            except queue.Empty:
                return

            try:
                self._download(work)
            except Exception as e:
                # Best effort: report, remember the id for a retry run, move on.
                print(f"{self.name} error", e)
                ERROR_LIST.append(work)
            finally:
                # Exactly one task_done() per successful get(), so q.join()
                # in the runner can return.
                self.q.task_done()

    def _download(self, work):
        """Fetch the page with id *work* and write its content to disk."""
        print(f"{self.name} working on {work} with {self.q.qsize()} items left")
        page = wikipedia.page(pageid=work)

        # Keep only characters that are safe in a file name.
        title = "".join(
            c for c in page.title if c.isalnum() or c in self.KEEP_CHARACTERS
        )
        if not title:
            # A title made only of punctuation/whitespace would produce a bare
            # FILE_FORMAT file name shared by every such page; use the page id.
            title = str(work)

        path = os.path.join(BASE_FOLDER, file_dict[work], f"{title}{FILE_FORMAT}")
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(path, "wb") as f:
            f.write(page.content.encode("utf-8"))
        print(f"{path} written")

def RunWorkers(urllist):
    """Download every page id in *urllist* with a pool of ``Worker`` threads.

    Resets the module-level ``ERROR_LIST``, queues all items, starts
    ``THREAD_COUNT`` workers and blocks until every queued item has been
    marked done. The ids that failed are then pickled to ``ERROR_LIST.p`` in
    the current working directory, overwriting any previous file.

    Args:
        urllist: Iterable of page ids to download.
    """
    global ERROR_LIST
    # Rebind to a new list instead of clearing in place, so the loop below
    # still sees its items if the caller passed the previous ERROR_LIST object.
    ERROR_LIST = []

    q = queue.Queue()
    for work in urllist:
        q.put_nowait(work)

    for _ in range(THREAD_COUNT):
        Worker(q).start()

    # Returns once task_done() has been called for every queued item.
    q.join()
    print(f"{len(ERROR_LIST)} errors")

    # Context manager so the file is flushed and closed deterministically.
    with open("ERROR_LIST.p", "wb") as error_file:
        pickle.dump(ERROR_LIST, error_file)

## RUN THIS FOR FIRST RUN / DOWNLOAD FROM TSV

In [None]:
# Download every unique page id collected in urllist; the ids that fail are
# pickled to ERROR_LIST.p by RunWorkers.
RunWorkers(urllist)

## RUN THIS FOR ERROR CORRECTION FROM PREVIOUS RUN

In [None]:
# Reload the page ids that failed in the previous run and try them again.
# NOTE: unpickling can execute arbitrary code; only load an ERROR_LIST.p that
# was written by this notebook.
with open("ERROR_LIST.p", "rb") as error_file:
    ERRORS = pickle.load(error_file)
print(f"Errors: {len(ERRORS)}")

# RunWorkers overwrites ERROR_LIST.p with whatever still fails in this run.
RunWorkers(ERRORS)