# Old

## Preprocess data

### Preprocess from .csv into .txt

In [2]:
import enchant
# Inspect the local Enchant installation before relying on a dictionary.
broker = enchant.Broker()
# Not the last expression of the cell, so its return value is not displayed.
broker.describe()
# Last expression: the list of available language tags is shown as the cell output.
broker.list_languages()

['en_US', 'en', 'en_AU', 'en_CA', 'en_GB']

In [3]:
import re

import enchant
import nltk

class TextCleaner:
    """Chainable text-cleaning steps for Dutch text.

    Every step reads and rewrites ``self.text`` and returns ``self`` so steps
    can be chained. ``preprocess`` and ``clean`` are the two ready-made
    pipelines; both take the raw text and return the cleaned string.

    Attributes:
        d: ``enchant.Dict("nl_NL")`` when that dictionary is installed,
            otherwise ``None`` (``remove_non_words`` is then skipped).
        stopword_list: NLTK's Dutch stopword list.
        STOPWORDS: ``stopword_list`` as a set, for fast membership tests.
        text: the text currently being processed.
    """

    def __init__(self):
        # The Dutch dictionary is optional: only load it when Enchant can
        # provide it, so that building a cleaner never fails on machines
        # that have no "nl_NL" dictionary installed.
        self.d = enchant.Dict("nl_NL") if enchant.dict_exists("nl_NL") else None
        self.stopword_list = nltk.corpus.stopwords.words("dutch")
        self.STOPWORDS = set(self.stopword_list)
        self.text = ""

    def get_words(self):
        """Tokenize with NLTK and re-join the tokens with single spaces."""
        self.text = " ".join(nltk.word_tokenize(self.text))
        return self

    def lower(self):
        """Transform to lower case."""
        self.text = self.text.lower()
        return self

    def remove_stopwords(self):
        """Remove the stopwords.

        The text is compared word by word (split on whitespace). Comparing
        character by character would never drop a real stopword and would
        delete every occurrence of a single-letter stopword inside other
        words.
        """
        self.text = " ".join(
            word for word in self.text.split() if word not in self.STOPWORDS
        )
        return self

    def remove_numeric(self):
        """Remove every digit character."""
        self.text = "".join(char for char in self.text if not char.isdigit())
        return self

    def remove_non_ascii(self):
        """Replace each non-ASCII character with a space."""
        self.text = re.sub(r"[^\x00-\x7f]", " ", self.text)
        return self

    def remove_extra_whitespace_tabs(self):
        """Collapse whitespace runs to one space and trim both ends."""
        self.text = re.sub(r"\s+", " ", self.text).strip()
        return self

    def remove_one_char(self):
        """Drop words that are only one character long."""
        self.text = " ".join(word for word in self.text.split() if len(word) > 1)
        return self

    def remove_non_words(self):
        """Keep only the words accepted by the spell-check dictionary.

        When no dictionary could be loaded (``self.d is None``) the text is
        left unchanged and a warning is emitted instead of raising.
        """
        if self.d is None:
            import warnings

            warnings.warn(
                "Enchant dictionary 'nl_NL' is not available; "
                "remove_non_words leaves the text unchanged."
            )
            return self
        self.text = " ".join(
            word for word in str(self.text).split() if self.d.check(word)
        )
        return self

    def keep_standard_chars(self):
        """Strip everything except word characters, digits and ``-,. ?!()%/``."""
        self.text = re.sub(r"[^-0-9\w,. ?!()%/]", "", self.text)
        return self

    def preprocess(self, text):
        """Run the aggressive pipeline and return the resulting string.

        Steps: tokenize, lower-case, drop stopwords, drop digits, normalize
        whitespace, drop one-character words, drop non-dictionary words.
        """
        self.text = text
        (
            self.get_words()
            .lower()
            .remove_stopwords()
            .remove_numeric()
            .remove_extra_whitespace_tabs()
            .remove_one_char()
            .remove_non_words()
        )
        return self.text

    def clean(self, text):
        """Run the light pipeline and return the resulting string.

        Steps: tokenize, keep only standard characters, normalize whitespace.
        """
        self.text = text
        self.get_words().keep_standard_chars().remove_extra_whitespace_tabs()
        return self.text

In [4]:
from tqdm.notebook import tqdm, tqdm_notebook
from loguru import logger

def process_selected_articles(
    path, output_dir="/home/leonardovida/data-histaware/raw/raw_merged"
):
    """Convert one merged-articles CSV into a text file of cleaned paragraphs.

    Rows whose ``subject`` is ``"artikel"`` are kept, their ``p`` value is
    split into paragraphs, each paragraph is cleaned with
    ``TextCleaner.clean`` and the non-empty results are written one per line
    to ``<output_dir>/<csv basename>.txt``.

    Args:
        path: Path to the input CSV; it must have ``subject`` and ``p`` columns.
        output_dir: Directory that receives the ``.txt`` file. Defaults to the
            location that was previously hard-coded.

    Returns:
        None. The result is written to disk and progress is logged.
    """
    # Register `progress_apply` on pandas objects. Calling the classmethod on
    # the class avoids instantiating (and leaving behind) an idle progress bar.
    tqdm.pandas()
    tc = TextCleaner()

    # Load merged articles for selected topic in nlp_pipeline
    df = pd.read_csv(path).reset_index(drop=True)

    # Split p into original paragraphs
    logger.debug(f"Articles before selecting 'articles': {df.shape[0]}")
    # .copy() makes the selection an independent frame, so the column
    # assignment below does not raise SettingWithCopyWarning.
    df = df[df["subject"] == "artikel"].copy()
    # Only column "p" is needed, so map over it instead of a row-wise apply.
    df["p"] = df["p"].map(lambda p: repr(p).split("\\',"))
    logger.debug(f"Articles after selecting 'articles': {df.shape[0]}")
    df = df.explode("p")
    logger.debug(f"Articles after splitting into paragraphs: {df.shape[0]}")

    # Preprocess p to cleaner p for Tokenizer and transformers
    res = df["p"].progress_apply(tc.clean)

    # Eliminate paragraphs that do not contain anything. `clean` returns an
    # empty string (never NaN) for such paragraphs, so filter those as well.
    res = res.dropna()
    res = res[res != ""]

    # Save to .txt. Overwrite rather than append, so that re-running the
    # conversion does not duplicate every paragraph in the output file.
    name = os.path.splitext(os.path.basename(path))[0]
    out_path = os.path.join(output_dir, f"{name}.txt")
    res.to_csv(out_path, header=False, index=False, sep=" ", mode="w")

    logger.debug(f"Completed: {path}")

### Convert from _.csv_ to _.txt_ (run this only once)

In [6]:
# Collect the CSV files found under PATH_RAW_FILES as plain strings and skip
# the first two entries of the listing.
# NOTE(review): glob order is not sorted (see the output below), so which two
# files are skipped may differ between machines/runs — confirm the intent.
paths = list(map(str, Path(PATH_RAW_FILES).glob("*.csv")))[2:]
paths

['/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_40.csv',
 '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_60.csv',
 '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_80.csv',
 '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_120.csv',
 '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_100.csv']

In [None]:
# Write one cleaned .txt file per selected CSV. This outer bar counts files;
# process_selected_articles shows its own bar while cleaning paragraphs.
for path in tqdm(paths, total=len(paths)):
    process_selected_articles(path)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

2021-04-10 14:31:52.863 | DEBUG    | __main__:process_selected_articles:23 - Articles before selecting 'articles': 957008
2021-04-10 14:32:03.730 | DEBUG    | __main__:process_selected_articles:28 - Articles after selecting 'articles': 567544
2021-04-10 14:32:07.279 | DEBUG    | __main__:process_selected_articles:32 - Articles after splitting into paragraphs: 1225204


HBox(children=(FloatProgress(value=0.0, max=1225204.0), HTML(value='')))

### Load all .txt files into a single DataFrame (TODO: to be changed)

In [None]:
# Load every generated .txt file (one paragraph per line) into one DataFrame.
# Sorted so the row order does not depend on the file-system listing order.
text_files = sorted(str(x) for x in Path(PATH_RAW_FILES).glob("*.txt"))

# Read each file that was actually found (previously the same hard-coded
# file was read on every iteration) and concatenate once at the end instead
# of growing the frame inside the loop.
frames = []
for file in text_files:
    frames.append(pd.read_csv(file, delimiter="\t", header=None))

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
# Number of rows in the combined frame
len(df)