# Fasttext Data Pre-Processing

In [1]:
# read and pre-process data
import pandas as pd
import string
import nltk
from nltk.stem import WordNetLemmatizer # word lemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords # stopwords

# format data
from sklearn.model_selection import train_test_split # train test split
import csv

In [2]:
def pre_processing(text, lemmatize=True, stem=False):
    '''
    Accepts a text and returns it cleaned up as one space-separated string.

    Steps, in order: transliterate accented characters to ASCII, lowercase,
    replace punctuation and newlines with spaces, tokenize with NLTK,
    optionally lemmatize and/or stem each token, then drop English stop words
    (the negations "no", "not" and "nor" are kept).

    text: the raw string to clean
    lemmatize: when True, lemmatize each token with WordNet using pos="v"
    stem: when True, stem each token with the English Snowball stemmer
    '''
    import unicodedata  # local import keeps this function self-contained

    # strip accents: decompose accented characters first so that the ASCII
    # encode below drops only the combining mark and keeps the base letter;
    # characters with no ASCII decomposition are still discarded
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    # convert to lowercase
    text = text.lower()

    # replace every punctuation character with a space
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # replace newlines with a space so that the last word of one line is not
    # glued to the first word of the next
    text = text.replace("\n", " ")

    # tokenize
    text_words = nltk.word_tokenize(text)

    # lemmatize
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        text_words = [wordnet_lemmatizer.lemmatize(x, pos="v") for x in text_words]

    # stem
    if stem:
        stemmer = SnowballStemmer("english")
        text_words = [stemmer.stem(x) for x in text_words]

    # remove stop words, keeping the negations
    keep_stopwords = {"no", "not", "nor"}
    stop = set(stopwords.words('english')) - keep_stopwords
    text_words = [x for x in text_words if x not in stop]

    return ' '.join(text_words)

In [5]:
def format_data(texts, labels):
    '''
    Accepts a series of texts and labels and outputs the formatted data for fasttext model

    Each output row is a list whose first element is "__label__<label>" and
    whose remaining elements are the NLTK tokens of the matching text.

    texts: iterable of strings
    labels: iterable of labels, paired with texts by position
    returns: a pandas Series with one list per text
    raises: IndexError when there are fewer labels than texts
    '''
    # materialise both inputs once so pairing them is linear in the row count
    texts = list(texts)
    labels = list(labels)

    # every text needs a label; surplus labels are ignored
    if len(labels) < len(texts):
        raise IndexError("labels has fewer entries than texts")

    formatted_data = [
        ["__label__" + str(label)] + list(nltk.word_tokenize(text))
        for text, label in zip(texts, labels)
    ]

    return pd.Series(formatted_data)

In [13]:
def save_train_data(text, label, filename):
    '''
    Accepts texts, labels and a filename and saves the data into a .txt file

    The rows come from format_data and are written one per line, with the
    label and tokens separated by single spaces.

    text: series of texts
    label: series of labels, paired with text by position
    filename: path of the file to write (overwritten if it exists)
    '''
    formatted_data = format_data(text, label)

    # save data
    # newline="" leaves line endings to the csv writer, so every platform
    # gets the "\n" terminator requested below; the encoding is pinned so the
    # output does not depend on the locale default
    with open(filename, "w", newline="", encoding="utf-8") as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(formatted_data)

def save_test_data(text, label, filename):
    '''
    Accepts a series of texts, a series of labels and a filename and saves
    them side by side into a .csv file for reading later

    The saved file has the two columns "text" and "label" and no index column.
    '''
    # put the two series next to each other and discard the original index
    combined = pd.concat([text, label], axis=1).reset_index(drop=True)

    # standardise the column names regardless of what the inputs were called
    combined.columns = ["text", "label"]

    combined.to_csv(filename, index=False)

In [15]:
# base names of the datasets, one per source
train_filenames = ['all_train', 'news_train', 'reddit_train', 'twitter_train']
test_filenames = ['all_test', 'news_test', 'reddit_test', 'twitter_test']

# input csv files
train_datasets = [f'data/{name}.csv' for name in train_filenames]
test_datasets = [f'data/{name}.csv' for name in test_filenames]

# output .txt files for training, one list per pre-processing variant
save_train_filenames = [f'data/fasttext/normal/{name}.txt' for name in train_filenames]
save_train_filenames_lemmatize = [f'data/fasttext/lemmatize/{name}_lemmatize.txt' for name in train_filenames]
save_train_filenames_stem = [f'data/fasttext/stem/{name}_stem.txt' for name in train_filenames]

# output .csv files for testing, one list per pre-processing variant
save_test_filenames = [f'data/fasttext/normal/{name}.csv' for name in test_filenames]
save_test_filenames_lemmatize = [f'data/fasttext/lemmatize/{name}_lemmatize.csv' for name in test_filenames]
save_test_filenames_stem = [f'data/fasttext/stem/{name}_stem.csv' for name in test_filenames]

In [19]:
# generate train datasets
# For every input csv, write three fastText training files: one with plain
# cleaning, one lemmatized and one stemmed.
import os  # only needed to create the output directories below

for dataset_path, normal_path, lemmatize_path, stem_path in zip(
        train_datasets,
        save_train_filenames,
        save_train_filenames_lemmatize,
        save_train_filenames_stem):
    # load data
    train_df = pd.read_csv(dataset_path, header=0)

    # process text
    # the copies are taken before train_df["text"] is overwritten, so each
    # variant is computed from the raw text
    train_df_lemmatize, train_df_stem = train_df.copy(deep=True), train_df.copy(deep=True)
    train_df_lemmatize["text"] = train_df_lemmatize["text"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))
    train_df_stem["text"] = train_df_stem["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))
    train_df["text"] = train_df["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=False))

    # save data
    for frame, out_path in (
            (train_df, normal_path),
            (train_df_lemmatize, lemmatize_path),
            (train_df_stem, stem_path)):
        # make sure the target directory exists so the write cannot fail on a
        # fresh checkout
        os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
        save_train_data(frame["text"], frame["label"], out_path)

In [18]:
# generate test datasets
# For every input csv, write three test csv files: one with plain cleaning,
# one lemmatized and one stemmed.
import os  # only needed to create the output directories below

for dataset_path, normal_path, lemmatize_path, stem_path in zip(
        test_datasets,
        save_test_filenames,
        save_test_filenames_lemmatize,
        save_test_filenames_stem):
    # load data
    test_df = pd.read_csv(dataset_path, header=0)

    # process text
    # the copies are taken before test_df["text"] is overwritten, so each
    # variant is computed from the raw text
    test_df_lemmatize, test_df_stem = test_df.copy(deep=True), test_df.copy(deep=True)
    test_df_lemmatize["text"] = test_df_lemmatize["text"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))
    test_df_stem["text"] = test_df_stem["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))
    test_df["text"] = test_df["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=False))

    # save data
    for frame, out_path in (
            (test_df, normal_path),
            (test_df_lemmatize, lemmatize_path),
            (test_df_stem, stem_path)):
        # make sure the target directory exists so the write cannot fail on a
        # fresh checkout
        os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
        save_test_data(frame["text"], frame["label"], out_path)