# Fasttext Data Pre-Processing

In [1]:
# standard library
import csv
import string
import unicodedata

# read and pre-process data
import nltk
import pandas as pd
from nltk.corpus import stopwords # stopwords
from nltk.stem import WordNetLemmatizer # word lemmatizer
from nltk.stem.snowball import SnowballStemmer

# format data
from sklearn.model_selection import train_test_split # train test split

## Read and Clean Text

In [2]:
def pre_processing(text, lemmatize=True, stem=False):
    '''
    Cleans a raw text and returns its surviving tokens joined by single spaces.

    Steps, in order: accent folding to ASCII, lower-casing, replacement of
    punctuation and line breaks by spaces, tokenization, optional
    lemmatization (as verbs), optional Snowball stemming, and removal of
    English stop words except the negations "no", "not" and "nor".

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None, or the float NaN that
        stands for a missing cell) is treated as empty text.
    lemmatize : bool, default True
        Lemmatize every token with WordNet using the verb part of speech.
    stem : bool, default False
        Stem every token with the English Snowball stemmer.

    Returns
    -------
    str
        The processed tokens separated by single spaces ('' if nothing is left).
    '''
    # a missing value has no .encode(); treat it as an empty document
    if not isinstance(text, str):
        return ''

    # strip accents: decompose accented characters first so that the ASCII
    # encoding below drops only the combining marks ("café" -> "cafe")
    # instead of the whole letter
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    # convert to lowercase
    text = text.lower()

    # remove punctuation (each punctuation character becomes a space)
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))

    # line breaks become spaces so the words on either side stay separate tokens
    text = text.replace("\n", " ")

    # tokenize
    text_words = nltk.word_tokenize(text)

    # lemmatize
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        text_words = [wordnet_lemmatizer.lemmatize(x, pos="v") for x in text_words]

    # stem
    if stem:
        stemmer = SnowballStemmer("english")
        text_words = [stemmer.stem(x) for x in text_words]

    # remove stop words, but keep negations
    # NOTE(review): filtering runs after lemmatizing/stemming, so a token whose
    # stemmed spelling no longer matches the stop-word list would be kept —
    # confirm this ordering is intended.
    keep_stopwords = {"no", "not", "nor"}
    stop = set(stopwords.words('english')) - keep_stopwords
    text_words = [x for x in text_words if x not in stop]

    return ' '.join(text_words)

### Conventional and Cryptonews Data

In [3]:
# load data
sample_crypto = pd.read_csv("data/sample_crypto.csv", header=0)

# combine title and excerpt (a missing part contributes an empty string)
sample_crypto["text"] = sample_crypto["title"].fillna('') + " " + sample_crypto["excerpt"].fillna('')

# sample text
sample_crypto_text = sample_crypto[["title", "text", "label"]]
sample_crypto_excerpt = sample_crypto[["excerpt", "label"]]
sample_crypto_excerpt = sample_crypto_excerpt.dropna(subset=["excerpt"])

# sample with lemmatized words
# "title" is taken as loaded from the CSV, so missing titles are blanked before cleaning
sample_crypto_lemmatize_text = sample_crypto_text.copy()
sample_crypto_lemmatize_text["title"] = sample_crypto_lemmatize_text["title"].fillna('').apply(lambda x: pre_processing(x, lemmatize=True, stem=False))
sample_crypto_lemmatize_text["text"] = sample_crypto_lemmatize_text["text"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))
sample_crypto_lemmatize_excerpt = sample_crypto_excerpt.copy()
sample_crypto_lemmatize_excerpt["excerpt"] = sample_crypto_lemmatize_excerpt["excerpt"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))

# sample with stemmed words
# every column of the stemmed variant uses lemmatize=False, stem=True
sample_crypto_stem_text = sample_crypto_text.copy()
sample_crypto_stem_text["title"] = sample_crypto_stem_text["title"].fillna('').apply(lambda x: pre_processing(x, lemmatize=False, stem=True))
sample_crypto_stem_text["text"] = sample_crypto_stem_text["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))
sample_crypto_stem_excerpt = sample_crypto_excerpt.copy()
sample_crypto_stem_excerpt["excerpt"] = sample_crypto_stem_excerpt["excerpt"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))

# separate lemmatized X and y
X_crypto_lemmatize_text = sample_crypto_lemmatize_text[["title", "text"]]
y_crypto_lemmatize_text = sample_crypto_lemmatize_text["label"]
X_crypto_lemmatize_excerpt = sample_crypto_lemmatize_excerpt["excerpt"]
y_crypto_lemmatize_excerpt = sample_crypto_lemmatize_excerpt["label"]

# separate stemmed X and y
X_crypto_stem_text = sample_crypto_stem_text[["title", "text"]]
y_crypto_stem_text = sample_crypto_stem_text["label"]
X_crypto_stem_excerpt = sample_crypto_stem_excerpt["excerpt"]
y_crypto_stem_excerpt = sample_crypto_stem_excerpt["label"]

### Reddit Data

In [4]:
# keep only the columns used below
sample_reddit = pd.read_csv("data/sample_reddit.csv", header=0)[["title", "excerpt", "label"]]

# one "text" field per post: title, a space, then the excerpt; a missing
# title or excerpt contributes an empty string
sample_reddit["text"] = sample_reddit["title"].fillna('') + " " + sample_reddit["excerpt"].fillna('')

# lemmatized variant: a new frame whose "text" column holds the cleaned text
sample_reddit_lemmatize = sample_reddit.assign(
    text=sample_reddit["text"].apply(pre_processing, lemmatize=True, stem=False)
)

# stemmed variant
sample_reddit_stem = sample_reddit.assign(
    text=sample_reddit["text"].apply(pre_processing, lemmatize=False, stem=True)
)

# features / target for the lemmatized variant
X_reddit_lemmatize, y_reddit_lemmatize = (
    sample_reddit_lemmatize["text"],
    sample_reddit_lemmatize["label"],
)

# features / target for the stemmed variant
X_reddit_stem, y_reddit_stem = (
    sample_reddit_stem["text"],
    sample_reddit_stem["label"],
)

### Twitter Data

In [5]:
sample_twitter = pd.read_csv("data/sample_twitter.csv", header=0)[["text", "label"]]

# turn line breaks into spaces so the words on either side of a break are not
# merged into one token; a missing tweet becomes an empty string instead of
# raising on a non-string value
sample_twitter["text"] = sample_twitter["text"].fillna('').str.replace("\n", " ", regex=False)

# sample with lemmatized words
sample_twitter_lemmatize = sample_twitter.copy()
sample_twitter_lemmatize["text"] = sample_twitter_lemmatize["text"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))

# sample with stemmed words
sample_twitter_stem = sample_twitter.copy()
sample_twitter_stem["text"] = sample_twitter_stem["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))

# separate lemmatized X and y
X_twitter_lemmatize = sample_twitter_lemmatize["text"]
y_twitter_lemmatize = sample_twitter_lemmatize["label"]

# separate stemmed X and y
X_twitter_stem = sample_twitter_stem["text"]
y_twitter_stem = sample_twitter_stem["label"]

### Combined Data

In [6]:
# social media
# combine lemmatized data
# ignore_index=True gives the combined frame a fresh 0..n-1 index; the reddit
# and twitter frames each carry their own row labels from loading, which would
# otherwise repeat and make any later label-based alignment ambiguous
sample_socialmedia_lemmatize = pd.concat([sample_reddit_lemmatize[["text", "label"]], sample_twitter_lemmatize], ignore_index=True)

# combine stemmed data
sample_socialmedia_stem = pd.concat([sample_reddit_stem[["text", "label"]], sample_twitter_stem], ignore_index=True)

# separate lemmatized X and y
X_socialmedia_lemmatize = sample_socialmedia_lemmatize["text"]
y_socialmedia_lemmatize = sample_socialmedia_lemmatize["label"]

# separate stemmed X and y
X_socialmedia_stem = sample_socialmedia_stem["text"]
y_socialmedia_stem = sample_socialmedia_stem["label"]

In [7]:
# all
# combine lemmatized data
# ignore_index=True gives the combined frame a fresh 0..n-1 index; each source
# frame carries its own row labels from loading, which would otherwise repeat
# and make any later label-based alignment ambiguous
sample_all_lemmatize = pd.concat([sample_crypto_lemmatize_text[["text", "label"]], sample_reddit_lemmatize[["text", "label"]], sample_twitter_lemmatize], ignore_index=True)

# combine stemmed data
sample_all_stem = pd.concat([sample_crypto_stem_text[["text", "label"]], sample_reddit_stem[["text", "label"]], sample_twitter_stem], ignore_index=True)

# separate lemmatized X and y
X_all_lemmatize = sample_all_lemmatize["text"]
y_all_lemmatize = sample_all_lemmatize["label"]

# separate stemmed X and y
X_all_stem = sample_all_stem["text"]
y_all_stem = sample_all_stem["label"]

## Train-Test Split
### Conventional and Cryptonews Data

In [8]:
# lemmatized title/text: a single 80/20 split of the two-column frame, so the
# title and text series share the same rows and the same labels
(X_crypto_lemmatize_text_train_init, X_crypto_lemmatize_text_test_init,
 y_crypto_lemmatize_text_train, y_crypto_lemmatize_text_test) = train_test_split(
    X_crypto_lemmatize_text, y_crypto_lemmatize_text, test_size=0.2, random_state=123
)
X_crypto_lemmatize_title_train, X_crypto_lemmatize_text_train = (
    X_crypto_lemmatize_text_train_init["title"],
    X_crypto_lemmatize_text_train_init["text"],
)
X_crypto_lemmatize_title_test, X_crypto_lemmatize_text_test = (
    X_crypto_lemmatize_text_test_init["title"],
    X_crypto_lemmatize_text_test_init["text"],
)

# lemmatized excerpt: split separately from title/text
(X_crypto_lemmatize_excerpt_train, X_crypto_lemmatize_excerpt_test,
 y_crypto_lemmatize_excerpt_train, y_crypto_lemmatize_excerpt_test) = train_test_split(
    X_crypto_lemmatize_excerpt, y_crypto_lemmatize_excerpt, test_size=0.2, random_state=123
)

# stemmed title/text: same scheme as the lemmatized variant
(X_crypto_stem_text_train_init, X_crypto_stem_text_test_init,
 y_crypto_stem_text_train, y_crypto_stem_text_test) = train_test_split(
    X_crypto_stem_text, y_crypto_stem_text, test_size=0.2, random_state=123
)
X_crypto_stem_title_train, X_crypto_stem_text_train = (
    X_crypto_stem_text_train_init["title"],
    X_crypto_stem_text_train_init["text"],
)
X_crypto_stem_title_test, X_crypto_stem_text_test = (
    X_crypto_stem_text_test_init["title"],
    X_crypto_stem_text_test_init["text"],
)

# stemmed excerpt
(X_crypto_stem_excerpt_train, X_crypto_stem_excerpt_test,
 y_crypto_stem_excerpt_train, y_crypto_stem_excerpt_test) = train_test_split(
    X_crypto_stem_excerpt, y_crypto_stem_excerpt, test_size=0.2, random_state=123
)

### Reddit Data

In [9]:
# lemmatized reddit: hold out 20% of the rows as the test set
(X_reddit_lemmatize_train, X_reddit_lemmatize_test,
 y_reddit_lemmatize_train, y_reddit_lemmatize_test) = train_test_split(
    X_reddit_lemmatize, y_reddit_lemmatize, test_size=0.2, random_state=123
)

# stemmed reddit: same proportion and seed
(X_reddit_stem_train, X_reddit_stem_test,
 y_reddit_stem_train, y_reddit_stem_test) = train_test_split(
    X_reddit_stem, y_reddit_stem, test_size=0.2, random_state=123
)

### Twitter Data

In [10]:
# lemmatized twitter: hold out 20% of the rows as the test set
(X_twitter_lemmatize_train, X_twitter_lemmatize_test,
 y_twitter_lemmatize_train, y_twitter_lemmatize_test) = train_test_split(
    X_twitter_lemmatize, y_twitter_lemmatize, test_size=0.2, random_state=123
)

# stemmed twitter: same proportion and seed
(X_twitter_stem_train, X_twitter_stem_test,
 y_twitter_stem_train, y_twitter_stem_test) = train_test_split(
    X_twitter_stem, y_twitter_stem, test_size=0.2, random_state=123
)

### Combined Data

In [11]:
# social media (reddit + twitter)
# lemmatized: hold out 20% of the rows as the test set
(X_socialmedia_lemmatize_train, X_socialmedia_lemmatize_test,
 y_socialmedia_lemmatize_train, y_socialmedia_lemmatize_test) = train_test_split(
    X_socialmedia_lemmatize, y_socialmedia_lemmatize, test_size=0.2, random_state=123
)

# stemmed: same proportion and seed
(X_socialmedia_stem_train, X_socialmedia_stem_test,
 y_socialmedia_stem_train, y_socialmedia_stem_test) = train_test_split(
    X_socialmedia_stem, y_socialmedia_stem, test_size=0.2, random_state=123
)

In [12]:
# all sources combined
# lemmatized: hold out 20% of the rows as the test set
(X_all_lemmatize_train, X_all_lemmatize_test,
 y_all_lemmatize_train, y_all_lemmatize_test) = train_test_split(
    X_all_lemmatize, y_all_lemmatize, test_size=0.2, random_state=123
)

# stemmed: same proportion and seed
(X_all_stem_train, X_all_stem_test,
 y_all_stem_train, y_all_stem_test) = train_test_split(
    X_all_stem, y_all_stem, test_size=0.2, random_state=123
)

## Formatting and Saving Data (Part 1: Full Train and Test Data)

In [13]:
def format_data(texts, labels):
    '''
    Accepts a series of texts and labels and outputs the formatted data for fasttext model

    Each output row is a list whose first element is "__label__<label>" and
    whose remaining elements are the tokens of the matching text.

    Parameters
    ----------
    texts : iterable of str
        Texts to tokenize, e.g. a pandas Series.
    labels : iterable
        One label per text, in the same order; each is converted with str().

    Returns
    -------
    pandas.Series
        One list of strings per input row, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If texts and labels do not contain the same number of items.
    '''
    # materialize both inputs once and pair rows by position (not by index
    # label); converting inside the loop would rebuild the lists on every row
    texts = list(texts)
    labels = list(labels)
    if len(texts) != len(labels):
        raise ValueError(
            "texts and labels must have the same length, got %d and %d"
            % (len(texts), len(labels))
        )

    formatted_data = [
        ["__label__" + str(label)] + list(nltk.word_tokenize(text))
        for text, label in zip(texts, labels)
    ]

    # dtype=object keeps every row as a Python list, including for empty input
    return pd.Series(formatted_data, dtype=object)

In [14]:
def save_train_data(text_list, label_list, filename_list):
    '''
    Accepts a list of texts, labels and filenames and saves the data into .txt file for each corresponding text, label and filename

    For every position i, text_list[i] and label_list[i] are formatted with
    format_data and written to filename_list[i], one row per line with the
    row's elements separated by single spaces.

    Parameters
    ----------
    text_list : list
        One collection of texts per output file.
    label_list : list
        One collection of labels per output file, aligned with text_list.
    filename_list : list of str
        Output paths; its length decides how many files are written.
    '''
    for i, out_path in enumerate(filename_list):
        # format data
        formatted_data = format_data(text_list[i], label_list[i])

        # save data
        # newline="" lets the csv writer control line endings itself, so the
        # "\n" terminator is written as-is on every platform; the explicit
        # encoding avoids depending on the platform default
        with open(out_path, "w", newline="", encoding="utf-8") as csvoutfile:
            csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
            csv_writer.writerows(formatted_data)

def save_test_data(text_list, label_list, filename_list):
    '''
    Writes one .csv file per entry of filename_list for reading later.

    For position i, text_list[i] and label_list[i] are placed side by side as
    columns, the row index is replaced by a fresh one, and the table is written
    to filename_list[i] without an index column.
    '''
    for position, out_path in enumerate(filename_list):
        columns = [text_list[position], label_list[position]]
        table = pd.concat(columns, axis=1).reset_index(drop=True)
        table.to_csv(out_path, index=False)

In [15]:
# base names shared by the train, test and validation files; each saving cell
# adds its own directory prefix and file extension
filename = [
    # conventional and cryptonews data
    "sample_crypto_lemmatize_title",
    "sample_crypto_lemmatize_excerpt",
    "sample_crypto_lemmatize_text",
    "sample_crypto_stem_title",
    "sample_crypto_stem_excerpt",
    "sample_crypto_stem_text",
    # reddit data
    "sample_reddit_lemmatize",
    "sample_reddit_stem",
    # twitter data
    "sample_twitter_lemmatize",
    "sample_twitter_stem",
    # combined data
    "sample_socialmedia_lemmatize",
    "sample_socialmedia_stem",
    "sample_all_lemmatize",
    "sample_all_stem",
]

In [16]:
# write the full training portion as .txt input for the fasttext model
# texts, one entry per name in `filename` (same order)
train_text_list_full = [
    X_crypto_lemmatize_title_train,
    X_crypto_lemmatize_excerpt_train,
    X_crypto_lemmatize_text_train,
    X_crypto_stem_title_train,
    X_crypto_stem_excerpt_train,
    X_crypto_stem_text_train,
    X_reddit_lemmatize_train,
    X_reddit_stem_train,
    X_twitter_lemmatize_train,
    X_twitter_stem_train,
    X_socialmedia_lemmatize_train,
    X_socialmedia_stem_train,
    X_all_lemmatize_train,
    X_all_stem_train,
]

# labels; title and text come from the same split, so they share a label series
train_label_list_full = [
    y_crypto_lemmatize_text_train,
    y_crypto_lemmatize_excerpt_train,
    y_crypto_lemmatize_text_train,
    y_crypto_stem_text_train,
    y_crypto_stem_excerpt_train,
    y_crypto_stem_text_train,
    y_reddit_lemmatize_train,
    y_reddit_stem_train,
    y_twitter_lemmatize_train,
    y_twitter_stem_train,
    y_socialmedia_lemmatize_train,
    y_socialmedia_stem_train,
    y_all_lemmatize_train,
    y_all_stem_train,
]

# output paths: prefix + base name + postfix
train_filename_prefix = "data/fasttext_unsorted/train_all/"
train_filename_postfix = ".txt"
train_filename_list = [train_filename_prefix + name + train_filename_postfix for name in filename]

# write the files
save_train_data(train_text_list_full, train_label_list_full, train_filename_list)

In [17]:
# write the held-out test portion as .csv
# texts, one entry per name in `filename` (same order)
test_text_list = [
    X_crypto_lemmatize_title_test,
    X_crypto_lemmatize_excerpt_test,
    X_crypto_lemmatize_text_test,
    X_crypto_stem_title_test,
    X_crypto_stem_excerpt_test,
    X_crypto_stem_text_test,
    X_reddit_lemmatize_test,
    X_reddit_stem_test,
    X_twitter_lemmatize_test,
    X_twitter_stem_test,
    X_socialmedia_lemmatize_test,
    X_socialmedia_stem_test,
    X_all_lemmatize_test,
    X_all_stem_test,
]

# labels; title and text come from the same split, so they share a label series
test_label_list = [
    y_crypto_lemmatize_text_test,
    y_crypto_lemmatize_excerpt_test,
    y_crypto_lemmatize_text_test,
    y_crypto_stem_text_test,
    y_crypto_stem_excerpt_test,
    y_crypto_stem_text_test,
    y_reddit_lemmatize_test,
    y_reddit_stem_test,
    y_twitter_lemmatize_test,
    y_twitter_stem_test,
    y_socialmedia_lemmatize_test,
    y_socialmedia_stem_test,
    y_all_lemmatize_test,
    y_all_stem_test,
]

# output paths: prefix + base name + postfix
test_filename_prefix = "data/fasttext_unsorted/test/"
test_filename_postfix = ".csv"
test_filename_list = [test_filename_prefix + name + test_filename_postfix for name in filename]

save_test_data(test_text_list, test_label_list, test_filename_list)

## Train-Validation Split
### Conventional and Cryptonews Data

In [18]:
# This cell rebinds the *_train names to their smaller post-split versions, so
# it is not idempotent: re-run the train-test split cell before running it again.

# lemmatized title/text: a quarter of the training frame becomes validation
(X_crypto_lemmatize_text_train, X_crypto_lemmatize_text_val,
 y_crypto_lemmatize_text_train, y_crypto_lemmatize_text_val) = train_test_split(
    X_crypto_lemmatize_text_train_init, y_crypto_lemmatize_text_train, test_size=0.25, random_state=123
)
# the right-hand sides are evaluated before either name is rebound
X_crypto_lemmatize_title_train, X_crypto_lemmatize_text_train = (
    X_crypto_lemmatize_text_train["title"],
    X_crypto_lemmatize_text_train["text"],
)
X_crypto_lemmatize_title_val, X_crypto_lemmatize_text_val = (
    X_crypto_lemmatize_text_val["title"],
    X_crypto_lemmatize_text_val["text"],
)

# lemmatized excerpt
(X_crypto_lemmatize_excerpt_train, X_crypto_lemmatize_excerpt_val,
 y_crypto_lemmatize_excerpt_train, y_crypto_lemmatize_excerpt_val) = train_test_split(
    X_crypto_lemmatize_excerpt_train, y_crypto_lemmatize_excerpt_train, test_size=0.25, random_state=123
)

# stemmed title/text
(X_crypto_stem_text_train, X_crypto_stem_text_val,
 y_crypto_stem_text_train, y_crypto_stem_text_val) = train_test_split(
    X_crypto_stem_text_train_init, y_crypto_stem_text_train, test_size=0.25, random_state=123
)
X_crypto_stem_title_train, X_crypto_stem_text_train = (
    X_crypto_stem_text_train["title"],
    X_crypto_stem_text_train["text"],
)
X_crypto_stem_title_val, X_crypto_stem_text_val = (
    X_crypto_stem_text_val["title"],
    X_crypto_stem_text_val["text"],
)

# stemmed excerpt
(X_crypto_stem_excerpt_train, X_crypto_stem_excerpt_val,
 y_crypto_stem_excerpt_train, y_crypto_stem_excerpt_val) = train_test_split(
    X_crypto_stem_excerpt_train, y_crypto_stem_excerpt_train, test_size=0.25, random_state=123
)

### Reddit Data

In [19]:
# The *_train names are rebound to the smaller post-split training sets below.

# lemmatized reddit: a quarter of the training rows becomes validation
(X_reddit_lemmatize_train, X_reddit_lemmatize_val,
 y_reddit_lemmatize_train, y_reddit_lemmatize_val) = train_test_split(
    X_reddit_lemmatize_train, y_reddit_lemmatize_train, test_size=0.25, random_state=123
)

# stemmed reddit: same proportion and seed
(X_reddit_stem_train, X_reddit_stem_val,
 y_reddit_stem_train, y_reddit_stem_val) = train_test_split(
    X_reddit_stem_train, y_reddit_stem_train, test_size=0.25, random_state=123
)

### Twitter Data

In [20]:
# The *_train names are rebound to the smaller post-split training sets below.

# lemmatized twitter: a quarter of the training rows becomes validation
(X_twitter_lemmatize_train, X_twitter_lemmatize_val,
 y_twitter_lemmatize_train, y_twitter_lemmatize_val) = train_test_split(
    X_twitter_lemmatize_train, y_twitter_lemmatize_train, test_size=0.25, random_state=123
)

# stemmed twitter: same proportion and seed
(X_twitter_stem_train, X_twitter_stem_val,
 y_twitter_stem_train, y_twitter_stem_val) = train_test_split(
    X_twitter_stem_train, y_twitter_stem_train, test_size=0.25, random_state=123
)

### Combined Data

In [21]:
# social media (reddit + twitter)
# The *_train names are rebound to the smaller post-split training sets below.

# lemmatized: a quarter of the training rows becomes validation
(X_socialmedia_lemmatize_train, X_socialmedia_lemmatize_val,
 y_socialmedia_lemmatize_train, y_socialmedia_lemmatize_val) = train_test_split(
    X_socialmedia_lemmatize_train, y_socialmedia_lemmatize_train, test_size=0.25, random_state=123
)

# stemmed: same proportion and seed
(X_socialmedia_stem_train, X_socialmedia_stem_val,
 y_socialmedia_stem_train, y_socialmedia_stem_val) = train_test_split(
    X_socialmedia_stem_train, y_socialmedia_stem_train, test_size=0.25, random_state=123
)

In [22]:
# all sources combined
# The *_train names are rebound to the smaller post-split training sets below.

# lemmatized: a quarter of the training rows becomes validation
(X_all_lemmatize_train, X_all_lemmatize_val,
 y_all_lemmatize_train, y_all_lemmatize_val) = train_test_split(
    X_all_lemmatize_train, y_all_lemmatize_train, test_size=0.25, random_state=123
)

# stemmed: same proportion and seed
(X_all_stem_train, X_all_stem_val,
 y_all_stem_train, y_all_stem_val) = train_test_split(
    X_all_stem_train, y_all_stem_train, test_size=0.25, random_state=123
)

## Formatting and Saving Data (Part 2: Train and Validation Data)

In [23]:
# write the training portion (after the validation split) as .txt input for the fasttext model
# texts, one entry per name in `filename` (same order)
train_text_list = [
    X_crypto_lemmatize_title_train,
    X_crypto_lemmatize_excerpt_train,
    X_crypto_lemmatize_text_train,
    X_crypto_stem_title_train,
    X_crypto_stem_excerpt_train,
    X_crypto_stem_text_train,
    X_reddit_lemmatize_train,
    X_reddit_stem_train,
    X_twitter_lemmatize_train,
    X_twitter_stem_train,
    X_socialmedia_lemmatize_train,
    X_socialmedia_stem_train,
    X_all_lemmatize_train,
    X_all_stem_train,
]

# labels; title and text come from the same split, so they share a label series
train_label_list = [
    y_crypto_lemmatize_text_train,
    y_crypto_lemmatize_excerpt_train,
    y_crypto_lemmatize_text_train,
    y_crypto_stem_text_train,
    y_crypto_stem_excerpt_train,
    y_crypto_stem_text_train,
    y_reddit_lemmatize_train,
    y_reddit_stem_train,
    y_twitter_lemmatize_train,
    y_twitter_stem_train,
    y_socialmedia_lemmatize_train,
    y_socialmedia_stem_train,
    y_all_lemmatize_train,
    y_all_stem_train,
]

# output paths: prefix + base name + postfix
train_filename_prefix = "data/fasttext_unsorted/train/"
train_filename_postfix = ".txt"
train_filename_list = [train_filename_prefix + name + train_filename_postfix for name in filename]

# write the files
save_train_data(train_text_list, train_label_list, train_filename_list)

In [24]:
# save validation data as .csv
# set text and label data (one entry per name in `filename`, same order)
validation_text_list = [X_crypto_lemmatize_title_val, X_crypto_lemmatize_excerpt_val, X_crypto_lemmatize_text_val, X_crypto_stem_title_val, X_crypto_stem_excerpt_val, X_crypto_stem_text_val, X_reddit_lemmatize_val, X_reddit_stem_val, X_twitter_lemmatize_val, X_twitter_stem_val, X_socialmedia_lemmatize_val, X_socialmedia_stem_val, X_all_lemmatize_val, X_all_stem_val]

# every label series must be the *_val counterpart of the text at the same
# position; pairing a validation text with a test label series would join rows
# that do not belong together
validation_label_list = [y_crypto_lemmatize_text_val, y_crypto_lemmatize_excerpt_val, y_crypto_lemmatize_text_val, y_crypto_stem_text_val, y_crypto_stem_excerpt_val, y_crypto_stem_text_val, y_reddit_lemmatize_val, y_reddit_stem_val, y_twitter_lemmatize_val, y_twitter_stem_val, y_socialmedia_lemmatize_val, y_socialmedia_stem_val, y_all_lemmatize_val, y_all_stem_val]

# set filenames
validation_filename_prefix = "data/fasttext_unsorted/validation/"
validation_filename_postfix = ".csv"
validation_filename_list = [validation_filename_prefix + filename[i] + validation_filename_postfix for i in range(len(filename))]


save_test_data(text_list=validation_text_list, label_list=validation_label_list, filename_list=validation_filename_list)