# Fasttext Data Pre-Processing

In [1]:
# standard library
import csv  # writing the fasttext training files
import os  # creating output directories
import string
import unicodedata  # accent folding
from datetime import datetime

# third-party: data handling and text processing
import nltk
import pandas as pd
from nltk.corpus import stopwords # stopwords
from nltk.stem import WordNetLemmatizer # word lemmatizer
from nltk.stem.snowball import SnowballStemmer

## Read and Clean Text

In [2]:
def pre_processing(text, lemmatize=True, stem=False):
    '''
    Accepts a text and returns its cleaned, space-joined tokens.

    Steps: fold accents / drop remaining non-ASCII characters, lowercase, replace
    punctuation and line breaks with spaces, tokenize, optionally lemmatize (as verbs)
    and/or stem, then remove English stop words except the negations "no", "not", "nor".

    text: the raw text; None / NaN yields an empty string, other non-strings are converted with str()
    lemmatize: apply the WordNet lemmatizer with pos="v" to every token
    stem: apply the English Snowball stemmer to every token
    '''
    if not isinstance(text, str):
        # missing cells arrive as None / float NaN (NaN is the only value unequal to itself)
        if text is None or text != text:
            return ''
        text = str(text)

    # strip accents: decompose accented letters so the base letter survives the ASCII
    # encoding (a plain encode/ignore would delete the whole letter, e.g. "café" -> "caf")
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    # convert to lowercase
    text = text.lower()

    # replace every punctuation character with a space
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))

    # replace line breaks with a space so the words on either side are not glued together
    text = text.replace("\n", " ")

    # tokenize
    text_words = nltk.word_tokenize(text)

    # lemmatize
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        text_words = [wordnet_lemmatizer.lemmatize(x, pos="v") for x in text_words]

    # stem
    if stem:
        stemmer = SnowballStemmer("english")
        text_words = [stemmer.stem(x) for x in text_words]

    # remove stop words, but keep the negations
    keep_stopwords = {"no", "not", "nor"}
    stop = set(stopwords.words('english')) - keep_stopwords
    text_words = [x for x in text_words if x not in stop]

    return ' '.join(text_words)

### Conventional and Cryptonews Data

In [3]:
# load data
sample_crypto = pd.read_csv("data/sample_crypto.csv", header=0)

# combine title and excerpt (a missing part is treated as an empty string)
sample_crypto["text"] = sample_crypto["title"].fillna('') + " " + sample_crypto["excerpt"].fillna('')

# sample text
sample_crypto_text = sample_crypto[["date_time", "title", "text", "label"]]
sample_crypto_excerpt = sample_crypto[["date_time", "excerpt", "label"]]
# rows without an excerpt are dropped from the excerpt-only sample
sample_crypto_excerpt = sample_crypto_excerpt.dropna(subset=["excerpt"])

# sample with lemmatized words
sample_crypto_lemmatize_text = sample_crypto_text.copy()
# titles get the same fillna('') as in the combined text, so a missing title cannot crash pre_processing
sample_crypto_lemmatize_text["title"] = sample_crypto_lemmatize_text["title"].fillna('').apply(lambda x: pre_processing(x, lemmatize=True, stem=False))
sample_crypto_lemmatize_text["text"] = sample_crypto_lemmatize_text["text"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))
sample_crypto_lemmatize_excerpt = sample_crypto_excerpt.copy()
sample_crypto_lemmatize_excerpt["excerpt"] = sample_crypto_lemmatize_excerpt["excerpt"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))

# sample with stemmed words
sample_crypto_stem_text = sample_crypto_text.copy()
sample_crypto_stem_text["title"] = sample_crypto_stem_text["title"].fillna('').apply(lambda x: pre_processing(x, lemmatize=False, stem=True))
# the stemmed sample must be stemmed, not lemmatized, like its title and excerpt counterparts
sample_crypto_stem_text["text"] = sample_crypto_stem_text["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))
sample_crypto_stem_excerpt = sample_crypto_excerpt.copy()
sample_crypto_stem_excerpt["excerpt"] = sample_crypto_stem_excerpt["excerpt"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))

### Reddit Data

In [4]:
# Reddit posts: keep the timestamp, both text fields and the label
sample_reddit = pd.read_csv("data/sample_reddit.csv", header=0)[["date_time", "title", "excerpt", "label"]]

# one text field per post: title followed by the excerpt, a missing part becoming an empty string
sample_reddit["text"] = (
    sample_reddit["title"].fillna('')
    + " "
    + sample_reddit["excerpt"].fillna('')
)

# lemmatized variant of the combined text
sample_reddit_lemmatize = sample_reddit.copy()
sample_reddit_lemmatize["text"] = sample_reddit_lemmatize["text"].apply(
    pre_processing, lemmatize=True, stem=False
)

# stemmed variant of the combined text
sample_reddit_stem = sample_reddit.copy()
sample_reddit_stem["text"] = sample_reddit_stem["text"].apply(
    pre_processing, lemmatize=False, stem=True
)

### Twitter Data

In [5]:
sample_twitter = pd.read_csv("data/sample_twitter.csv", header=0)[["date_time", "text", "label"]]

# replace line breaks in the tweets with a space: deleting them outright would glue the
# last word of one line to the first word of the next; a missing text becomes an empty string
sample_twitter["text"] = sample_twitter["text"].fillna('').apply(lambda x: x.replace("\n", " "))

# sample with lemmatized words
sample_twitter_lemmatize = sample_twitter.copy()
sample_twitter_lemmatize["text"] = sample_twitter_lemmatize["text"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))

# sample with stemmed words
sample_twitter_stem = sample_twitter.copy()
sample_twitter_stem["text"] = sample_twitter_stem["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))

### Combined Data

In [6]:
# social media = Reddit + Twitter
# lemmatized variant (the Reddit frame is narrowed to the three columns the Twitter frame has)
sample_socialmedia_lemmatize = pd.concat(
    [
        sample_reddit_lemmatize.loc[:, ["date_time", "text", "label"]],
        sample_twitter_lemmatize,
    ]
)

# stemmed variant
sample_socialmedia_stem = pd.concat(
    [
        sample_reddit_stem.loc[:, ["date_time", "text", "label"]],
        sample_twitter_stem,
    ]
)

# all sources = news + Reddit + Twitter
# lemmatized variant
sample_all_lemmatize = pd.concat(
    [
        sample_crypto_lemmatize_text.loc[:, ["date_time", "text", "label"]],
        sample_reddit_lemmatize.loc[:, ["date_time", "text", "label"]],
        sample_twitter_lemmatize,
    ]
)

# stemmed variant
sample_all_stem = pd.concat(
    [
        sample_crypto_stem_text.loc[:, ["date_time", "text", "label"]],
        sample_reddit_stem.loc[:, ["date_time", "text", "label"]],
        sample_twitter_stem,
    ]
)

## Train-Validation-Test Split

In [7]:
def try_parsing_date(text):
    '''
    Parse a date string using the first matching known format.

    Returns the parsed datetime. A value that strptime rejects with a TypeError
    (i.e. not a string) is handed back unchanged. Raises ValueError when the
    string matches none of the formats.
    '''
    known_formats = ('%Y-%m-%d %H:%M:%S', '%d/%m/%y %H:%M', '%d/%m/%y')

    for pattern in known_formats:
        try:
            parsed = datetime.strptime(text, pattern)
        except TypeError:
            # not a string: return the value as it came in
            return text
        except ValueError:
            # wrong format: try the next one
            continue
        else:
            return parsed

    raise ValueError('no valid date format found')

def train_val_test_split(df):
    '''
    Split a dataframe chronologically into train, validation and test sets.

    Rows are sorted by ``date_time`` and cut after roughly 60% and 80% of the rows.
    Each cut is extended to include every row of the calendar day it falls on, so rows
    sharing a date are never spread over two sets and the proportions are approximate.

    Input: dataframe with a ``date_time`` column; values are parsed with try_parsing_date
           and rows with a missing ``date_time`` are dropped. The input is not modified.
    Output: train-set(~60%), validation-set(~20%), test-set(~20%), each carrying an
            additional ``date`` column
    '''
    # Work on a copy so the caller's dataframe keeps its original date_time values
    df = df.copy()

    #Convert all date_time to datetime format
    df['date_time'] = df['date_time'].apply(try_parsing_date)
    df = df[df['date_time'].notna()]

    #Sort df by date_time
    df = df.sort_values('date_time', ascending = True)
    df = df.reset_index(drop=True)

    #Retrieve just the date
    df['date'] =  df['date_time'].apply(lambda x: x.date())

    #Num of rows in the dataframe
    numrows = df.shape[0]
    if numrows == 0:
        # nothing to split: hand back three empty frames instead of failing on iloc
        return df.copy(), df.copy(), df.copy()

    #Get training dataset
    #Get last date for the train set and get the data rows that fall within the date range
    train_index = max(int(0.6*numrows) - 1, 0)
    last_date1 = df.iloc[train_index]['date']
    train = df[df['date'] <= last_date1]

    #Get validation dataset
    num_rows_required = int(0.2*numrows)
    val_index = train.shape[0]
    # train may have grown past 60% (whole days are kept together), so clamp the
    # end of the validation window to the last row of the frame
    val_end_index = min(val_index + num_rows_required - 1, numrows - 1)
    last_date2 = df.iloc[val_end_index]['date']
    mask = (df['date'] > last_date1) & (df['date'] <= last_date2)
    val = df.loc[mask]

    #Get testing dataset
    test = df[df['date'] > last_date2]

    return train, val, test

### Conventional and Cryptonews Data

In [8]:
# chronological train / validation / test split (lemmatize, title/text)
(
    sample_crypto_lemmatize_text_train,
    sample_crypto_lemmatize_text_val,
    sample_crypto_lemmatize_text_test,
) = train_val_test_split(sample_crypto_lemmatize_text)

# train and validation rows stacked into one frame
sample_crypto_lemmatize_text_train_all = pd.concat(
    [sample_crypto_lemmatize_text_train, sample_crypto_lemmatize_text_val],
    axis=0,
)

# title / text features and labels: train + validation
(
    X_crypto_lemmatize_title_train_all,
    X_crypto_lemmatize_text_train_all,
    y_crypto_lemmatize_text_train_all,
) = (sample_crypto_lemmatize_text_train_all[column] for column in ("title", "text", "label"))

# title / text features and labels: train
(
    X_crypto_lemmatize_title_train,
    X_crypto_lemmatize_text_train,
    y_crypto_lemmatize_text_train,
) = (sample_crypto_lemmatize_text_train[column] for column in ("title", "text", "label"))

# title / text features and labels: validation
(
    X_crypto_lemmatize_title_val,
    X_crypto_lemmatize_text_val,
    y_crypto_lemmatize_text_val,
) = (sample_crypto_lemmatize_text_val[column] for column in ("title", "text", "label"))

# title / text features and labels: test
(
    X_crypto_lemmatize_title_test,
    X_crypto_lemmatize_text_test,
    y_crypto_lemmatize_text_test,
) = (sample_crypto_lemmatize_text_test[column] for column in ("title", "text", "label"))

In [9]:
# chronological train / validation / test split (lemmatize, excerpt)
(
    sample_crypto_lemmatize_excerpt_train,
    sample_crypto_lemmatize_excerpt_val,
    sample_crypto_lemmatize_excerpt_test,
) = train_val_test_split(sample_crypto_lemmatize_excerpt)

# train and validation rows stacked into one frame
sample_crypto_lemmatize_excerpt_train_all = pd.concat(
    [sample_crypto_lemmatize_excerpt_train, sample_crypto_lemmatize_excerpt_val],
    axis=0,
)

# excerpt features and labels: train + validation
X_crypto_lemmatize_excerpt_train_all, y_crypto_lemmatize_excerpt_train_all = (
    sample_crypto_lemmatize_excerpt_train_all["excerpt"],
    sample_crypto_lemmatize_excerpt_train_all["label"],
)

# excerpt features and labels: train
X_crypto_lemmatize_excerpt_train, y_crypto_lemmatize_excerpt_train = (
    sample_crypto_lemmatize_excerpt_train["excerpt"],
    sample_crypto_lemmatize_excerpt_train["label"],
)

# excerpt features and labels: validation
X_crypto_lemmatize_excerpt_val, y_crypto_lemmatize_excerpt_val = (
    sample_crypto_lemmatize_excerpt_val["excerpt"],
    sample_crypto_lemmatize_excerpt_val["label"],
)

# excerpt features and labels: test
X_crypto_lemmatize_excerpt_test, y_crypto_lemmatize_excerpt_test = (
    sample_crypto_lemmatize_excerpt_test["excerpt"],
    sample_crypto_lemmatize_excerpt_test["label"],
)

In [10]:
# chronological train / validation / test split (stem, title/text)
(
    sample_crypto_stem_text_train,
    sample_crypto_stem_text_val,
    sample_crypto_stem_text_test,
) = train_val_test_split(sample_crypto_stem_text)

# train and validation rows stacked into one frame
sample_crypto_stem_text_train_all = pd.concat(
    [sample_crypto_stem_text_train, sample_crypto_stem_text_val],
    axis=0,
)

# title / text features and labels: train + validation
(
    X_crypto_stem_title_train_all,
    X_crypto_stem_text_train_all,
    y_crypto_stem_text_train_all,
) = (sample_crypto_stem_text_train_all[column] for column in ("title", "text", "label"))

# title / text features and labels: train
(
    X_crypto_stem_title_train,
    X_crypto_stem_text_train,
    y_crypto_stem_text_train,
) = (sample_crypto_stem_text_train[column] for column in ("title", "text", "label"))

# title / text features and labels: validation
(
    X_crypto_stem_title_val,
    X_crypto_stem_text_val,
    y_crypto_stem_text_val,
) = (sample_crypto_stem_text_val[column] for column in ("title", "text", "label"))

# title / text features and labels: test
(
    X_crypto_stem_title_test,
    X_crypto_stem_text_test,
    y_crypto_stem_text_test,
) = (sample_crypto_stem_text_test[column] for column in ("title", "text", "label"))

In [11]:
# chronological train / validation / test split (stem, excerpt)
(
    sample_crypto_stem_excerpt_train,
    sample_crypto_stem_excerpt_val,
    sample_crypto_stem_excerpt_test,
) = train_val_test_split(sample_crypto_stem_excerpt)

# train and validation rows stacked into one frame
sample_crypto_stem_excerpt_train_all = pd.concat(
    [sample_crypto_stem_excerpt_train, sample_crypto_stem_excerpt_val],
    axis=0,
)

# excerpt features and labels: train + validation
X_crypto_stem_excerpt_train_all, y_crypto_stem_excerpt_train_all = (
    sample_crypto_stem_excerpt_train_all["excerpt"],
    sample_crypto_stem_excerpt_train_all["label"],
)

# excerpt features and labels: train
X_crypto_stem_excerpt_train, y_crypto_stem_excerpt_train = (
    sample_crypto_stem_excerpt_train["excerpt"],
    sample_crypto_stem_excerpt_train["label"],
)

# excerpt features and labels: validation
X_crypto_stem_excerpt_val, y_crypto_stem_excerpt_val = (
    sample_crypto_stem_excerpt_val["excerpt"],
    sample_crypto_stem_excerpt_val["label"],
)

# excerpt features and labels: test
X_crypto_stem_excerpt_test, y_crypto_stem_excerpt_test = (
    sample_crypto_stem_excerpt_test["excerpt"],
    sample_crypto_stem_excerpt_test["label"],
)

### Reddit Data

In [12]:
# chronological train / validation / test split (Reddit, lemmatize)
(
    sample_reddit_lemmatize_train,
    sample_reddit_lemmatize_val,
    sample_reddit_lemmatize_test,
) = train_val_test_split(sample_reddit_lemmatize)

# train and validation rows stacked into one frame
sample_reddit_lemmatize_train_all = pd.concat(
    [sample_reddit_lemmatize_train, sample_reddit_lemmatize_val],
    axis=0,
)

# text features and labels: train + validation
X_reddit_lemmatize_train_all, y_reddit_lemmatize_train_all = (
    sample_reddit_lemmatize_train_all["text"],
    sample_reddit_lemmatize_train_all["label"],
)

# text features and labels: train
X_reddit_lemmatize_train, y_reddit_lemmatize_train = (
    sample_reddit_lemmatize_train["text"],
    sample_reddit_lemmatize_train["label"],
)

# text features and labels: validation
X_reddit_lemmatize_val, y_reddit_lemmatize_val = (
    sample_reddit_lemmatize_val["text"],
    sample_reddit_lemmatize_val["label"],
)

# text features and labels: test
X_reddit_lemmatize_test, y_reddit_lemmatize_test = (
    sample_reddit_lemmatize_test["text"],
    sample_reddit_lemmatize_test["label"],
)

In [13]:
# chronological train / validation / test split (Reddit, stem)
(
    sample_reddit_stem_train,
    sample_reddit_stem_val,
    sample_reddit_stem_test,
) = train_val_test_split(sample_reddit_stem)

# train and validation rows stacked into one frame
sample_reddit_stem_train_all = pd.concat(
    [sample_reddit_stem_train, sample_reddit_stem_val],
    axis=0,
)

# text features and labels: train + validation
X_reddit_stem_train_all, y_reddit_stem_train_all = (
    sample_reddit_stem_train_all["text"],
    sample_reddit_stem_train_all["label"],
)

# text features and labels: train
X_reddit_stem_train, y_reddit_stem_train = (
    sample_reddit_stem_train["text"],
    sample_reddit_stem_train["label"],
)

# text features and labels: validation
X_reddit_stem_val, y_reddit_stem_val = (
    sample_reddit_stem_val["text"],
    sample_reddit_stem_val["label"],
)

# text features and labels: test
X_reddit_stem_test, y_reddit_stem_test = (
    sample_reddit_stem_test["text"],
    sample_reddit_stem_test["label"],
)

### Twitter Data

In [14]:
# chronological train / validation / test split (Twitter, lemmatize)
(
    sample_twitter_lemmatize_train,
    sample_twitter_lemmatize_val,
    sample_twitter_lemmatize_test,
) = train_val_test_split(sample_twitter_lemmatize)

# train and validation rows stacked into one frame
sample_twitter_lemmatize_train_all = pd.concat(
    [sample_twitter_lemmatize_train, sample_twitter_lemmatize_val],
    axis=0,
)

# text features and labels: train + validation
X_twitter_lemmatize_train_all, y_twitter_lemmatize_train_all = (
    sample_twitter_lemmatize_train_all["text"],
    sample_twitter_lemmatize_train_all["label"],
)

# text features and labels: train
X_twitter_lemmatize_train, y_twitter_lemmatize_train = (
    sample_twitter_lemmatize_train["text"],
    sample_twitter_lemmatize_train["label"],
)

# text features and labels: validation
X_twitter_lemmatize_val, y_twitter_lemmatize_val = (
    sample_twitter_lemmatize_val["text"],
    sample_twitter_lemmatize_val["label"],
)

# text features and labels: test
X_twitter_lemmatize_test, y_twitter_lemmatize_test = (
    sample_twitter_lemmatize_test["text"],
    sample_twitter_lemmatize_test["label"],
)

In [15]:
# chronological train / validation / test split (Twitter, stem)
(
    sample_twitter_stem_train,
    sample_twitter_stem_val,
    sample_twitter_stem_test,
) = train_val_test_split(sample_twitter_stem)

# train and validation rows stacked into one frame
sample_twitter_stem_train_all = pd.concat(
    [sample_twitter_stem_train, sample_twitter_stem_val],
    axis=0,
)

# text features and labels: train + validation
X_twitter_stem_train_all, y_twitter_stem_train_all = (
    sample_twitter_stem_train_all["text"],
    sample_twitter_stem_train_all["label"],
)

# text features and labels: train
X_twitter_stem_train, y_twitter_stem_train = (
    sample_twitter_stem_train["text"],
    sample_twitter_stem_train["label"],
)

# text features and labels: validation
X_twitter_stem_val, y_twitter_stem_val = (
    sample_twitter_stem_val["text"],
    sample_twitter_stem_val["label"],
)

# text features and labels: test
X_twitter_stem_test, y_twitter_stem_test = (
    sample_twitter_stem_test["text"],
    sample_twitter_stem_test["label"],
)

### Combined Data

In [16]:
# chronological train / validation / test split (social media, lemmatize)
(
    sample_socialmedia_lemmatize_train,
    sample_socialmedia_lemmatize_val,
    sample_socialmedia_lemmatize_test,
) = train_val_test_split(sample_socialmedia_lemmatize)

# train and validation rows stacked into one frame
sample_socialmedia_lemmatize_train_all = pd.concat(
    [sample_socialmedia_lemmatize_train, sample_socialmedia_lemmatize_val],
    axis=0,
)

# text features and labels: train + validation
X_socialmedia_lemmatize_train_all, y_socialmedia_lemmatize_train_all = (
    sample_socialmedia_lemmatize_train_all["text"],
    sample_socialmedia_lemmatize_train_all["label"],
)

# text features and labels: train
X_socialmedia_lemmatize_train, y_socialmedia_lemmatize_train = (
    sample_socialmedia_lemmatize_train["text"],
    sample_socialmedia_lemmatize_train["label"],
)

# text features and labels: validation
X_socialmedia_lemmatize_val, y_socialmedia_lemmatize_val = (
    sample_socialmedia_lemmatize_val["text"],
    sample_socialmedia_lemmatize_val["label"],
)

# text features and labels: test
X_socialmedia_lemmatize_test, y_socialmedia_lemmatize_test = (
    sample_socialmedia_lemmatize_test["text"],
    sample_socialmedia_lemmatize_test["label"],
)

In [17]:
# chronological train / validation / test split (social media, stem)
(
    sample_socialmedia_stem_train,
    sample_socialmedia_stem_val,
    sample_socialmedia_stem_test,
) = train_val_test_split(sample_socialmedia_stem)

# train and validation rows stacked into one frame
sample_socialmedia_stem_train_all = pd.concat(
    [sample_socialmedia_stem_train, sample_socialmedia_stem_val],
    axis=0,
)

# text features and labels: train + validation
X_socialmedia_stem_train_all, y_socialmedia_stem_train_all = (
    sample_socialmedia_stem_train_all["text"],
    sample_socialmedia_stem_train_all["label"],
)

# text features and labels: train
X_socialmedia_stem_train, y_socialmedia_stem_train = (
    sample_socialmedia_stem_train["text"],
    sample_socialmedia_stem_train["label"],
)

# text features and labels: validation
X_socialmedia_stem_val, y_socialmedia_stem_val = (
    sample_socialmedia_stem_val["text"],
    sample_socialmedia_stem_val["label"],
)

# text features and labels: test
X_socialmedia_stem_test, y_socialmedia_stem_test = (
    sample_socialmedia_stem_test["text"],
    sample_socialmedia_stem_test["label"],
)

In [18]:
# chronological train / validation / test split (all sources, lemmatize)
(
    sample_all_lemmatize_train,
    sample_all_lemmatize_val,
    sample_all_lemmatize_test,
) = train_val_test_split(sample_all_lemmatize)

# train and validation rows stacked into one frame
sample_all_lemmatize_train_all = pd.concat(
    [sample_all_lemmatize_train, sample_all_lemmatize_val],
    axis=0,
)

# text features and labels: train + validation
X_all_lemmatize_train_all, y_all_lemmatize_train_all = (
    sample_all_lemmatize_train_all["text"],
    sample_all_lemmatize_train_all["label"],
)

# text features and labels: train
X_all_lemmatize_train, y_all_lemmatize_train = (
    sample_all_lemmatize_train["text"],
    sample_all_lemmatize_train["label"],
)

# text features and labels: validation
X_all_lemmatize_val, y_all_lemmatize_val = (
    sample_all_lemmatize_val["text"],
    sample_all_lemmatize_val["label"],
)

# text features and labels: test
X_all_lemmatize_test, y_all_lemmatize_test = (
    sample_all_lemmatize_test["text"],
    sample_all_lemmatize_test["label"],
)

In [19]:
# chronological train / validation / test split (all sources, stem)
(
    sample_all_stem_train,
    sample_all_stem_val,
    sample_all_stem_test,
) = train_val_test_split(sample_all_stem)

# train and validation rows stacked into one frame
sample_all_stem_train_all = pd.concat(
    [sample_all_stem_train, sample_all_stem_val],
    axis=0,
)

# text features and labels: train + validation
X_all_stem_train_all, y_all_stem_train_all = (
    sample_all_stem_train_all["text"],
    sample_all_stem_train_all["label"],
)

# text features and labels: train
X_all_stem_train, y_all_stem_train = (
    sample_all_stem_train["text"],
    sample_all_stem_train["label"],
)

# text features and labels: validation
X_all_stem_val, y_all_stem_val = (
    sample_all_stem_val["text"],
    sample_all_stem_val["label"],
)

# text features and labels: test
X_all_stem_test, y_all_stem_test = (
    sample_all_stem_test["text"],
    sample_all_stem_test["label"],
)

## Formatting and Saving Data

In [20]:
def format_data(texts, labels):
    '''
    Accepts a series of texts and labels and outputs the formatted data for fasttext model

    Each output row is a list whose first item is "__label__<label>" and whose remaining
    items are the tokens of the corresponding text. Texts and labels are paired by
    position, not by index.

    Returns a pandas Series holding one such list per text.
    '''
    # convert the labels once up front; converting inside the loop costs a full pass per row
    label_values = list(labels)

    formatted_data = []
    for position, text in enumerate(texts):
        # label first, then the tokens of the text
        current_row = ["__label__" + str(label_values[position])]
        current_row.extend(nltk.word_tokenize(text))

        formatted_data.append(current_row)

    # explicit dtype keeps an empty result an object Series as well
    return pd.Series(formatted_data, dtype=object)

In [21]:
def save_train_data(text_list, label_list, filename_list):
    '''
    Accepts a list of texts, labels and filenames and saves the data into .txt file for each corresponding text, label and filename

    The three lists are matched by position. Every row produced by format_data is written
    as one line with its items separated by single spaces. Missing parent directories
    of a filename are created.
    '''
    for i, filename in enumerate(filename_list):
        # format data
        formatted_data = format_data(text_list[i], label_list[i])

        # make sure the target directory exists before opening the file
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # save data; newline='' lets the csv writer control line endings itself,
        # so the '\n' terminator is not translated by the platform
        with open(filename, "w", newline='', encoding="utf-8") as csvoutfile:
            csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
            csv_writer.writerows(formatted_data)

def save_test_data(text_list, label_list, filename_list):
    '''
    Accepts a list of texts, labels and filenames and saves the data into a .csv file for reading later

    The three lists are matched by position. Each text/label pair is joined side by side
    (aligned on their index), written with the columns "text" and "label" and without
    the index. Missing parent directories of a filename are created.
    '''
    for i, filename in enumerate(filename_list):
        save_df = pd.concat([text_list[i], label_list[i]], axis=1)
        save_df = save_df.reset_index(drop=True) # reset index
        save_df.columns = ["text", "label"]

        # make sure the target directory exists before writing
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

        save_df.to_csv(filename, index=False) # save to csv

In [22]:
# base names shared by the train, validation and test files, in the order the
# text / label lists below are written: the six crypto variants first, then one
# lemmatized and one stemmed entry per remaining source
filename = [
    "sample_crypto_" + variant + "_" + field
    for variant in ("lemmatize", "stem")
    for field in ("title", "excerpt", "text")
] + [
    "sample_" + source + "_" + variant
    for source in ("reddit", "twitter", "socialmedia", "all")
    for variant in ("lemmatize", "stem")
]

In [23]:
# write the combined train + validation sets as .txt input for the fasttext model
# texts, listed in the same order as `filename`
train_text_list_all = [
    X_crypto_lemmatize_title_train_all,
    X_crypto_lemmatize_excerpt_train_all,
    X_crypto_lemmatize_text_train_all,
    X_crypto_stem_title_train_all,
    X_crypto_stem_excerpt_train_all,
    X_crypto_stem_text_train_all,
    X_reddit_lemmatize_train_all,
    X_reddit_stem_train_all,
    X_twitter_lemmatize_train_all,
    X_twitter_stem_train_all,
    X_socialmedia_lemmatize_train_all,
    X_socialmedia_stem_train_all,
    X_all_lemmatize_train_all,
    X_all_stem_train_all,
]

# matching labels (the title entries reuse the labels of the title/text frame)
train_label_list_all = [
    y_crypto_lemmatize_text_train_all,
    y_crypto_lemmatize_excerpt_train_all,
    y_crypto_lemmatize_text_train_all,
    y_crypto_stem_text_train_all,
    y_crypto_stem_excerpt_train_all,
    y_crypto_stem_text_train_all,
    y_reddit_lemmatize_train_all,
    y_reddit_stem_train_all,
    y_twitter_lemmatize_train_all,
    y_twitter_stem_train_all,
    y_socialmedia_lemmatize_train_all,
    y_socialmedia_stem_train_all,
    y_all_lemmatize_train_all,
    y_all_stem_train_all,
]

# output paths: one per base name
train_all_filename_prefix = "data/fasttext_date/train_all/"
train_all_filename_postfix = ".txt"
train_all_filename_list = [
    train_all_filename_prefix + base_name + train_all_filename_postfix
    for base_name in filename
]

# write the files
save_train_data(
    text_list=train_text_list_all,
    label_list=train_label_list_all,
    filename_list=train_all_filename_list,
)

In [24]:
# write the training sets as .txt input for the fasttext model
# texts, listed in the same order as `filename`
train_text_list = [
    X_crypto_lemmatize_title_train,
    X_crypto_lemmatize_excerpt_train,
    X_crypto_lemmatize_text_train,
    X_crypto_stem_title_train,
    X_crypto_stem_excerpt_train,
    X_crypto_stem_text_train,
    X_reddit_lemmatize_train,
    X_reddit_stem_train,
    X_twitter_lemmatize_train,
    X_twitter_stem_train,
    X_socialmedia_lemmatize_train,
    X_socialmedia_stem_train,
    X_all_lemmatize_train,
    X_all_stem_train,
]

# matching labels (the title entries reuse the labels of the title/text frame)
train_label_list = [
    y_crypto_lemmatize_text_train,
    y_crypto_lemmatize_excerpt_train,
    y_crypto_lemmatize_text_train,
    y_crypto_stem_text_train,
    y_crypto_stem_excerpt_train,
    y_crypto_stem_text_train,
    y_reddit_lemmatize_train,
    y_reddit_stem_train,
    y_twitter_lemmatize_train,
    y_twitter_stem_train,
    y_socialmedia_lemmatize_train,
    y_socialmedia_stem_train,
    y_all_lemmatize_train,
    y_all_stem_train,
]

# output paths: one per base name
train_filename_prefix = "data/fasttext_date/train/"
train_filename_postfix = ".txt"
train_filename_list = [
    train_filename_prefix + base_name + train_filename_postfix
    for base_name in filename
]

# write the files
save_train_data(
    text_list=train_text_list,
    label_list=train_label_list,
    filename_list=train_filename_list,
)

In [25]:
# save validation data as .csv
# set text and label data (both lists follow the order of `filename`)
validation_text_list = [X_crypto_lemmatize_title_val, X_crypto_lemmatize_excerpt_val, X_crypto_lemmatize_text_val, X_crypto_stem_title_val, X_crypto_stem_excerpt_val, X_crypto_stem_text_val, X_reddit_lemmatize_val, X_reddit_stem_val, X_twitter_lemmatize_val, X_twitter_stem_val, X_socialmedia_lemmatize_val, X_socialmedia_stem_val, X_all_lemmatize_val, X_all_stem_val]

# every entry must be the *_val labels of the matching text entry; the lemmatized Reddit
# texts are paired with y_reddit_lemmatize_val (not the test-set labels)
validation_label_list = [y_crypto_lemmatize_text_val, y_crypto_lemmatize_excerpt_val, y_crypto_lemmatize_text_val, y_crypto_stem_text_val, y_crypto_stem_excerpt_val, y_crypto_stem_text_val, y_reddit_lemmatize_val, y_reddit_stem_val, y_twitter_lemmatize_val, y_twitter_stem_val, y_socialmedia_lemmatize_val, y_socialmedia_stem_val, y_all_lemmatize_val, y_all_stem_val]

# set filenames
validation_filename_prefix = "data/fasttext_date/validation/"
validation_filename_postfix = ".csv"
validation_filename_list = [validation_filename_prefix + filename[i] + validation_filename_postfix for i in range(len(filename))]

# save data
save_test_data(text_list=validation_text_list, label_list=validation_label_list, filename_list=validation_filename_list)

In [26]:
# write the test sets as .csv files
# texts, listed in the same order as `filename`
test_text_list = [
    X_crypto_lemmatize_title_test,
    X_crypto_lemmatize_excerpt_test,
    X_crypto_lemmatize_text_test,
    X_crypto_stem_title_test,
    X_crypto_stem_excerpt_test,
    X_crypto_stem_text_test,
    X_reddit_lemmatize_test,
    X_reddit_stem_test,
    X_twitter_lemmatize_test,
    X_twitter_stem_test,
    X_socialmedia_lemmatize_test,
    X_socialmedia_stem_test,
    X_all_lemmatize_test,
    X_all_stem_test,
]

# matching labels (the title entries reuse the labels of the title/text frame)
test_label_list = [
    y_crypto_lemmatize_text_test,
    y_crypto_lemmatize_excerpt_test,
    y_crypto_lemmatize_text_test,
    y_crypto_stem_text_test,
    y_crypto_stem_excerpt_test,
    y_crypto_stem_text_test,
    y_reddit_lemmatize_test,
    y_reddit_stem_test,
    y_twitter_lemmatize_test,
    y_twitter_stem_test,
    y_socialmedia_lemmatize_test,
    y_socialmedia_stem_test,
    y_all_lemmatize_test,
    y_all_stem_test,
]

# output paths: one per base name
test_filename_prefix = "data/fasttext_date/test/"
test_filename_postfix = ".csv"
test_filename_list = [
    test_filename_prefix + base_name + test_filename_postfix
    for base_name in filename
]

# write the files
save_test_data(
    text_list=test_text_list,
    label_list=test_label_list,
    filename_list=test_filename_list,
)