# FastText Sentiment Analysis Model

In [1]:
# standard library
import io  # to generate pre-trained vector
import string
import unicodedata
from functools import lru_cache
# import csv

# read and pre-process data
import nltk
import pandas as pd
from nltk.corpus import stopwords  # stopwords
from nltk.stem import WordNetLemmatizer  # word lemmatizer
from nltk.stem.snowball import SnowballStemmer

# format data
from sklearn.model_selection import train_test_split  # train test split

# modelling
import fasttext

# evaluation metrics
from sklearn.metrics import f1_score, precision_score, recall_score

## Data Pre-Processing

In [2]:
@lru_cache(maxsize=None)
def _sentiment_stopwords():
    """Return NLTK's English stop words without "no", "not" and "nor".

    Those three words are taken out of the stop list so that they survive the
    stop-word filter in `pre_processing`. The result is cached so the list is
    built once instead of on every call.
    """
    return frozenset(stopwords.words('english')) - {"no", "not", "nor"}


def pre_processing(text, lemmatize=True, stem=False):
    """Normalise a raw document into a single space-separated string of tokens.

    Steps, in order: strip accents, lowercase, replace punctuation with
    spaces, collapse whitespace, tokenize, optionally lemmatize (as verbs),
    optionally stem, and drop English stop words except "no", "not", "nor".

    Parameters
    ----------
    text : str
        Raw text. `None` and float NaN (missing values) are accepted and
        produce an empty string.
    lemmatize : bool, default True
        Lemmatize every token with WordNet using the verb part of speech.
    stem : bool, default False
        Stem every token with the English Snowball stemmer. Applied after
        lemmatization when both flags are set.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; '' when nothing is left.
    """
    # missing values (None / NaN) contain nothing to tokenize
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # strip accents: decompose accented characters first so the ASCII encode
    # drops only the combining marks and keeps the base letter
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    # convert to lowercase
    text = text.lower()

    # replace punctuation with spaces
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # collapse whitespace (line breaks included) to single spaces, so words on
    # either side of a line break stay separate tokens
    text = ' '.join(text.split())
    if not text:
        return ''

    # tokenize
    text_words = nltk.word_tokenize(text)

    # lemmatize
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        text_words = [wordnet_lemmatizer.lemmatize(x, pos="v") for x in text_words]

    # stem
    if stem:
        stemmer = SnowballStemmer("english")
        text_words = [stemmer.stem(x) for x in text_words]

    # remove stop words
    stop = _sentiment_stopwords()
    text_words = [x for x in text_words if x not in stop]

    return ' '.join(text_words)

### Conventional and Cryptonews Data

In [3]:
# load data
sample_crypto = pd.read_csv("data/sample_crypto.csv", header=0)

# combine title and excerpt (a missing part contributes an empty string)
sample_crypto["text"] = sample_crypto["title"].fillna('') + " " + sample_crypto["excerpt"].fillna('')

# sample text; copied so the title fix below does not write into sample_crypto
sample_crypto_text = sample_crypto[["title", "text", "label"]].copy()
# the fillna above suggests titles may be missing; pre_processing is applied
# to each title below, so give it an empty string rather than NaN
sample_crypto_text["title"] = sample_crypto_text["title"].fillna('')
sample_crypto_excerpt = sample_crypto[["excerpt", "label"]]
sample_crypto_excerpt = sample_crypto_excerpt.dropna(subset=["excerpt"])

# sample with lemmatized words
sample_crypto_lemmatize_text = sample_crypto_text.copy()
sample_crypto_lemmatize_text["title"] = sample_crypto_lemmatize_text["title"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))
sample_crypto_lemmatize_text["text"] = sample_crypto_lemmatize_text["text"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))
sample_crypto_lemmatize_excerpt = sample_crypto_excerpt.copy()
sample_crypto_lemmatize_excerpt["excerpt"] = sample_crypto_lemmatize_excerpt["excerpt"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))

# sample with stemmed words
sample_crypto_stem_text = sample_crypto_text.copy()
sample_crypto_stem_text["title"] = sample_crypto_stem_text["title"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))
# stemmed variant: stem (not lemmatize) the combined text, like title and excerpt
sample_crypto_stem_text["text"] = sample_crypto_stem_text["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))
sample_crypto_stem_excerpt = sample_crypto_excerpt.copy()
sample_crypto_stem_excerpt["excerpt"] = sample_crypto_stem_excerpt["excerpt"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))

# separate lemmatized X and y
X_crypto_lemmatize_text = sample_crypto_lemmatize_text[["title", "text"]]
y_crypto_lemmatize_text = sample_crypto_lemmatize_text["label"]
X_crypto_lemmatize_excerpt = sample_crypto_lemmatize_excerpt["excerpt"]
y_crypto_lemmatize_excerpt = sample_crypto_lemmatize_excerpt["label"]

# separate stemmed X and y
X_crypto_stem_text = sample_crypto_stem_text[["title", "text"]]
y_crypto_stem_text = sample_crypto_stem_text["label"]
X_crypto_stem_excerpt = sample_crypto_stem_excerpt["excerpt"]
y_crypto_stem_excerpt = sample_crypto_stem_excerpt["label"]

### Reddit Data

In [4]:
# load the reddit sample, keeping only the columns used below
sample_reddit = pd.read_csv("data/sample_reddit.csv", header=0)
sample_reddit = sample_reddit[["title", "excerpt", "label"]]

# text = title + " " + excerpt, with a missing part treated as an empty string
sample_reddit = sample_reddit.assign(
    text=sample_reddit["title"].fillna('') + " " + sample_reddit["excerpt"].fillna('')
)

# lemmatized variant of the sample (only the combined text is processed)
sample_reddit_lemmatize = sample_reddit.assign(
    text=sample_reddit["text"].apply(pre_processing, lemmatize=True, stem=False)
)

# stemmed variant of the sample
sample_reddit_stem = sample_reddit.assign(
    text=sample_reddit["text"].apply(pre_processing, lemmatize=False, stem=True)
)

# features (X) and labels (y), lemmatized
X_reddit_lemmatize, y_reddit_lemmatize = (
    sample_reddit_lemmatize["text"],
    sample_reddit_lemmatize["label"],
)

# features (X) and labels (y), stemmed
X_reddit_stem, y_reddit_stem = (
    sample_reddit_stem["text"],
    sample_reddit_stem["label"],
)

### Twitter Data

In [5]:
sample_twitter = pd.read_csv("data/sample_twitter.csv", header=0)[["text", "label"]]

# turn line breaks into spaces; deleting them outright would glue the last
# word of one line to the first word of the next before tokenization
sample_twitter["text"] = sample_twitter["text"].apply(lambda x: x.replace("\n", " "))

# sample with lemmatized words
sample_twitter_lemmatize = sample_twitter.copy()
sample_twitter_lemmatize["text"] = sample_twitter_lemmatize["text"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))

# sample with stemmed words
sample_twitter_stem = sample_twitter.copy()
sample_twitter_stem["text"] = sample_twitter_stem["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))

# separate lemmatized X and y
X_twitter_lemmatize = sample_twitter_lemmatize["text"]
y_twitter_lemmatize = sample_twitter_lemmatize["label"]

# separate stemmed X and y
X_twitter_stem = sample_twitter_stem["text"]
y_twitter_stem = sample_twitter_stem["label"]

### Combined Data

In [6]:
# social media = reddit (text/label columns only) stacked on top of twitter
# lemmatized variant
sample_socialmedia_lemmatize = pd.concat(
    [
        sample_reddit_lemmatize[["text", "label"]],
        sample_twitter_lemmatize,
    ]
)

# stemmed variant
sample_socialmedia_stem = pd.concat(
    [
        sample_reddit_stem[["text", "label"]],
        sample_twitter_stem,
    ]
)

# features (X) and labels (y), lemmatized
X_socialmedia_lemmatize, y_socialmedia_lemmatize = (
    sample_socialmedia_lemmatize["text"],
    sample_socialmedia_lemmatize["label"],
)

# features (X) and labels (y), stemmed
X_socialmedia_stem, y_socialmedia_stem = (
    sample_socialmedia_stem["text"],
    sample_socialmedia_stem["label"],
)

In [7]:
# all sources = crypto news, reddit and twitter, restricted to text/label
# lemmatized variant
sample_all_lemmatize = pd.concat(
    [
        sample_crypto_lemmatize_text[["text", "label"]],
        sample_reddit_lemmatize[["text", "label"]],
        sample_twitter_lemmatize,
    ]
)

# stemmed variant
sample_all_stem = pd.concat(
    [
        sample_crypto_stem_text[["text", "label"]],
        sample_reddit_stem[["text", "label"]],
        sample_twitter_stem,
    ]
)

# features (X) and labels (y), lemmatized
X_all_lemmatize, y_all_lemmatize = (
    sample_all_lemmatize["text"],
    sample_all_lemmatize["label"],
)

# features (X) and labels (y), stemmed
X_all_stem, y_all_stem = (
    sample_all_stem["text"],
    sample_all_stem["label"],
)

## Train-Test Split
### Conventional and Cryptonews Data

In [8]:
# 80/20 train/test split (lemmatized, title + text columns together)
(
    X_crypto_lemmatize_text_train_init,
    X_crypto_lemmatize_text_test_init,
    y_crypto_lemmatize_text_train,
    y_crypto_lemmatize_text_test,
) = train_test_split(
    X_crypto_lemmatize_text,
    y_crypto_lemmatize_text,
    test_size=0.2,
    random_state=123,
)
# pull the title and text columns out of the split frames
X_crypto_lemmatize_title_train, X_crypto_lemmatize_text_train = (
    X_crypto_lemmatize_text_train_init["title"],
    X_crypto_lemmatize_text_train_init["text"],
)
X_crypto_lemmatize_title_test, X_crypto_lemmatize_text_test = (
    X_crypto_lemmatize_text_test_init["title"],
    X_crypto_lemmatize_text_test_init["text"],
)

# 80/20 train/test split (lemmatized, excerpt)
(
    X_crypto_lemmatize_excerpt_train,
    X_crypto_lemmatize_excerpt_test,
    y_crypto_lemmatize_excerpt_train,
    y_crypto_lemmatize_excerpt_test,
) = train_test_split(
    X_crypto_lemmatize_excerpt,
    y_crypto_lemmatize_excerpt,
    test_size=0.2,
    random_state=123,
)

# 80/20 train/test split (stemmed, title + text columns together)
(
    X_crypto_stem_text_train_init,
    X_crypto_stem_text_test_init,
    y_crypto_stem_text_train,
    y_crypto_stem_text_test,
) = train_test_split(
    X_crypto_stem_text,
    y_crypto_stem_text,
    test_size=0.2,
    random_state=123,
)
# pull the title and text columns out of the split frames
X_crypto_stem_title_train, X_crypto_stem_text_train = (
    X_crypto_stem_text_train_init["title"],
    X_crypto_stem_text_train_init["text"],
)
X_crypto_stem_title_test, X_crypto_stem_text_test = (
    X_crypto_stem_text_test_init["title"],
    X_crypto_stem_text_test_init["text"],
)

# 80/20 train/test split (stemmed, excerpt)
(
    X_crypto_stem_excerpt_train,
    X_crypto_stem_excerpt_test,
    y_crypto_stem_excerpt_train,
    y_crypto_stem_excerpt_test,
) = train_test_split(
    X_crypto_stem_excerpt,
    y_crypto_stem_excerpt,
    test_size=0.2,
    random_state=123,
)

### Reddit Data

In [9]:
# 80/20 train/test split (lemmatized)
(
    X_reddit_lemmatize_train,
    X_reddit_lemmatize_test,
    y_reddit_lemmatize_train,
    y_reddit_lemmatize_test,
) = train_test_split(
    X_reddit_lemmatize,
    y_reddit_lemmatize,
    test_size=0.2,
    random_state=123,
)

# 80/20 train/test split (stemmed)
(
    X_reddit_stem_train,
    X_reddit_stem_test,
    y_reddit_stem_train,
    y_reddit_stem_test,
) = train_test_split(
    X_reddit_stem,
    y_reddit_stem,
    test_size=0.2,
    random_state=123,
)

### Twitter Data

In [10]:
# 80/20 train/test split (lemmatized)
(
    X_twitter_lemmatize_train,
    X_twitter_lemmatize_test,
    y_twitter_lemmatize_train,
    y_twitter_lemmatize_test,
) = train_test_split(
    X_twitter_lemmatize,
    y_twitter_lemmatize,
    test_size=0.2,
    random_state=123,
)

# 80/20 train/test split (stemmed)
(
    X_twitter_stem_train,
    X_twitter_stem_test,
    y_twitter_stem_train,
    y_twitter_stem_test,
) = train_test_split(
    X_twitter_stem,
    y_twitter_stem,
    test_size=0.2,
    random_state=123,
)

### Combined Data

In [11]:
# 80/20 train/test split (lemmatized)
(
    X_socialmedia_lemmatize_train,
    X_socialmedia_lemmatize_test,
    y_socialmedia_lemmatize_train,
    y_socialmedia_lemmatize_test,
) = train_test_split(
    X_socialmedia_lemmatize,
    y_socialmedia_lemmatize,
    test_size=0.2,
    random_state=123,
)

# 80/20 train/test split (stemmed)
(
    X_socialmedia_stem_train,
    X_socialmedia_stem_test,
    y_socialmedia_stem_train,
    y_socialmedia_stem_test,
) = train_test_split(
    X_socialmedia_stem,
    y_socialmedia_stem,
    test_size=0.2,
    random_state=123,
)

## Formatting Data
Prepare the data in the format required by the fastText model (only relevant to the training data).

## Train and Test Model (Default Parameters)
Perform initial testing to see which model performs best (before hyperparameter tuning)

## Hyperparameter Tuning on Best Model
### Split Train Set into Train and Validation