# FastText Sentiment Analysis Model

In [21]:
# read and pre-process data
import pandas as pd
import string
import nltk
from nltk.stem import WordNetLemmatizer # word lemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords # stopwords

# format data
from sklearn.model_selection import train_test_split # train test split
import csv

# modelling
import io # to generate pre-trained vector
import fasttext

## Data Pre-Processing

In [2]:
def pre_processing(text, lemmatize=True, stem=False):
    """Normalise a raw text string into a space-joined sequence of cleaned tokens.

    Steps, in order: fold accented characters to ASCII, lowercase, replace
    punctuation and line breaks with spaces, tokenize with NLTK, optionally
    lemmatize (as verbs) and/or stem, then drop English stop words except the
    negations "no", "not" and "nor".

    Args:
        text: Raw text. ``None`` and float NaN (what pandas yields for an empty
            CSV cell) are treated as empty text; any other non-string value is
            converted with ``str``.
        lemmatize: If True, lemmatize every token with WordNet using the verb POS.
        stem: If True, stem every token with the English Snowball stemmer.

    Returns:
        The surviving tokens joined by single spaces ("" when nothing is left).
    """
    # function-scope import: keeps this cell runnable regardless of cell order
    import unicodedata

    # missing values carry no text; bail out before any string method is called
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # strip accents: decompose first so that "é" becomes "e" + combining mark,
    # then drop whatever is still non-ASCII (the mark itself, emoji, ...)
    text = unicodedata.normalize("NFKD", text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    # convert to lowercase
    text = text.lower()

    # replace punctuation with spaces
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))

    # turn line breaks into spaces (deleting them would glue neighbouring words together)
    text = text.replace("\n", " ")

    # tokenize
    text_words = nltk.word_tokenize(text)

    # lemmatize
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        text_words = [wordnet_lemmatizer.lemmatize(x, pos="v") for x in text_words]

    # stem
    if stem:
        stemmer = SnowballStemmer("english")
        text_words = [stemmer.stem(x) for x in text_words]

    # remove stop words, but keep negations since they flip sentiment
    keep_stopwords = {"no", "not", "nor"}
    stop = set(stopwords.words('english')) - keep_stopwords
    text_words = [x for x in text_words if x not in stop]

    return ' '.join(text_words)

### Conventional and Cryptonews Data
#### Load Data

In [8]:
# load data
sample_crypto = pd.read_csv("data/sample_crypto.csv", header=0)

# combine title and excerpt (a missing part contributes an empty string)
sample_crypto["text"] = sample_crypto["title"].fillna('') + " " + sample_crypto["excerpt"].fillna('')

# sample text
sample_crypto_text = sample_crypto[["title", "text", "label"]]
# excerpt-only sample: rows without an excerpt are dropped rather than blanked
sample_crypto_excerpt = sample_crypto[["excerpt", "label"]]
sample_crypto_excerpt = sample_crypto_excerpt.dropna(subset=["excerpt"])

# sample with lemmatized words
# (titles get the same missing-value guard as the combined text above, so
# pre_processing is always handed a string)
sample_crypto_lemmatize_text = sample_crypto_text.copy()
sample_crypto_lemmatize_text["title"] = sample_crypto_lemmatize_text["title"].fillna('').apply(pre_processing, lemmatize=True, stem=False)
sample_crypto_lemmatize_text["text"] = sample_crypto_lemmatize_text["text"].apply(pre_processing, lemmatize=True, stem=False)
sample_crypto_lemmatize_excerpt = sample_crypto_excerpt.copy()
sample_crypto_lemmatize_excerpt["excerpt"] = sample_crypto_lemmatize_excerpt["excerpt"].apply(pre_processing, lemmatize=True, stem=False)

# sample with stemmed words
sample_crypto_stem_text = sample_crypto_text.copy()
sample_crypto_stem_text["title"] = sample_crypto_stem_text["title"].fillna('').apply(pre_processing, lemmatize=False, stem=True)
# the stemmed variant must stem the combined text too (it was lemmatized by mistake)
sample_crypto_stem_text["text"] = sample_crypto_stem_text["text"].apply(pre_processing, lemmatize=False, stem=True)
sample_crypto_stem_excerpt = sample_crypto_excerpt.copy()
sample_crypto_stem_excerpt["excerpt"] = sample_crypto_stem_excerpt["excerpt"].apply(pre_processing, lemmatize=False, stem=True)

# separate lemmatized X and y
X_crypto_lemmatize_text = sample_crypto_lemmatize_text[["title", "text"]]
y_crypto_lemmatize_text = sample_crypto_lemmatize_text["label"]
X_crypto_lemmatize_excerpt = sample_crypto_lemmatize_excerpt["excerpt"]
y_crypto_lemmatize_excerpt = sample_crypto_lemmatize_excerpt["label"]

# separate stemmed X and y
X_crypto_stem_text = sample_crypto_stem_text[["title", "text"]]
y_crypto_stem_text = sample_crypto_stem_text["label"]
X_crypto_stem_excerpt = sample_crypto_stem_excerpt["excerpt"]
y_crypto_stem_excerpt = sample_crypto_stem_excerpt["label"]

### Reddit Data

In [10]:
# load the reddit sample, keeping only the columns used below
sample_reddit = pd.read_csv("data/sample_reddit.csv", header=0)[["title", "excerpt", "label"]]

# text = title followed by the excerpt when one exists
sample_reddit["text"] = sample_reddit["title"].fillna('') + " " + sample_reddit["excerpt"].fillna('')

# lemmatized variant: a new frame whose text column is cleaned and lemmatized
sample_reddit_lemmatize = sample_reddit.assign(
    text=sample_reddit["text"].apply(pre_processing, lemmatize=True, stem=False)
)

# stemmed variant: a new frame whose text column is cleaned and stemmed
sample_reddit_stem = sample_reddit.assign(
    text=sample_reddit["text"].apply(pre_processing, lemmatize=False, stem=True)
)

# features / target for the lemmatized variant
X_reddit_lemmatize, y_reddit_lemmatize = sample_reddit_lemmatize["text"], sample_reddit_lemmatize["label"]

# features / target for the stemmed variant
X_reddit_stem, y_reddit_stem = sample_reddit_stem["text"], sample_reddit_stem["label"]

### Twitter Data

In [11]:
sample_twitter = pd.read_csv("data/sample_twitter.csv", header=0)[["text", "label"]]

# flatten multi-line tweets: a line break becomes a space so the last word of
# one line is not fused with the first word of the next
sample_twitter["text"] = sample_twitter["text"].str.replace("\n", " ", regex=False)

# sample with lemmatized words
sample_twitter_lemmatize = sample_twitter.copy()
sample_twitter_lemmatize["text"] = sample_twitter_lemmatize["text"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))

# sample with stemmed words
sample_twitter_stem = sample_twitter.copy()
sample_twitter_stem["text"] = sample_twitter_stem["text"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))

# separate lemmatized X and y
X_twitter_lemmatize = sample_twitter_lemmatize["text"]
y_twitter_lemmatize = sample_twitter_lemmatize["label"]

# separate stemmed X and y
X_twitter_stem = sample_twitter_stem["text"]
y_twitter_stem = sample_twitter_stem["label"]

### Combined Data

In [20]:
# stack the three lemmatized sources (crypto news, reddit, twitter) on their shared columns
sample_combined_lemmatize = pd.concat(
    [
        frame[["text", "label"]]
        for frame in (sample_crypto_lemmatize_text, sample_reddit_lemmatize, sample_twitter_lemmatize)
    ]
)

# same stacking for the stemmed sources
sample_combined_stem = pd.concat(
    [
        frame[["text", "label"]]
        for frame in (sample_crypto_stem_text, sample_reddit_stem, sample_twitter_stem)
    ]
)

# features / target for the lemmatized variant
X_combined_lemmatize, y_combined_lemmatize = sample_combined_lemmatize["text"], sample_combined_lemmatize["label"]

# features / target for the stemmed variant
X_combined_stem, y_combined_stem = sample_combined_stem["text"], sample_combined_stem["label"]

## Train-Test Split
### Conventional and Cryptonews Data

In [32]:
# Every split below is done in two passes with the same seed: 20% is held out
# for test, then 25% of the remaining 80% (20% overall) becomes validation.

# lemmatized title/text
(
    X_crypto_lemmatize_text_train,
    X_crypto_lemmatize_text_test,
    y_crypto_lemmatize_text_train,
    y_crypto_lemmatize_text_test,
) = train_test_split(
    X_crypto_lemmatize_text,
    y_crypto_lemmatize_text,
    test_size=0.2,
    random_state=123,
)
(
    X_crypto_lemmatize_text_train,
    X_crypto_lemmatize_text_val,
    y_crypto_lemmatize_text_train,
    y_crypto_lemmatize_text_val,
) = train_test_split(
    X_crypto_lemmatize_text_train,
    y_crypto_lemmatize_text_train,
    test_size=0.25,
    random_state=123,
)
# only the training frame is separated into its title and text columns here;
# the validation and test frames keep both columns
X_crypto_lemmatize_title_train = X_crypto_lemmatize_text_train["title"]
X_crypto_lemmatize_text_train = X_crypto_lemmatize_text_train["text"]

# lemmatized excerpt
(
    X_crypto_lemmatize_excerpt_train,
    X_crypto_lemmatize_excerpt_test,
    y_crypto_lemmatize_excerpt_train,
    y_crypto_lemmatize_excerpt_test,
) = train_test_split(
    X_crypto_lemmatize_excerpt,
    y_crypto_lemmatize_excerpt,
    test_size=0.2,
    random_state=123,
)
(
    X_crypto_lemmatize_excerpt_train,
    X_crypto_lemmatize_excerpt_val,
    y_crypto_lemmatize_excerpt_train,
    y_crypto_lemmatize_excerpt_val,
) = train_test_split(
    X_crypto_lemmatize_excerpt_train,
    y_crypto_lemmatize_excerpt_train,
    test_size=0.25,
    random_state=123,
)

# stemmed title/text
(
    X_crypto_stem_text_train,
    X_crypto_stem_text_test,
    y_crypto_stem_text_train,
    y_crypto_stem_text_test,
) = train_test_split(
    X_crypto_stem_text,
    y_crypto_stem_text,
    test_size=0.2,
    random_state=123,
)
(
    X_crypto_stem_text_train,
    X_crypto_stem_text_val,
    y_crypto_stem_text_train,
    y_crypto_stem_text_val,
) = train_test_split(
    X_crypto_stem_text_train,
    y_crypto_stem_text_train,
    test_size=0.25,
    random_state=123,
)
# as above: only the training frame is separated into title and text
X_crypto_stem_title_train = X_crypto_stem_text_train["title"]
X_crypto_stem_text_train = X_crypto_stem_text_train["text"]

# stemmed excerpt
(
    X_crypto_stem_excerpt_train,
    X_crypto_stem_excerpt_test,
    y_crypto_stem_excerpt_train,
    y_crypto_stem_excerpt_test,
) = train_test_split(
    X_crypto_stem_excerpt,
    y_crypto_stem_excerpt,
    test_size=0.2,
    random_state=123,
)
(
    X_crypto_stem_excerpt_train,
    X_crypto_stem_excerpt_val,
    y_crypto_stem_excerpt_train,
    y_crypto_stem_excerpt_val,
) = train_test_split(
    X_crypto_stem_excerpt_train,
    y_crypto_stem_excerpt_train,
    test_size=0.25,
    random_state=123,
)

### Reddit Data

In [37]:
# Two-pass split with a fixed seed: 20% held out for test, then 25% of the
# remaining 80% (20% overall) held out for validation.

# lemmatized reddit text
(
    X_reddit_lemmatize_train,
    X_reddit_lemmatize_test,
    y_reddit_lemmatize_train,
    y_reddit_lemmatize_test,
) = train_test_split(
    X_reddit_lemmatize,
    y_reddit_lemmatize,
    test_size=0.2,
    random_state=123,
)
(
    X_reddit_lemmatize_train,
    X_reddit_lemmatize_val,
    y_reddit_lemmatize_train,
    y_reddit_lemmatize_val,
) = train_test_split(
    X_reddit_lemmatize_train,
    y_reddit_lemmatize_train,
    test_size=0.25,
    random_state=123,
)

# stemmed reddit text
(
    X_reddit_stem_train,
    X_reddit_stem_test,
    y_reddit_stem_train,
    y_reddit_stem_test,
) = train_test_split(
    X_reddit_stem,
    y_reddit_stem,
    test_size=0.2,
    random_state=123,
)
(
    X_reddit_stem_train,
    X_reddit_stem_val,
    y_reddit_stem_train,
    y_reddit_stem_val,
) = train_test_split(
    X_reddit_stem_train,
    y_reddit_stem_train,
    test_size=0.25,
    random_state=123,
)

### Twitter Data

In [41]:
# Two-pass split with a fixed seed: 20% held out for test, then 25% of the
# remaining 80% (20% overall) held out for validation.

# lemmatized tweets
(
    X_twitter_lemmatize_train,
    X_twitter_lemmatize_test,
    y_twitter_lemmatize_train,
    y_twitter_lemmatize_test,
) = train_test_split(
    X_twitter_lemmatize,
    y_twitter_lemmatize,
    test_size=0.2,
    random_state=123,
)
(
    X_twitter_lemmatize_train,
    X_twitter_lemmatize_val,
    y_twitter_lemmatize_train,
    y_twitter_lemmatize_val,
) = train_test_split(
    X_twitter_lemmatize_train,
    y_twitter_lemmatize_train,
    test_size=0.25,
    random_state=123,
)

# stemmed tweets
(
    X_twitter_stem_train,
    X_twitter_stem_test,
    y_twitter_stem_train,
    y_twitter_stem_test,
) = train_test_split(
    X_twitter_stem,
    y_twitter_stem,
    test_size=0.2,
    random_state=123,
)
(
    X_twitter_stem_train,
    X_twitter_stem_val,
    y_twitter_stem_train,
    y_twitter_stem_val,
) = train_test_split(
    X_twitter_stem_train,
    y_twitter_stem_train,
    test_size=0.25,
    random_state=123,
)

### Combined Data

In [42]:
# Two-pass split with a fixed seed: 20% held out for test, then 25% of the
# remaining 80% (20% overall) held out for validation.

# lemmatized combined corpus
(
    X_combined_lemmatize_train,
    X_combined_lemmatize_test,
    y_combined_lemmatize_train,
    y_combined_lemmatize_test,
) = train_test_split(
    X_combined_lemmatize,
    y_combined_lemmatize,
    test_size=0.2,
    random_state=123,
)
(
    X_combined_lemmatize_train,
    X_combined_lemmatize_val,
    y_combined_lemmatize_train,
    y_combined_lemmatize_val,
) = train_test_split(
    X_combined_lemmatize_train,
    y_combined_lemmatize_train,
    test_size=0.25,
    random_state=123,
)

# stemmed combined corpus
(
    X_combined_stem_train,
    X_combined_stem_test,
    y_combined_stem_train,
    y_combined_stem_test,
) = train_test_split(
    X_combined_stem,
    y_combined_stem,
    test_size=0.2,
    random_state=123,
)
(
    X_combined_stem_train,
    X_combined_stem_val,
    y_combined_stem_train,
    y_combined_stem_val,
) = train_test_split(
    X_combined_stem_train,
    y_combined_stem_train,
    test_size=0.25,
    random_state=123,
)

## Formatting Data
Prepare the data in the format required by the fastText model.

In [43]:
def format_data(row, include_title=True, include_excerpt=False):
    """Turn one labelled row into a token list whose first item is the label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a "label" entry
            and, for every field that is switched on, a "title" / "excerpt" entry.
        include_title: If True, append the tokens of ``row["title"]``.
        include_excerpt: If True, append the tokens of ``row["excerpt"]``.

    Returns:
        ``["__label__<label>", token, token, ...]``. A selected field whose
        value is missing (``None`` / NaN) or blank contributes no tokens.
    """
    # label goes first, prefixed with "__label__"
    current_row = ["__label__" + str(row["label"])]

    selected_fields = []
    if include_title:
        selected_fields.append("title")
    if include_excerpt:
        selected_fields.append("excerpt")

    for field in selected_fields:
        value = row[field]
        # empty CSV cells arrive as NaN (a float), which the tokenizer cannot
        # handle; a blank string has no tokens either, so both are skipped
        if not isinstance(value, str) or not value.strip():
            continue
        current_row.extend(nltk.word_tokenize(value))
    return current_row

In [44]:
def save_format_data(data_list, filename_list):
    """Write each dataset to its own space-delimited text file, one row per line.

    Args:
        data_list: Sequence of datasets; each dataset is an iterable of rows and
            each row an iterable of tokens (label first).
        filename_list: Output paths, paired positionally with ``data_list``.

    Raises:
        ValueError: If the two lists do not have the same length.
    """
    if len(data_list) != len(filename_list):
        raise ValueError(
            "data_list and filename_list must have the same length "
            f"(got {len(data_list)} and {len(filename_list)})"
        )

    for rows, filename in zip(data_list, filename_list):
        # newline="" lets the csv writer control line endings, so every line
        # ends with exactly the "\n" requested below on any platform
        with open(filename, "w", newline="") as csvoutfile:
            csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
            csv_writer.writerows(rows)

In [None]:

# Build the fastText-formatted rows (label + tokens) for three field selections:
# title only, excerpt only, and title + excerpt.
# NOTE(review): `sample_lemmatize` and `sample_stem` are not assigned in any cell
# shown in this notebook; the frames built earlier carry source-specific names
# (crypto / reddit / twitter / combined). Confirm which frame is meant -- it needs
# "title", "excerpt" and "label" columns for format_data to work.
sample_lemmatize_title = sample_lemmatize.apply(lambda x: format_data(x, include_title=True, include_excerpt=False), axis=1)
sample_lemmatize_excerpt = sample_lemmatize.apply(lambda x: format_data(x, include_title=False, include_excerpt=True), axis=1)
sample_lemmatize_all = sample_lemmatize.apply(lambda x: format_data(x, include_title=True, include_excerpt=True), axis=1)

# same three selections for the stemmed frame
sample_stem_title = sample_stem.apply(lambda x: format_data(x, include_title=True, include_excerpt=False), axis=1)
sample_stem_excerpt = sample_stem.apply(lambda x: format_data(x, include_title=False, include_excerpt=True), axis=1)
sample_stem_all = sample_stem.apply(lambda x: format_data(x, include_title=True, include_excerpt=True), axis=1)

In [None]:
# datasets to export; each entry is paired by position with a path in filename_list
data_list = [
    sample_lemmatize_title,
    sample_lemmatize_excerpt,
    sample_lemmatize_all,
    sample_stem_title,
    sample_stem_excerpt,
    sample_stem_all,
]
filename_list = [
    "data/fasttext/sample_lemmatize_title.txt",
    "data/fasttext/sample_lemmatize_excerpt.txt",
    "data/fasttext/sample_lemmatize_all.txt",
    "data/fasttext/sample_stem_title.txt",
    "data/fasttext/sample_stem_excerpt.txt",
    "data/fasttext/sample_stem_all.txt",
]
save_format_data(data_list=data_list, filename_list=filename_list)

## Train Model

In [4]:
# train a supervised fastText classifier on the lemmatized-title file, starting
# from pretrained word vectors (dim=300 presumably matches the "300d" vector file)
model = fasttext.train_supervised(
    'data/fasttext/sample_lemmatize_title.txt',
    dim=300,
    pretrained_vectors="utils/fasttext/wiki-news-300d-1M.vec",
)

In [6]:

# NOTE(review): the argument here is the training-file path, and the recorded
# output has the same single-prediction shape as the literal sentence predicted
# further down -- so this appears to classify the path string itself rather
# than the rows stored in the file. If an evaluation over the file was intended,
# fastText's `model.test(path)` is presumably the call to use -- confirm.
model.predict('data/fasttext/sample_lemmatize_title.txt')

(('__label__LOW',), array([1.00001001]))

In [7]:
# NOTE(review): `sample_lemmatize_title` is only assigned in the formatting cell
# earlier in the notebook, which has no execution count; in the recorded run
# this lookup raised NameError. Run that cell first or drop this inspection cell.
sample_lemmatize_title.head()

NameError: name 'sample_lemmatize_title' is not defined

In [28]:
# ad-hoc spot check on a hand-written headline
# NOTE(review): the string is passed as typed (punctuation included), whereas
# the training text appears to be run through pre_processing first -- confirm
# whether prediction inputs should be cleaned the same way.
model.predict("hack: person xx billion")

(('__label__HIGH',), array([0.99131745]))