# FastText Sentiment Analysis Model

In [1]:
# read and pre-process data
import pandas as pd
import string
import nltk
from nltk.stem import WordNetLemmatizer # word lemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords # stopwords

# format data
import csv

# modelling
import io # to generate pre-trained vector
import fasttext

## Data Pre-Processing

In [2]:
def pre_processing(text, lemmatize=True, stem=False):
    # strip accents\n",
    text = text.encode('ascii', 'ignore')
    text = str(text.decode("utf-8"))

    # covert to lowercase\n",
    text = text.lower()

    # remove punctuation\n",
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))

    # tokenize\n",
    text_words = nltk.word_tokenize(text)

    # lemmatize
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        text_words = [wordnet_lemmatizer.lemmatize(x, pos="v") for x in text_words]

    if stem:
        stemmer = SnowballStemmer("english")
        text_words = [stemmer.stem(x) for x in text_words]

    # remove stop words
    stop = list(stopwords.words('english'))
    keep_stopwords = ["no", "not", "nor"]
    for word in keep_stopwords:
        stop.remove(word)
        stop = set(stop)
    filtered_words = [x for x in text_words if not x in stop]

    return ' '.join(filtered_words)

### Conventional and Cryptonews Data

In [None]:
# sample with words lemmatized
# lemmatize: fradulent -> fradulent
sample_lemmatize = sample.copy()
sample_lemmatize["title"] = sample_lemmatize["title"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))
sample_lemmatize["excerpt"] = sample_lemmatize["excerpt"].apply(lambda x: pre_processing(x, lemmatize=True, stem=False))

# sample with words stemmed
# stem: fradulent -> fradul
sample_stem = sample.copy()
sample_stem["title"] = sample_stem["title"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))
sample_stem["excerpt"] = sample_stem["excerpt"].apply(lambda x: pre_processing(x, lemmatize=False, stem=True))

### Reddit Data

### Twitter Data

### Combined Data

## Formatting Data
Prepare data in format required for fasttext model

In [3]:
def format_data(row, include_title=True, include_excerpt=False):
    current_row = []

    # prepare label
    label = "__label__" + ("HIGH" if row["label"]==1 else "LOW")
    current_row.append(label)

    if include_title:
        current_row.extend(nltk.word_tokenize(row["title"]))
    if include_excerpt:
        current_row.extend(nltk.word_tokenize(row["excerpt"]))
    return current_row

In [None]:
def save_format_data(data_list, filename_list):
    for i in range(len(data_list)):
        with open(labels[i], "w") as csvoutfile:
            csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
            for row in sample_list[i]:
                csv_writer.writerow(row)

In [None]:

sample_lemmatize_title = sample_lemmatize.apply(lambda x: format_data(x, include_title=True, include_excerpt=False), axis=1)
sample_lemmatize_excerpt = sample_lemmatize.apply(lambda x: format_data(x, include_title=False, include_excerpt=True), axis=1)
sample_lemmatize_all = sample_lemmatize.apply(lambda x: format_data(x, include_title=True, include_excerpt=True), axis=1)

sample_stem_title = sample_stem.apply(lambda x: format_data(x, include_title=True, include_excerpt=False), axis=1)
sample_stem_excerpt = sample_stem.apply(lambda x: format_data(x, include_title=False, include_excerpt=True), axis=1)
sample_stem_all = sample_stem.apply(lambda x: format_data(x, include_title=True, include_excerpt=True), axis=1)

In [None]:
# format data
data_list = [sample_lemmatize_title, sample_lemmatize_excerpt, sample_lemmatize_all, sample_stem_title, sample_stem_excerpt, sample_stem_all]
filename_list = ["data/fasttext/sample_lemmatize_title.txt", "data/fasttext/sample_lemmatize_excerpt.txt", \
    "data/fasttext/sample_lemmatize_all.txt", "data/fasttext/sample_stem_title.txt", "data/fasttext/sample_stem_excerpt.txt", \
        "data/fasttext/sample_stem_all.txt"]
save_format_data(data_list=data_list, filename_list=filename_list)

## Train Model

In [4]:
model = fasttext.train_supervised('data/fasttext/sample_lemmatize_title.txt', dim=300, pretrained_vectors="utils/fasttext/wiki-news-300d-1M.vec")

In [6]:

model.predict('data/fasttext/sample_lemmatize_title.txt')

(('__label__LOW',), array([1.00001001]))

In [7]:
sample_lemmatize_title.head()

NameError: name 'sample_lemmatize_title' is not defined