# Fasttext vs LSTM

![fasttext](https://opendatascience.com/wp-content/uploads/2021/01/ogimage-e1610396279996-300x89.png)

### What is Fasttext?

FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices.

### What is LSTM?

Long Short-Term Memory (LSTM) is a special kind of recurrent neural network capable of learning long-term dependencies; remembering information for long periods is its default behaviour. There are three steps in an LSTM network:
- Step 1: The network decides what to forget and what to remember.
- Step 2: It selectively updates cell state values.
- Step 3: The network decides what part of the current state makes it to the output.

In [None]:
!pip install num2words
!pip install fasttext

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from num2words import num2words
import os
import fasttext
import fasttext.util
from sklearn.svm import SVC
from sklearn import metrics

Load pre-trained model from fasttext

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

In [None]:
!gzip -d ./cc.en.300.bin.gz

In [None]:
# Load the pre-trained fastText binary model (cc.en.300.bin) that the cells
# above downloaded and decompressed; `ft` is used later to embed whole tweets.
ft = fasttext.load_model('cc.en.300.bin')

Preprocess functions

In [None]:
# Shared NLTK normalizers, created once and reused by steming() and
# lemmetizing() below.
stemmer = PorterStemmer()
lemmatizer= WordNetLemmatizer()

In [None]:
def remove_url(x):
    """Strip @mentions and URLs from a piece of text.

    Args:
        x: The raw text.

    Returns:
        The text with every ``@word`` mention and every ``scheme://host/path``
        style URL deleted. Whitespace around the removed spans is kept.
    """
    mention_free = re.sub(r'@\w+', '', x)
    url_free = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', mention_free)
    return url_free

In [None]:
# Lazily-filled cache of the NLTK English stop words (see remove_stopwords).
_ENGLISH_STOP_WORDS = None


def remove_stopwords(x):
    """Drop English stop words from a list of tokens.

    The stop-word list is read from the NLTK corpus only on the first call and
    cached afterwards; this function is applied once per tweet, so reloading
    the corpus and rebuilding the set on every call was pure overhead.

    Args:
        x: Iterable of tokens.

    Returns:
        A new list with the tokens of ``x`` that are not NLTK English stop
        words, in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return [word for word in x if word not in _ENGLISH_STOP_WORDS]

In [None]:
def number_remove(tokens):  # Alternatively, numeric values could be converted to words.
    """Keep only the purely alphabetic tokens.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list containing the tokens for which ``str.isalpha()`` is true,
        so numbers and mixed tokens such as ``covid19`` are dropped.
    """
    alphabetic = []
    for token in tokens:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

In [None]:
def steming(tokens):
    """Stem every token with the module-level ``stemmer``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each token, in order.
    """
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

In [None]:
def lemmetizing(tokens):
    """Lemmatize every token with the module-level ``lemmatizer``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list holding ``lemmatizer.lemmatize(token)`` for each token,
        in order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
def number_to_word(tokens):
    """Spell out the numeric tokens and leave every other token unchanged.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list where each token made only of decimal digits is replaced by
        ``num2words(token)``; all other tokens are passed through as-is.
    """
    # isdecimal() rather than isdigit(): isdigit() is also true for characters
    # such as superscript digits ('²'), which cannot be parsed as a number and
    # would abort the whole preprocessing run on a single odd token.
    return [num2words(word) if word.isdecimal() else word for word in tokens]

In [None]:
def list_to_str(tokens):
    """Join tokens into one space-separated string.

    Args:
        tokens: Iterable of items; each one is converted with ``str()``.

    Returns:
        The items joined by single spaces (an empty string for no items).
    """
    return ' '.join(map(str, tokens))

Vectorize tweets with fasttext pre-trained model

In [None]:
def preprocess_ft_vector(df):
    """Clean the tweets and embed each one with the pre-trained fastText model.

    Steps applied to ``df['OriginalTweet']``: remove mentions/URLs, lowercase,
    strip ASCII punctuation, tokenize, drop stop words, spell out numbers,
    discard rows left with no tokens, and re-join the tokens with spaces.

    The input frame is not modified; the result is an independent copy. The
    original version assigned into the caller's frame and then into a filtered
    slice of it, which triggers pandas' SettingWithCopy warnings and leaves the
    caller's data half-processed.

    Args:
        df: DataFrame with an ``OriginalTweet`` column of strings.

    Returns:
        A new DataFrame containing only the rows whose tweet still has at
        least one token, with ``OriginalTweet`` replaced by the cleaned text
        and a new ``fasttext`` column holding ``ft.get_sentence_vector(text)``.
    """
    # Build the translation table once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)

    tokens = (
        df['OriginalTweet']
        .apply(remove_url)
        .apply(lambda text: text.lower().translate(punctuation_table))
        .apply(word_tokenize)
        .apply(remove_stopwords)
        .apply(number_to_word)
    )

    # Rows whose tweet was reduced to nothing carry no signal; drop them.
    non_empty = ~tokens.str.len().eq(0)
    df = df.loc[non_empty].copy()
    df['OriginalTweet'] = tokens[non_empty].apply(list_to_str)
    df['fasttext'] = df['OriginalTweet'].apply(ft.get_sentence_vector)
    return df

Add the `__label__` prefix that fastText needs when it is trained from scratch (without pre-trained vectors)

In [None]:
def preprocess_fasttext(df):
    """Clean the tweets and format the labels for supervised fastText training.

    Steps applied to ``df['OriginalTweet']``: remove mentions/URLs, lowercase,
    strip ASCII punctuation, tokenize, drop stop words, spell out numbers,
    discard rows left with no tokens, and re-join the tokens with spaces.
    Each ``Sentiment`` value is then prefixed with ``__label__``.

    The input frame is not modified; the result is an independent copy. The
    original version assigned into the caller's frame and then into a filtered
    slice of it, which triggers pandas' SettingWithCopy warnings and leaves the
    caller's data half-processed.

    Args:
        df: DataFrame with ``OriginalTweet`` (strings) and ``Sentiment``
            columns.

    Returns:
        A new DataFrame containing only the rows whose tweet still has at
        least one token, with ``OriginalTweet`` replaced by the cleaned text
        and ``Sentiment`` replaced by ``'__label__' + str(value)``.
    """
    # Build the translation table once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)

    tokens = (
        df['OriginalTweet']
        .apply(remove_url)
        .apply(lambda text: text.lower().translate(punctuation_table))
        .apply(word_tokenize)
        .apply(remove_stopwords)
        .apply(number_to_word)
    )

    # Rows whose tweet was reduced to nothing carry no signal; drop them.
    non_empty = ~tokens.str.len().eq(0)
    df = df.loc[non_empty].copy()
    df['OriginalTweet'] = tokens[non_empty].apply(list_to_str)
    df['Sentiment'] = df['Sentiment'].apply(lambda x : '__label__' + str(x))
    return df

Stemming and lemmatizing are useful preprocessing steps for the LSTM model, but not for fastText

In [None]:
def preprocess_lstm(df):
    """Clean and normalize the tweets into token lists for the LSTM model.

    Steps applied to ``df['OriginalTweet']``: remove mentions/URLs, lowercase,
    strip ASCII punctuation, tokenize, drop stop words, drop non-alphabetic
    tokens, stem, lemmatize, and discard rows left with no tokens. Unlike the
    fastText helpers, the tokens are NOT joined back into a string.

    The input frame is not modified; the result is an independent copy. The
    original version assigned into the caller's frame before filtering it,
    which triggers pandas' SettingWithCopy warnings and leaves the caller's
    data processed as a side effect.

    Args:
        df: DataFrame with an ``OriginalTweet`` column of strings.

    Returns:
        A new DataFrame containing only the rows whose tweet still has at
        least one token, with ``OriginalTweet`` replaced by the token list.
    """
    # Build the translation table once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)

    tokens = (
        df['OriginalTweet']
        .apply(remove_url)
        .apply(lambda text: text.lower().translate(punctuation_table))
        .apply(word_tokenize)
        .apply(remove_stopwords)
        .apply(number_remove)
        .apply(steming)
        .apply(lemmetizing)
    )

    # Rows whose tweet was reduced to nothing carry no signal; drop them.
    non_empty = ~tokens.str.len().eq(0)
    df = df.loc[non_empty].copy()
    df['OriginalTweet'] = tokens[non_empty]
    return df

In [None]:
# Load the train and test splits from the Kaggle input directory. The files
# are read as latin-1, which is passed explicitly instead of pandas' default.
df_train = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv', 
                        encoding = 'latin-1')

df_test = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv', 
                        encoding = 'latin-1')

### Fasttext classification

In [None]:
# Keep only the label and the text, label first: that is the column order the
# fastText training file needs. Explicit copies make these independent frames,
# so the modifications in the following cells neither touch df_train/df_test
# nor raise pandas' SettingWithCopy warnings.
filtered_df_train = df_train[['Sentiment', 'OriginalTweet']].copy()
filtered_df_test = df_test[['Sentiment', 'OriginalTweet']].copy()

In [None]:
# Collapse the five sentiment classes into three by folding the "Extremely"
# variants into their base class.
encoding = {'Extremely Negative': 'Negative',
            'Negative': 'Negative',
            'Neutral': 'Neutral',
            'Positive':'Positive',
            'Extremely Positive': 'Positive'
           }

# Assign the remapped column back to the frame. Calling
# replace(..., inplace=True) on the Series returned by df['Sentiment'] is a
# chained in-place update that is not guaranteed to reach the DataFrame.
filtered_df_train = filtered_df_train.assign(
    Sentiment=filtered_df_train['Sentiment'].replace(encoding))
filtered_df_test = filtered_df_test.assign(
    Sentiment=filtered_df_test['Sentiment'].replace(encoding))

In [None]:
# Clean the tweets and add the `__label__` prefix to the labels for both splits.
filtered_df_train = preprocess_fasttext(filtered_df_train)
filtered_df_test = preprocess_fasttext(filtered_df_test)

Write the dataframes to text files so that fastText can train on them

In [None]:
# Write one "<label> <text>" line per row for fastText.
# DataFrame.to_csv(sep=' ') is not suitable here: CSV quoting wraps every field
# that contains the separator (i.e. every multi-word tweet) in double quotes,
# and those quote characters would end up glued to the first and last token.
with open('train.txt', 'w', encoding='utf-8') as train_file:
    train_file.writelines(
        '{} {}\n'.format(label, text)
        for label, text in zip(filtered_df_train['Sentiment'],
                               filtered_df_train['OriginalTweet'])
    )

In [None]:
# Write one "<label> <text>" line per row for fastText.
# DataFrame.to_csv(sep=' ') is not suitable here: CSV quoting wraps every field
# that contains the separator (i.e. every multi-word tweet) in double quotes,
# and those quote characters would end up glued to the first and last token.
with open('test.txt', 'w', encoding='utf-8') as test_file:
    test_file.writelines(
        '{} {}\n'.format(label, text)
        for label, text in zip(filtered_df_test['Sentiment'],
                               filtered_df_test['OriginalTweet'])
    )

In [None]:
# Train a supervised fastText classifier from scratch on train.txt
# (5 epochs, 200-dimensional vectors).
ft_model = fasttext.train_supervised(input='train.txt', epoch=5, dim=200)

In [None]:
# Evaluate on the test file. Element 1 of the returned tuple is used as the
# model's score in the final comparison (presumably precision@1 per the
# fastText Python API — confirm).
ft_scores = ft_model.test('test.txt')
ft_scores[1]

### Fasttext classification with pre-trained vectors

In [None]:
# Start again from the raw data for the pre-trained-vector experiment.
# Explicit copies make these independent frames, so the modifications in the
# following cells neither touch df_train/df_test nor raise pandas'
# SettingWithCopy warnings.
filtered_df_train = df_train[['OriginalTweet', 'Sentiment']].copy()
filtered_df_test = df_test[['OriginalTweet', 'Sentiment']].copy()

In [None]:
# Collapse the five sentiment classes into three integer classes:
# 0 = negative, 1 = neutral, 2 = positive.
encoding = {'Extremely Negative': 0,
            'Negative': 0,
            'Neutral': 1,
            'Positive':2,
            'Extremely Positive': 2
           }

# Assign the encoded column back to the frame. Calling
# replace(..., inplace=True) on the Series returned by df['Sentiment'] is a
# chained in-place update that is not guaranteed to reach the DataFrame.
# map() builds the integer column directly instead of relying on replace() to
# infer a numeric dtype; a label missing from `encoding` would become NaN.
filtered_df_train = filtered_df_train.assign(
    Sentiment=filtered_df_train['Sentiment'].map(encoding))
filtered_df_test = filtered_df_test.assign(
    Sentiment=filtered_df_test['Sentiment'].map(encoding))

In [None]:
# Clean the tweets and add the `fasttext` sentence-vector column for both splits.
filtered_df_train = preprocess_ft_vector(filtered_df_train)
filtered_df_test = preprocess_ft_vector(filtered_df_test)

In [None]:
# Stack the per-tweet vectors into one feature array (one entry per tweet
# along the first axis) and pull the encoded labels out as an array.
X_train = np.stack(filtered_df_train['fasttext'])
y_train = np.array(filtered_df_train['Sentiment'])

In [None]:
# Same conversion for the test split.
X_test = np.stack(filtered_df_test['fasttext'])
y_test = np.array(filtered_df_test['Sentiment'])

Use Support Vector Classification for vectorized tweets

In [None]:
# Support vector classifier with scikit-learn's default settings.
svm = SVC()

In [None]:
# Fit the classifier on the fastText sentence vectors of the training tweets.
svm.fit(X_train, y_train)

In [None]:
# Predict a sentiment class for every test tweet.
svm_pred= svm.predict(X_test)

In [None]:
# Accuracy of the SVM predictions against the true test labels; reused in the
# final score comparison.
svm_score = metrics.accuracy_score(y_test, svm_pred)
svm_score

### LSTM Classification

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [None]:
# Keras tokenizer with default settings; it is fitted on the training tweets
# further below.
tokenizer = Tokenizer()

In [None]:
# Start again from the raw data for the LSTM experiment.
# Explicit copies make these independent frames, so the modifications in the
# following cells neither touch df_train/df_test nor raise pandas'
# SettingWithCopy warnings.
filtered_df_train = df_train[['OriginalTweet', 'Sentiment']].copy()
filtered_df_test = df_test[['OriginalTweet', 'Sentiment']].copy()

In [None]:
# Collapse the five sentiment classes into three integer classes:
# 0 = negative, 1 = neutral, 2 = positive.
encoding = {'Extremely Negative': 0,
            'Negative': 0,
            'Neutral': 1,
            'Positive':2,
            'Extremely Positive': 2
           }

# Assign the encoded column back to the frame. Calling
# replace(..., inplace=True) on the Series returned by df['Sentiment'] is a
# chained in-place update that is not guaranteed to reach the DataFrame.
# map() builds the integer column directly instead of relying on replace() to
# infer a numeric dtype; a label missing from `encoding` would become NaN.
filtered_df_train = filtered_df_train.assign(
    Sentiment=filtered_df_train['Sentiment'].map(encoding))
filtered_df_test = filtered_df_test.assign(
    Sentiment=filtered_df_test['Sentiment'].map(encoding))

In [None]:
# Turn every tweet into a list of cleaned, stemmed and lemmatized tokens.
filtered_df_train = preprocess_lstm(filtered_df_train)
filtered_df_test = preprocess_lstm(filtered_df_test)

In [None]:
# Build the vocabulary from the training tokens only.
train_tokens = filtered_df_train['OriginalTweet']
tokenizer.fit_on_texts(train_tokens)
# One more than the number of known words; used as the embedding input size.
vocab_len = 1 + len(tokenizer.word_index)

# Longest training tweet, measured in tokens; used as the padding length.
max_len = train_tokens.map(len).max()
print(vocab_len, max_len)

In [None]:
# Convert the token lists to integer sequences using the vocabulary fitted on
# the training split.
X_train = tokenizer.texts_to_sequences(filtered_df_train['OriginalTweet'])
X_test = tokenizer.texts_to_sequences(filtered_df_test['OriginalTweet'])

# Integer-encoded sentiment labels (0, 1, 2).
y_train = filtered_df_train['Sentiment']
y_test = filtered_df_test['Sentiment']

In [None]:
# Pad at the end (padding='post') so every sequence has length max_len, which
# was measured on the training split.
X_train = pad_sequences(X_train, maxlen= max_len, padding='post')
X_test = pad_sequences(X_test, maxlen= max_len, padding='post')

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the three sentiment classes to match the 3-unit softmax
# output and the categorical cross-entropy loss used below.
y_train = to_categorical(y_train, 3)
y_test = to_categorical(y_test, 3)

In [None]:
import tensorflow.keras.layers as layers
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

In [None]:
# Size of the learned word embeddings.
embedding_dim = 200
# Embedding -> three stacked LSTMs -> average over time steps -> dense head.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_len, embedding_dim, input_length=X_train.shape[1]),
    # return_sequences=True keeps the per-time-step outputs, which the next
    # LSTM layer and the pooling layer consume.
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    # Average the per-time-step outputs into one vector per tweet.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    # One probability per sentiment class.
    tf.keras.layers.Dense(3, activation='softmax')
])

In [None]:
# Categorical cross-entropy matches the one-hot labels; accuracy is tracked so
# the result can be compared with the other two models.
model.compile(loss='categorical_crossentropy',
              optimizer='Adam',metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Train for 10 epochs with mini-batches of 32; no validation data is passed.
model.fit(X_train, y_train, epochs=10, batch_size=32)

In [None]:
# Evaluate on the test split. Element 1 of the result is used as the accuracy
# in the final comparison (the model was compiled with metrics=['accuracy']).
lstm_score = model.evaluate(X_test, y_test)
lstm_score

### Comparison of Scores

In [None]:
# Collect one score per model into a small table for plotting.
scores = pd.DataFrame({'Model' : ['LSTM', 'SVM', 'Fasttext'],
          'Score' : [lstm_score[1], svm_score, ft_scores[1]]})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart with one bar per model.
fig = plt.figure(figsize=(5,5))
ax = sns.barplot(x="Model", y="Score", data=scores)

# Write each score, rounded to three decimals, at the top of its bar.
for index, row in scores.iterrows():
    ax.text(index, row.Score, round(row.Score,3), color='black', ha="center")
    
plt.title("Accuracy Score Table")
plt.show()