**Sentiment analysis - NLP using LSTM:**

Import, install and load dependencies:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Install the third-party text-cleaning / NLP packages imported further down
# (contractions, clean-text -> `cleantext`, spacy) into the kernel's environment.
# NOTE(review): versions are unpinned - consider pinning them for reproducible runs.
%pip install contractions clean-text spacy

In [None]:
# Download the small English spaCy model that is loaded later via spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

**Define preprocess logic:**

In [None]:
import contractions
from cleantext import clean
import spacy
import re

# Shared spaCy English pipeline; preprocess() uses it to tokenise text and to flag stop words.
nlp = spacy.load('en_core_web_sm')

# Expand contractions, clean the text, collapse whitespace and drop stop words.
def preprocess(doc):
  """Normalise one piece of text and return its non-stop-word tokens.

  Steps: expand contractions, clean the text (lower-case; strip emoji, URLs and
  punctuation), collapse whitespace runs, tokenise with spaCy and discard
  tokens flagged as stop words.

  Args:
    doc: a spaCy ``Doc`` (any object exposing ``.text``) or a plain ``str``.

  Returns:
    list[str]: texts of the remaining tokens, in their original order.
  """
  # Accept raw strings too, so callers are not forced to run a spaCy pipeline
  # just to hand over the text.
  text = doc.text if hasattr(doc, 'text') else str(doc)
  txt = contractions.fix(text)

  txt = clean(
      txt,
      lower=True,
      no_emoji=True,
      no_urls=True,
      no_punct=True,
      replace_with_url=""  # drop URLs entirely instead of leaving a placeholder
  )

  # Cleaning can leave runs of whitespace behind; squeeze them to single spaces.
  txt = re.sub(r'\s+', ' ', txt).strip()

  # Only tokenisation and the lexical `is_stop` flag are needed here, so run the
  # tokenizer alone (make_doc) rather than the full tagger/parser/NER pipeline.
  return [token.text for token in nlp.make_doc(txt) if not token.is_stop]

In [None]:
# Smoke test: one messy sentence exercising stop words, quotes, capitals,
# contractions, a mis-encoded symbol, a URL and irregular spacing.
test_text = " and the a an you; your, they. 'quote', ALL_CAPS, oughtn't it be? won't, can't causes but airline is beautifull â™¥, contact http://t.co/aQjn4HwNaC, test  space, ...  "
# nlp.pipe takes an iterable of texts, hence the single-element list.
processed_test_text = list(map(preprocess, nlp.pipe([test_text])))
print(processed_test_text)

**Load data:**

In [None]:
# Location of the tweets CSV that this notebook reads.
TWEETS_CSV = "/content/Tweets.csv"

df = pd.read_csv(TWEETS_CSV)
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Number of tweets per value of the target column.
sentiment_counts = df['airline_sentiment'].value_counts()
sentiment_counts

**Select and format fields:**

In [None]:
#Take relevant columns: 'airline_sentiment' and 'text'. Fix target field with numbers
ddf = df[['airline_sentiment', 'text']]
ddf.loc[:, 'airline_sentiment'] = ddf['airline_sentiment'].apply(lambda x: 0 if x=="negative" else 1 if x=="neutral" else 2)

**Preprocess textual data:**

In [None]:
# preprocess() only reads `doc.text` and tokenises the cleaned string itself, so
# running the full pipeline (tagger, parser, NER) on every raw tweet is wasted work.
# tokenizer.pipe yields tokenised-only Doc objects, which is all preprocess() needs.
text_processed = [preprocess(doc) for doc in nlp.tokenizer.pipe(ddf['text'])]
text_processed[:5]

**Tokenize text to numbers:**

In [None]:
from tensorflow.keras.layers import TextVectorization

# Vocabulary cap: only the most frequent words are kept, the rest become OOV.
max_words = 10000

# Every sequence is padded/truncated to the token count of the longest tweet.
max_len = max(map(len, text_processed))

vectorize_layer = TextVectorization(
    output_mode='int',
    max_tokens=max_words,
    output_sequence_length=max_len,
)

# adapt() is the step that builds the vocabulary; it expects strings, so each
# token list is joined back into a space-separated sentence first.
vectorize_layer.adapt(list(map(' '.join, text_processed)))

**Split data for validation set:**

In [None]:
# Hold out the last 20% of rows for validation (`percent` is a row count, not a ratio).
percent = round(len(ddf) * 0.2)

# Slice with an explicit positive index: with `[-percent:]` / `[:-percent]`, a
# hold-out of 0 rows would silently put *everything* in validation and leave
# training empty, because -0 == 0.
split_at = len(ddf) - percent
labels = ddf['airline_sentiment']

x_train, x_val = text_processed[:split_at], text_processed[split_at:]
y_train, y_val = labels.iloc[:split_at], labels.iloc[split_at:]

**Model architecture:**

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Dense, Input, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential

# Size of the learned word vectors.
embedding_dim = 128

# Embedding -> dropout over whole embedding channels -> LSTM -> 3-way softmax
# (one output per sentiment class).
model = Sequential([
    # Declare the input shape with an Input layer; Embedding's `input_length`
    # argument is deprecated in Keras 3 and only triggers a warning there.
    Input(shape=(max_len,), dtype='int64'),
    Embedding(max_words, embedding_dim),
    SpatialDropout1D(0.25),
    # NOTE(review): a non-zero recurrent_dropout is documented to rule out the
    # fast cuDNN LSTM kernel - confirm the regularisation is worth the slowdown.
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])

In [None]:
import numpy as np

# Re-join each token list into a sentence and turn it into a fixed-length row of
# integer ids using the vectorizer adapted earlier.
x_train_vectorized = vectorize_layer(list(map(' '.join, x_train)))
x_val_vectorized = vectorize_layer(list(map(' '.join, x_val)))

# Labels as integer NumPy arrays, the form sparse categorical cross-entropy expects.
y_train_numeric = np.array(y_train).astype(int)
y_val_numeric = np.array(y_val).astype(int)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    x_train_vectorized,
    y_train_numeric,
    batch_size=64,
    epochs=10,
    validation_data=(x_val_vectorized, y_val_numeric),
)
