Importing all the necessary libraries
----

In [None]:
# Essential Libraries
import os
import shutil
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import array, asarray, zeros
from string import punctuation as pun

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS
from chat_word_dict import abbreviations
from textblob import TextBlob
from spacy import load as spacy_load

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Deep Learning Libraries
from keras.models import Sequential
from keras.layers import (
    Activation, Dropout, Dense, Flatten, GlobalMaxPooling1D, Embedding, 
    Conv1D, LSTM, SimpleRNN, Bidirectional, GlobalAveragePooling1D
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam

# TensorFlow Libraries
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# Sentence Transformers and Torch
from sentence_transformers import SentenceTransformer
import torch

# TensorFlow Configuration
# Raise the threshold of TensorFlow's logger to ERROR.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')


Preprocessing for Standard ML Models:
----

Data Import

In [2]:
# The CSV is read without a header row (header=None); column names are supplied here.
raw_columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
data = pd.read_csv(
    'data.csv',
    names=raw_columns,
    header=None,
    encoding='utf-8',
)

In [3]:
# Only 'target' and 'text' are kept; the other four columns are discarded.
data = data.drop(columns=['ids', 'date', 'flag', 'user'])

PreProcessing:

Lower-Casing:

In [5]:
# Lower-case every entry of the text column.
lowercased_text = data['text'].str.lower()
data['text'] = lowercased_text

Removing Urls:

In [6]:
# Matches "http://..." / "https://..." up to the next whitespace, or anything starting with "www.".
url_pattern = re.compile(r'https?://\S+|www\.\S+')
data['text'] = data['text'].map(lambda entry: url_pattern.sub('', entry))

Removing Mentions / Hashtags:

In [7]:
# An "@" followed by any run of non-whitespace characters is deleted.
mentions_pattern = re.compile(r'@\S+')
data['text'] = data['text'].map(lambda entry: mentions_pattern.sub('', entry))

In [8]:
# A "#" followed by any run of non-whitespace characters is deleted (the tag word goes with it).
hashs_pattern = re.compile(r'#\S+')
data['text'] = data['text'].map(lambda entry: hashs_pattern.sub('', entry))

Removing Extra White Spaces:

In [9]:
# Collapse every run of whitespace to a single space, then trim both ends.
whitespace_run = re.compile(r'\s+')
data['text'] = data['text'].map(lambda entry: whitespace_run.sub(' ', entry)).str.strip()

Removing Punctuation:

In [10]:
# Take the apostrophe and the full stop out of the set of characters to strip,
# so both survive the punctuation-removal step.
pun = pun.replace("'", '').replace(".", '')
pun

'!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~'

In [11]:
# Translation table that deletes every character still listed in `pun`.
# (Original author's note: probably better to drop this step.)
translator = str.maketrans('', '', pun)
data['text'] = data['text'].map(lambda entry: entry.translate(translator))

Converting Abbreviations:

In [12]:
# One alternation over every key of `abbreviations`, anchored on word boundaries.
escaped_keys = (re.escape(key) for key in abbreviations)
pattern = r'\b(' + '|'.join(escaped_keys) + r')\b'
abbreviation_regex = re.compile(pattern)


def _expand_abbreviation(match):
    """Look up the matched text (lower-cased) in `abbreviations` and return its value."""
    return abbreviations[match.group(0).lower()]


data['text'] = data['text'].map(lambda entry: abbreviation_regex.sub(_expand_abbreviation, entry))

Removing Stopwords:

In [13]:
def _drop_stopwords(sentence):
    """Return `sentence` rebuilt from its whitespace-separated tokens that are not in STOPWORDS."""
    kept_tokens = [token for token in sentence.split() if token.lower() not in STOPWORDS]
    return ' '.join(kept_tokens)


data['text'] = data['text'].map(_drop_stopwords)

Removing Emojis:

In [None]:
# Compiled once instead of on every call.
#
# The previous character class contained the single range U+24C2..U+1F251,
# which spans CJK ideographs, kana, Hangul and many other scripts, so ordinary
# non-Latin text was deleted along with emoji. It is replaced below by the
# symbol blocks inside that span that actually hold emoji-like characters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumeric/ideographic supplements
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` with every run of characters from the emoji/symbol blocks
        listed in `_EMOJI_PATTERN` deleted. All other characters, including
        non-Latin letters, are left untouched. Surrounding whitespace is not
        collapsed.
    """
    return _EMOJI_PATTERN.sub('', text)

# Run the emoji filter over every entry of the text column.
data['text'] = data['text'].apply(remove_emoji)


In [None]:
# Write the cleaned frame to disk; the index is not written as a column.
data.to_csv(path_or_buf='processed_data.csv', index=False)

In [None]:
# The notebook imports only `load` from spaCy, under the alias `spacy_load`;
# the bare name `spacy` is never bound, so `spacy.load(...)` raised NameError.
nlp = spacy_load('en_core_web_sm')

# Reload the cleaned data, discard rows containing any missing value, and
# renumber the index from 0 so it is contiguous again.
data = (
    pd.read_csv('processed_data.csv')
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Pass every text entry through the spaCy pipeline and keep the result per row.
data['processed_text'] = data['text'].apply(nlp)

In [None]:
# Length (via len()) of each entry in 'processed_text', stored as a new column.
text_lengths = data['processed_text'].apply(len)
data['text_length'] = text_lengths

# Summary statistics over those lengths.
average_length = text_lengths.mean()
max_length = text_lengths.max()
min_length = text_lengths.min()

# Report: average first, then minimum, then maximum.
print(f'Average text length: {average_length}')
print(f'Min text length: {min_length}')
print(f'Max text length: {max_length}')

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Encode every text entry; convert_to_tensor=False is passed so the result is not a torch tensor.
sentences = data['text'].tolist()
embeddings = model.encode(sentences, convert_to_tensor=False)

# One row per text, with the 'target' column appended on the right, then saved.
embeds = pd.DataFrame(embeddings)
embedding_target = pd.concat([embeds, data['target']], axis=1)
embedding_target.to_csv('embedding_target.csv', index=False)

Preprocessing for BERT Model:
----

In [None]:
# Re-read the raw CSV (no header row), keep 'target' and 'text', drop rows with
# missing values and renumber the index.
data = (
    pd.read_csv(
        'data.csv',
        encoding='utf-8',
        header=None,
        names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    )
    .drop(columns=['ids', 'date', 'flag', 'user'])
    .dropna()
    .reset_index(drop=True)
)

# Deterministic shuffle (fixed seed), followed by a fresh 0..n-1 index.
data = shuffle(data, random_state=42).reset_index(drop=True)

In [None]:
# Alias, not a copy: `df` and `data` now name the same DataFrame object, so
# column assignments made through `df` are visible through `data` as well.
df = data

In [None]:
# Binarise the label: 0 stays 0, every other value becomes 1.
df['target'] = df['target'].ne(0).astype('int64')

# 80/20 split with a fixed seed.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


def _frame_to_dataset(frame):
    """Build a tf.data.Dataset of (text, target) pairs from the frame's two columns."""
    return tf.data.Dataset.from_tensor_slices((frame['text'].values, frame['target'].values))


train_data = _frame_to_dataset(train_df)
test_data = _frame_to_dataset(test_df)


In [None]:
# TF-Hub handles for the text preprocessor and the encoder.
preprocessor_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
bert_model_url = "https://tfhub.dev/google/experts/bert/wiki_books/sst2/2"  # SST-2 model trained for sentiment analysis

# Wrap both handles as Keras layers (preprocessor first, then encoder).
bert_preprocessor = hub.KerasLayer(preprocessor_url)
bert_encoder = hub.KerasLayer(bert_model_url)

In [None]:
# Original shape
original_shape = df.shape
print("Original DataFrame shape:", original_shape)

# Subsample size: 1/100 of the rows of the full (pre-split) frame.
# The same row count is drawn from both the training and the test split.
new_size = original_shape[0] // 100  # integer division to get a whole number
print("New size for training data:", new_size)


def _subsample_to_dataset(frame, n_rows):
    """Draw `n_rows` rows from `frame` (fixed seed) and wrap them as a tf.data.Dataset.

    Returns a `(sampled_frame, dataset)` tuple, where the dataset yields
    (text, target) pairs taken from the sampled frame.
    """
    sampled = frame.sample(n=n_rows, random_state=42)
    dataset = tf.data.Dataset.from_tensor_slices((sampled['text'].values, sampled['target'].values))
    return sampled, dataset


# Reduced training set
smaller_train_df, smaller_train_data = _subsample_to_dataset(train_df, new_size)
print("Smaller Training DataFrame shape:", smaller_train_df.shape)

# Reduced test set (the label previously read "Training" here by mistake)
smaller_test_df, smaller_test_data = _subsample_to_dataset(test_df, new_size)
print("Smaller Test DataFrame shape:", smaller_test_df.shape)

Preprocessing for LSTM
----

In [None]:
# NOTE(review): the cleaned file is written as 'processed_data.csv' in the
# working directory earlier in this notebook, while this cell reads it from
# '../data-files/' — confirm the file is placed there before running.
data = pd.read_csv(r'../data-files/processed_data.csv')

# Remove rows with missing text or label BEFORE splitting. Previously the text
# lists were filtered after the split while the label arrays were not, which
# could leave X and y with different lengths, and missing text was turned into
# the literal string "nan".
data = data.dropna(subset=['text', 'target'])
data = shuffle(data, random_state=42).reset_index(drop=True)

# Texts as plain Python strings, labels as a parallel list.
X = data['text'].astype(str).tolist()
Y = data['target'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
# `Tokenizer` comes from tensorflow.keras.preprocessing.text (imported in the
# notebook's import cell); without that import this cell raised NameError.
# The vocabulary is fitted on the training texts only.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

# Encode both splits as lists of integer word indices using that vocabulary.
X_train, X_test = (
    word_tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [None]:
# Number of distinct words seen by the tokenizer, plus one extra slot.
# NOTE(review): presumably the extra slot is for index 0 (padding) — confirm against the Keras Tokenizer docs.
vocab_length = 1 + len(word_tokenizer.word_index)

In [None]:
# Fixed sequence length passed to pad_sequences for both splits.
maxlen = 100

# padding='post' is passed, i.e. padding is placed after the sequence content.
X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
)

In [None]:
# Map each word in the GloVe file to its vector (float32 array).
embeddings_dictionary = {}

# `with` guarantees the file is closed even if parsing a line fails.
with open('a2_glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Blank line: nothing to parse (previously an IndexError).
            continue
        # First field is the word, the remaining fields are the vector components.
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# One row per vocabulary index, 100 columns; rows start as all zeros.
embedding_matrix = zeros((vocab_length, 100))

# Copy the pretrained vector into the row of every word that has one;
# words missing from `embeddings_dictionary` keep their zero row.
for token, row in word_tokenizer.word_index.items():
    pretrained_vector = embeddings_dictionary.get(token)
    if pretrained_vector is None:
        continue
    embedding_matrix[row] = pretrained_vector

embedding_matrix.shape

In [None]:
# Write the embedding matrix to disk as comma-separated text.
# (A leftover `type(embedding_matrix)` expression was removed: it was not the
# last expression of the cell, so its value was never displayed or used.)
np.savetxt('embedding_matrix_lstm.csv', embedding_matrix, delimiter=',')