# **LSTM**

# **Importing necessary modules**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import text_to_word_sequence
from transformers import BertTokenizer
#from keras.applications.preprocessing import text_to_sequence  # for pre-trained embeddings
from transformers import BertTokenizer, TFBertModel  # Example using pre-trained BERT model

In [2]:
import numpy as np

In [3]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=95bf3ec57deaa1603bed6aef9d43e1ee1a3b57438e4a13f387f9f736455e4fde
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [4]:
from langdetect import detect

# **Loading Data**

In [5]:

data = pd.read_json("/content/filtered_train.json")

**Data Sample Exploration**

In [6]:
data.sample(5)

Unnamed: 0,pairs
188,"{'id': 1540, 'image': 'https://sammlung.staede..."
8,"{'id': 28, 'image': 'https://sammlung.staedelm..."
74,"{'id': 585, 'image': 'https://sammlung.staedel..."
38,"{'id': 347, 'image': 'https://sammlung.staedel..."
111,"{'id': 1013, 'image': 'https://sammlung.staede..."


In [7]:
texts = [entry['text'] for entry in data['pairs']]

In [8]:
texts[0:5]

["Dotty was wakened next morning by a variety of sounds . The mocking - bird , the canary , the hens , and Horace ' s guinea pig were astir , and wished their little world to be aware of it .",
 'These glacial streams contain no animal life , at all events no fish , till they have received the waters of warmer tributaries .',
 "Finally , rising listlessly from the couch on which he lay , his countenance irradiated with a very peculiar smile , he sauntered slowly to the door of the room . I sneered he , while making his characteristic exit , Blanche or Loo must stitch me an embroidered bib and tucker ere I pay my court to the little De Bys ! And yet , faith , jesting aside , who knows what might be effected ? Still , it 's ignoble game , a girl fresh from a foreign nursery , ' smelling of bread and butter , ' as Byron says .",
 'The people people generally appear to be very poor . Their mode of life mean ; their food coarse and indifferent , indifferent , except fish , which is excellen

In [9]:
labels = [entry['subtask1_label'] for entry in data['pairs']]

In [10]:
labels[0:5]

['NO', 'NO', 'YES', 'NO', 'NO']

In [11]:
ids = [entry['id'] for entry in data['pairs']]

**Data Pre-Processing**

In [12]:
# Define constants
MAX_LEN = 100  # Maximum sequence length
EMBEDDING_DIM = 128  # Embedding dimension
VOCAB_SIZE = 100000  # Vocabulary size

In [13]:
import unicodedata
import re

In [14]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# Define stopwords and tokenizer
stop_words = set(stopwords.words('english'))
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [16]:
# Text preprocessing function
def preprocess_text(text):
    # Tokenize text using BERT tokenizer
    tokens = tokenizer.tokenize(text)
    # Remove stopwords and convert to lowercase
    filtered_tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [17]:
# Preprocess text data
processed_texts = [preprocess_text(text) for text in texts]

In [18]:
# Define constants
MAX_LEN = 100  # Maximum sequence length

In [19]:
EMBEDDING_DIM = 128  # Embedding dimension

In [20]:
VOCAB_SIZE = 100000  # Vocabulary size

In [21]:
# Tokenize processed text data using BERT tokenizer
encoded_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=MAX_LEN, truncation=True) for text in processed_texts]

In [22]:
# Pad sequences
padded_sequences = pad_sequences(encoded_texts, maxlen=MAX_LEN, padding='post')

In [23]:
 #Split data into training and testing sets
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(padded_sequences, labels, ids, test_size=0.2, random_state=42)

In [24]:
# Define label mapping dictionary
label_mapping = {'YES': 1, 'NO': 0}  # Adjust as per your label mapping

In [25]:
# Convert output labels to numeric format
training_labels_numeric = [label_mapping[label] for label in y_train]
testing_labels_numeric = [label_mapping[label] for label in y_test]

In [26]:
# Convert numeric labels to NumPy arrays
training_labels_numeric_np = np.array(training_labels_numeric)
testing_labels_numeric_np = np.array(testing_labels_numeric)

**Model Definition**

In [27]:
# Define LSTM-based model
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN))
model.add(LSTM(units=64, return_sequences=True))  # LSTM layer with 64 units
model.add(Dropout(0.2))
model.add(LSTM(units=32))  # LSTM layer with 32 units
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

**Model** **Compilation**

In [28]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

**Model Training**

In [29]:
# Train the model
model.fit(X_train, training_labels_numeric_np, epochs=50, batch_size=32, validation_data=(X_test, testing_labels_numeric_np))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


**Model Evaluation**:

In [30]:
# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, testing_labels_numeric_np)

# Print the accuracy
print("Test Accuracy:", accuracy)

Test Accuracy: 0.6226415038108826


# RNN

# **Importing necessary modules**

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, Dropout
from transformers import BertTokenizer
import numpy as np
from langdetect import detect

**Data importing**

In [32]:
data = pd.read_json("/content/filtered_train.json")


In [33]:
texts = [entry['text'] for entry in data['pairs']]
labels = [entry['subtask1_label'] for entry in data['pairs']]
ids = [entry['id'] for entry in data['pairs']]

In [34]:
MAX_LEN = 100
VOCAB_SIZE = 100000

**Pre Processing**

In [35]:
# Tokenization and padding
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
processed_texts = [tokenizer.tokenize(text.lower()) for text in texts]
encoded_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=MAX_LEN, truncation=True) for text in processed_texts]
padded_sequences = pad_sequences(encoded_texts, maxlen=MAX_LEN, padding='post')

In [36]:
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(padded_sequences, labels, ids, test_size=0.2, random_state=42)


In [37]:
# Label mapping
label_mapping = {'YES': 1, 'NO': 0}
training_labels_numeric = [label_mapping[label] for label in y_train]
testing_labels_numeric = [label_mapping[label] for label in y_test]

In [38]:
# Convert numeric labels to NumPy arrays
training_labels_numeric_np = np.array(training_labels_numeric)
testing_labels_numeric_np = np.array(testing_labels_numeric)

**Model Definition**

In [39]:
# Define RNN-based model
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 128, input_length=MAX_LEN))
model.add(SimpleRNN(units=64, return_sequences=True))
model.add(Dropout(0.2))
model.add(SimpleRNN(units=32))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

**Model Compilation**

In [40]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

**Model Training**

In [41]:
model.fit(X_train, training_labels_numeric_np, epochs=50, batch_size=32, validation_data=(X_test, testing_labels_numeric_np))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7dee4ea86cb0>

**evaluation**

In [42]:
# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, testing_labels_numeric_np)

# Print the accuracy
print("Test Accuracy:", accuracy)

Test Accuracy: 0.7169811129570007
