In [24]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler
import re

#### Data Exploration & pre-processing

In [25]:
train_data = pd.read_csv('train.csv')
print("Initial class distribution:")
print(train_data['target'].value_counts())

Initial class distribution:
target
0    1225312
1      80810
Name: count, dtype: int64


In [27]:
#hyperparameters
max_words = 30000
max_len = 100
embedding_dim = 300 
lstm_units = 128
dense_units = 64
dropout_rate = 0.5

In [28]:
# Preprocess text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = ' '.join(text.split())
    return text

train_data['question_text'] = train_data['question_text'].apply(clean_text)

In [29]:
# Create and fit tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['question_text'])

In [30]:
# Convert text to sequences and pad
sequences = tokenizer.texts_to_sequences(train_data['question_text'])
x_data = pad_sequences(sequences, maxlen=max_len)
y_data = train_data['target'].values

In [31]:
# Balance the dataset using RandomOverSampler
ros = RandomOverSampler(random_state=42)
x_data_resampled, y_data_resampled = ros.fit_resample(x_data, y_data)

print("Resampled class distribution:")
print(pd.Series(y_data_resampled).value_counts())

Resampled class distribution:
0    1225312
1    1225312
Name: count, dtype: int64


In [32]:
# Load GloVe embeddings
print("Loading GloVe embeddings...")
embedding_index = {}
with open('glove.6B.300d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
print(f"Found {len(embedding_index)} word vectors.")

Loading GloVe embeddings...
Found 400000 word vectors.


In [33]:
# Create embedding matrix
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

#### Model Training

In [34]:
# Split the data
x_train, x_val, y_train, y_val = train_test_split(
    x_data_resampled, y_data_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_data_resampled
)

In [36]:
# Build the model
input_layer = Input(shape=(max_len,))

embedding_layer = Embedding(
    max_words,
    embedding_dim,
    weights=[embedding_matrix],
    trainable=False
)(input_layer)

lstm_1 = Bidirectional(LSTM(lstm_units, return_sequences=True))(embedding_layer)
lstm_2 = Bidirectional(LSTM(lstm_units, return_sequences=True))(lstm_1)

pooled = GlobalAveragePooling1D()(lstm_2)

dense_1 = Dense(dense_units, activation='relu')(pooled)
dropout_1 = Dropout(dropout_rate)(dense_1)
dense_2 = Dense(dense_units // 2, activation='relu')(dropout_1)
dropout_2 = Dropout(dropout_rate)(dense_2)

output_layer = Dense(1, activation='sigmoid')(dropout_2)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()]
)

callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_auc',
        patience=3,
        restore_best_weights=True,
        mode='max'
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=2,
        min_lr=1e-6
    )
]


In [37]:
# Train the model
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=15,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/15
[1m61266/61266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6084s[0m 99ms/step - accuracy: 0.9046 - auc_1: 0.9564 - loss: 0.2589 - val_accuracy: 0.9530 - val_auc_1: 0.9829 - val_loss: 0.1393 - learning_rate: 0.0010
Epoch 2/15
[1m    1/61266[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:45:13[0m 162ms/step - accuracy: 0.9688 - auc_1: 0.9961 - loss: 0.1028

  current = self.get_monitor_value(logs)


[1m61266/61266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6702s[0m 109ms/step - accuracy: 0.9595 - auc_1: 0.9839 - loss: 0.1268 - val_accuracy: 0.9647 - val_auc_1: 0.9863 - val_loss: 0.1115 - learning_rate: 0.0010
Epoch 3/15
[1m61266/61266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6871s[0m 112ms/step - accuracy: 0.9699 - auc_1: 0.9881 - loss: 0.0993 - val_accuracy: 0.9692 - val_auc_1: 0.9877 - val_loss: 0.1015 - learning_rate: 0.0010
Epoch 4/15
[1m61266/61266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6783s[0m 111ms/step - accuracy: 0.9738 - auc_1: 0.9900 - loss: 0.0878 - val_accuracy: 0.9712 - val_auc_1: 0.9883 - val_loss: 0.0953 - learning_rate: 0.0010
Epoch 5/15
[1m61266/61266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6868s[0m 112ms/step - accuracy: 0.9759 - auc_1: 0.9908 - loss: 0.0824 - val_accuracy: 0.9711 - val_auc_1: 0.9884 - val_loss: 0.0956 - learning_rate: 0.0010
Epoch 6/15
[1m61266/61266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7059s[0m 115

#### Evaluation

In [38]:
# Evaluate the model
y_prob = model.predict(x_val)
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5
best_f1 = 0

[1m15317/15317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1248s[0m 81ms/step


In [39]:
# Find the best threshold
for threshold in thresholds:
    y_pred = (y_prob > threshold).astype(int)
    current_f1 = f1_score(y_val, y_pred)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = threshold

print(f"Best threshold: {best_threshold}, Best F1 score: {best_f1}")

Best threshold: 0.9, Best F1 score: 0.9889477784550302


In [40]:
#classification report
y_pred_optimal = (y_prob > best_threshold).astype(int)
print("\nClassification Report:")
print(classification_report(y_val, y_pred_optimal))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    245063
           1       0.98      1.00      0.99    245062

    accuracy                           0.99    490125
   macro avg       0.99      0.99      0.99    490125
weighted avg       0.99      0.99      0.99    490125

