In [183]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [203]:
# Column names and the GloVe path referenced by later cells.
TEXT_COLUMN = 'clean_tweet'
LABEL_COLUMN = 'label'
GLOVE_FILE = 'glove.6B.100d.txt'

# Read the four CSV files into separate frames.
dev_train_df = pd.read_csv('final_train.csv')
full_train_df = pd.read_csv('pool_train.csv')
val_df = pd.read_csv('cleaned_val.csv')
test_df = pd.read_csv('final_test.csv')

# Drop, in place, every row that is missing its text or its label.
for df in (dev_train_df, full_train_df, val_df, test_df):
    df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN], inplace=True)



In [171]:
# Report the row count of each frame, one per line.
print(
    f"Development training set size : {len(dev_train_df)}",
    f"Full training set size : {len(full_train_df)}",
    f"Validation set size : {len(val_df)}",
    f"Test set size: {len(test_df)}",
    sep="\n",
)

Development training set size : 150000
Full training set size : 1353611
Validation set size : 42085
Test set size: 238873


In [184]:
# Fit a single encoder on the labels of the development-train and validation frames combined.
label_series = [frame[LABEL_COLUMN] for frame in (dev_train_df, val_df)]
all_labels = pd.concat(label_series, ignore_index=True)
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

In [185]:
# Encode the label column of the development-train and validation frames with the fitted encoder.
y_dev_train, y_val = (
    label_encoder.transform(frame[LABEL_COLUMN])
    for frame in (dev_train_df, val_df)
)


In [204]:
# Text-preprocessing and embedding hyperparameters.
VOCAB_SIZE = 20_000     # Tokenizer num_words and row count of the embedding matrix
MAX_LEN = 100           # length sequences are padded / truncated to
EMBEDDING_DIM = 100     # column count of the embedding matrix
OOV_TOKEN = "<OOV>"     # oov_token handed to the Tokenizer

In [187]:
# Build the vocabulary from the text column of the full training frame.
training_texts = full_train_df[TEXT_COLUMN]
tokenizer = Tokenizer(oov_token=OOV_TOKEN, num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(training_texts)
word_index = tokenizer.word_index

In [205]:
def tokenize_and_pad(df, text_column, tokenizer, max_len):
    """Turn a text column into a padded array of token-id sequences.

    Args:
        df: Frame holding the texts.
        text_column: Name of the column in ``df`` to tokenize.
        tokenizer: Object exposing ``texts_to_sequences``.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        The result of ``pad_sequences`` with padding and truncation both
        applied at the end of each sequence (``'post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(df[text_column]),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )

In [189]:
# Tokenize and pad the development-train and validation texts.
X_dev_train_padded, X_val_padded = (
    tokenize_and_pad(frame, TEXT_COLUMN, tokenizer, MAX_LEN)
    for frame in (dev_train_df, val_df)
)

In [206]:
# Parse the GloVe text file: the first whitespace-separated field of each line
# is the token, the remaining fields are its float32 vector components.
embeddings_index = {}
with open(GLOVE_FILE, encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print(f"Found {len(embeddings_index)} word vectors in GloVe file.")

Found 400000 word vectors in GloVe file.


In [207]:
print("Creating embedding matrix...")
# Rows start as zeros; a row is overwritten only when its token has a GloVe vector.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for token, row in word_index.items():
    if row >= VOCAB_SIZE:
        continue  # index falls outside the matrix
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue  # no GloVe vector for this token; leave the zero row
    embedding_matrix[row] = pretrained

print(f"Embedding matrix created with shape: {embedding_matrix.shape}")

Creating embedding matrix...
Embedding matrix created with shape: (20000, 100)


In [208]:
def building_model(vocab_size,embedding_dim,input_length, embedding_matrix):
    """Build and compile the bidirectional-GRU binary classifier.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    Bidirectional GRU (64 units) -> Dropout(0.5) -> Dense(32, relu, L2 0.001)
    -> Dropout(0.5) -> Dense(1, sigmoid). Compiled with binary cross-entropy,
    Adam (learning rate 0.005) and the accuracy metric.

    Args:
        vocab_size: Number of rows of the embedding table (``input_dim``).
        embedding_dim: Width of each embedding vector (``output_dim``).
        input_length: Length of the token-id sequences fed to the model.
        embedding_matrix: Array of shape ``(vocab_size, embedding_dim)`` used
            as the fixed embedding weights.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    # Fail early with a readable message instead of a weight-shape error from Keras.
    expected_shape = (vocab_size, embedding_dim)
    actual_shape = tuple(np.shape(embedding_matrix))
    if actual_shape != expected_shape:
        raise ValueError(
            f"embedding_matrix has shape {actual_shape}, expected {expected_shape}"
        )

    model = Sequential([
        # Declare the input shape explicitly rather than through the deprecated
        # `input_length` argument of Embedding, so the model is built right away.
        tf.keras.Input(shape=(input_length,), dtype="int32"),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False),
        Bidirectional(GRU(units=64)),
        Dropout(0.5),
        Dense(32,activation='relu',kernel_regularizer=regularizers.l2(0.001)),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy',optimizer=Adam(learning_rate=0.005),metrics=['accuracy'])
    return model

In [193]:
# Build the development model and print its layer summary.
dev_model = building_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    input_length=MAX_LEN,
    embedding_matrix=embedding_matrix,
)
dev_model.summary()



In [194]:
# Save the model to disk only when the validation loss reaches a new minimum.
dev_checkpoint = ModelCheckpoint(
    filepath='dev_best_model.keras',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [195]:
# Stop training once the validation loss has not improved for 2 epochs,
# restoring the weights of the best epoch.
dev_early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [196]:
# Train the development model, validating on the validation split each epoch.
history = dev_model.fit(
    x=X_dev_train_padded,
    y=y_dev_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_val_padded, y_val),
    callbacks=[dev_checkpoint, dev_early_stopping],
)

Epoch 1/10
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.6823 - loss: 0.6014
Epoch 1: val_loss improved from inf to 0.57075, saving model to dev_best_model.keras
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 84ms/step - accuracy: 0.6823 - loss: 0.6014 - val_accuracy: 0.7049 - val_loss: 0.5707
Epoch 2/10
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.7542 - loss: 0.5203
Epoch 2: val_loss improved from 0.57075 to 0.54789, saving model to dev_best_model.keras
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 88ms/step - accuracy: 0.7542 - loss: 0.5203 - val_accuracy: 0.7447 - val_loss: 0.5479
Epoch 3/10
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.7652 - loss: 0.5063
Epoch 3: val_loss did not improve from 0.54789
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 88ms/step - accuracy: 0.7652 

In [218]:
# Refit the encoder on the full training labels and persist it to disk.
label_encoder = LabelEncoder().fit(full_train_df[LABEL_COLUMN])
with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Encode the labels of the full-train, validation and test frames.
y_full_train, y_val, y_test = (
    label_encoder.transform(frame[LABEL_COLUMN])
    for frame in (full_train_df, val_df, test_df)
)

In [219]:
# The tokenizer is already fitted on full_train_df[TEXT_COLUMN] where it is
# created. Fitting it again on the same texts re-scans the whole corpus and
# accumulates the word/document counts a second time, so the pickled object
# would carry doubled statistics. Fit only if it has no vocabulary yet.
if not tokenizer.word_index:
    tokenizer.fit_on_texts(full_train_df[TEXT_COLUMN])

# Persist the fitted tokenizer to disk.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
word_index = tokenizer.word_index

In [211]:
# Tokenize and pad the full-train, validation and test texts.
X_full_train_padded, X_val_padded, X_test_padded = (
    tokenize_and_pad(frame, TEXT_COLUMN, tokenizer, MAX_LEN)
    for frame in (full_train_df, val_df, test_df)
)

In [212]:
# Build a fresh model for the full-data run and print its layer summary.
final_model = building_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    input_length=MAX_LEN,
    embedding_matrix=embedding_matrix,
)
final_model.summary()



In [213]:
# Save the model to disk only when the validation loss reaches a new minimum.
final_checkpoint = ModelCheckpoint(
    filepath='final_best_model.keras',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [214]:
# Stop training once the validation loss has not improved for 2 epochs,
# restoring the weights of the best epoch.
final_early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [215]:
# Train the final model on the full training data, validating each epoch.
history_final = final_model.fit(
    x=X_full_train_padded,
    y=y_full_train,
    batch_size=256,
    epochs=10,
    validation_data=(X_val_padded, y_val),
    callbacks=[final_checkpoint, final_early_stopping],
)

Epoch 1/10
[1m5288/5288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - accuracy: 0.7386 - loss: 0.5376
Epoch 1: val_loss improved from inf to 0.55669, saving model to final_best_model.keras
[1m5288/5288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m618s[0m 117ms/step - accuracy: 0.7386 - loss: 0.5375 - val_accuracy: 0.7437 - val_loss: 0.5567
Epoch 2/10
[1m5288/5288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.7683 - loss: 0.4997
Epoch 2: val_loss improved from 0.55669 to 0.54628, saving model to final_best_model.keras
[1m5288/5288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m627s[0m 119ms/step - accuracy: 0.7683 - loss: 0.4997 - val_accuracy: 0.7428 - val_loss: 0.5463
Epoch 3/10
[1m5288/5288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.7732 - loss: 0.4935
Epoch 3: val_loss did not improve from 0.54628
[1m5288/5288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 117ms/step - accura

In [216]:
# Reload the checkpoint written by final_checkpoint.
# `load_model` is imported in the notebook's top import cell with the other
# Keras imports, so a fresh Restart & Run All does not depend on a
# mid-notebook import.
best_model = load_model('final_best_model.keras')

In [217]:
# Evaluate the reloaded model on the test split; returns the loss and the accuracy metric.
test_loss, test_accuracy = best_model.evaluate(x=X_test_padded, y=y_test)

[1m7465/7465[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 9ms/step - accuracy: 0.7755 - loss: 0.4838
