In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Dense, Dropout, Conv1D, MaxPooling1D, GlobalAveragePooling1D, BatchNormalization, Activation
from keras import regularizers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

import numpy as np

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the datasets
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')
test_data = pd.read_csv('test.csv')

lengthTestData = len(test_data)

In [3]:
# Combine text data for preprocessing
text = pd.concat([train_data['text'], val_data['text']], ignore_index=True)
text_test = test_data['text']

In [4]:
# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [5]:
# Function to preprocess text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization and lowercase
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]  # Remove stopwords and lemmatize
    return ' '.join(filtered_tokens)

In [6]:
text_preprocessed = text.apply(preprocess_text)

In [7]:
print(text_preprocessed)

0        beirut ( reuters ) - iran military chief met s...
1        hanoi ( reuters ) - top u.s. envoy began two-d...
2        ( reuters ) - four u.s. senator asked senate j...
3        first read morning briefing meet press nbc pol...
4        cairo ( reuters ) - six month egypt election ,...
                               ...                        
54714    lack oversight prof donald trump totally unfit...
54715    tucker carlson responded espn anchor calling p...
54716    getting something nothing rage president profe...
54717    black emanuelle fixed 1976. attila speaking eu...
54718    chaos broke legal american illegal alien clash...
Name: text, Length: 54719, dtype: object


In [8]:
print(text)

0        beirut (reuters) - iran s military chief met w...
1        hanoi (reuters) - a top u.s. envoy began a two...
2        (reuters) - four u.s. senators have asked the ...
3        first read is a morning briefing from meet the...
4        cairo (reuters) - six months before egypt s el...
                               ...                        
54714    this lack of oversight proves that donald trum...
54715    tucker carlson responded to an espn anchor cal...
54716    because getting something for nothing is all t...
54717    black emanuelle fixed all that in 1976. attila...
54718    chaos broke out after legal americans and ille...
Name: text, Length: 54719, dtype: object


In [9]:
# Initialize Count vectorizer without specifying max_features
count_vectorizer = CountVectorizer()

In [10]:
# Fit and transform the preprocessed text data
count_matrix = count_vectorizer.fit_transform(text_preprocessed)

In [11]:
# Get the number of unique tokens
num_unique_tokens = len(count_vectorizer.get_feature_names_out())
print(num_unique_tokens)

169079


In [12]:
# Re-initialize Count vectorizer with the determined max_features
count_vectorizer = CountVectorizer(max_features=num_unique_tokens)

In [13]:
#Fit and transform the text data again with the updated max_features
count_matrix = count_vectorizer.fit_transform(text_preprocessed)

# Convert the TF-IDF matrix to a CSR (Compressed Sparse Row) matrix for efficient row-wise operations
csr_count_matrix = csr_matrix(count_matrix)

# Find the row index with the maximum number of filled values
max_features_row_index = csr_count_matrix.getnnz(axis=1).argmax()

# Get the number of features in the document with the most filled values
max_features = csr_count_matrix[max_features_row_index].count_nonzero()

svd = TruncatedSVD(n_components=int(max_features*0.3))
count_matrix = svd.fit_transform(count_matrix)

In [15]:
#Define the CNN + Bi-LSTM model
model = Sequential()
model.add(Conv1D(filters=256, kernel_size=2, activation='relu', input_shape=(count_matrix.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.3))
model.add(Dense(16))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

#Define the Bi-LSTM model
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(count_matrix.shape[1],1)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

In [16]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
dense_count_matrix = count_matrix[:len(train_data)]
dense_val_count_matrix = count_matrix[len(train_data):len(train_data) + len(val_data)]

# Assuming train_data['label'] and val_data['label'] are Pandas Series, convert them to arrays
train_labels = train_data['label'].values
val_labels = val_data['label'].values

early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1)

# Fit the model using NumPy arrays with callbacks
model.fit(dense_count_matrix, train_labels, 
          epochs=100, batch_size=32, 
          validation_data=(dense_val_count_matrix, val_labels),
          callbacks=[early_stopping, reduce_lr])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 19: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 19: early stopping


<keras.callbacks.History at 0x16660750ac0>

In [18]:
text_test_preprocessed = text_test.apply(preprocess_text)
test_count_matrix = count_vectorizer.transform(text_test_preprocessed)
dense_test_count_matrix = svd.transform(test_count_matrix)
test_labels = test_data['label'].values

# Evaluate the model using the test data
test_loss, test_accuracy = model.evaluate(dense_test_count_matrix, test_labels)
print(f'Test accuracy: {test_accuracy}')

Test accuracy: 0.8743627667427063


In [None]:
#model.save('88Acc_64L_32L_16D_100E_32B.h5')

In [None]:
test_accuracy * lengthTestData