In [None]:
# Mount Google Drive at /content/drive; every dataset and weights path used
# in the cells below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install Keras-Preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Keras-Preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Keras-Preprocessing
Successfully installed Keras-Preprocessing-1.1.2


In [None]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Linear SVM

In [None]:
# Importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Load the dataset CSV from the mounted Drive; the following cells read its
# 'statement' and 'label' columns.
data = pd.read_csv('/content/drive/MyDrive/New/Clean/Combined_clean.csv')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Statements as a unicode array ('U' also stringifies any non-string cell).
statement_texts = data['statement'].values.astype('U')

# TF-IDF over uni-, bi- and tri-grams, capped at 5000 features.
tfidf_v = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

# Feature matrix and target vector consumed by the classifier cell below.
X = tfidf_v.fit_transform(statement_texts)
y = data['label'].values

In [None]:
import joblib
from sklearn.svm import LinearSVC

# Linear SVM baseline on the TF-IDF features (X) and labels (y) built in the
# previous cell.
svm = LinearSVC()

# Five-fold cross-validation; each entry is the estimator's default score
# (accuracy) on one held-out fold.
# NOTE(review): X was vectorised on the full corpus before the folds were cut,
# so every fold shares vocabulary/IDF statistics with its held-out part.
# Wrapping TfidfVectorizer + LinearSVC in a Pipeline and cross-validating the
# raw text would remove that leak.
scores = cross_val_score(svm, X, y, cv=5)
print("Cross-validation scores:", scores)

# Out-of-fold predictions pooled over the five folds, used for the summary
# metrics below.
y_pred = cross_val_predict(svm, X, y, cv=5)
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred)
precision = precision_score(y, y_pred)
f1 = f1_score(y, y_pred)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 score:", f1)

# cross_val_score / cross_val_predict only fit *clones* of `svm`, so the
# object itself is still unfitted at this point.  Fit it on the full dataset
# before persisting; otherwise the pickle holds a model that cannot predict.
svm.fit(X, y)

# NOTE(review): the fitted vectoriser (`tfidf_v`) is not persisted here, yet
# it is required to turn new text into features for this model.
joblib.dump(svm, '/content/drive/MyDrive/Colab Notebooks/weights/Combined/linearsvc_combined.pkl')
 

Cross-validation scores: [0.94795068 0.94815062 0.86711096 0.73600373 0.62709944]
Accuracy: 0.8252669181451022
Recall: 0.8284745966048828
Precision: 0.7889355382286579
F1 score: 0.8082217833369907


['/content/drive/MyDrive/Colab Notebooks/weights/Combined/linearsvc_combined.pkl']

# LSTM

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the data from CSV file
data = pd.read_csv('/content/drive/MyDrive/New/Clean/Combined_clean.csv')

# Split data into statements and labels.  Cast to str so a non-string cell
# (e.g. an empty statement read back as NaN) cannot crash the tokenizer.
statements = data['statement'].astype(str)
labels = data['label']

# Sizes shared by the tokenizer, the padding step and the Embedding layer.
vocab_size = 5000  # tokenizer emits ids 1..vocab_size-1; 0 is the padding id
max_len = 100      # tokens kept per statement

# Tokenize statements.  num_words must agree with the Embedding's input_dim:
# without it texts_to_sequences emits an id for every distinct word in the
# corpus, and any id >= input_dim is out of range for the embedding table.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(statements)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(statements)

# Pad (or truncate) every sequence to exactly max_len ids.
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Split data into training and testing sets; the seed makes the split, and
# therefore the reported metrics, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42)

# Build the model: embedding -> single LSTM -> sigmoid output for the binary label.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* numbers are computed on the same rows as the final metrics.
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), verbose=2)

# Evaluate the model: threshold the sigmoid output at 0.5 and flatten the
# (n, 1) prediction column to match the 1-D label vector.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Save the model
model.save('/content/drive/MyDrive/Colab Notebooks/weights/Combined/lstm_combined.h5')

# To reload it later, pass the same path to tf.keras.models.load_model.


Epoch 1/10
1876/1876 - 88s - loss: 0.3487 - accuracy: 0.8509 - val_loss: 0.3039 - val_accuracy: 0.8751 - 88s/epoch - 47ms/step
Epoch 2/10
1876/1876 - 26s - loss: 0.2689 - accuracy: 0.8895 - val_loss: 0.2972 - val_accuracy: 0.8754 - 26s/epoch - 14ms/step
Epoch 3/10
1876/1876 - 15s - loss: 0.2361 - accuracy: 0.9039 - val_loss: 0.3049 - val_accuracy: 0.8805 - 15s/epoch - 8ms/step
Epoch 4/10
1876/1876 - 15s - loss: 0.2088 - accuracy: 0.9149 - val_loss: 0.3154 - val_accuracy: 0.8800 - 15s/epoch - 8ms/step
Epoch 5/10
1876/1876 - 14s - loss: 0.1856 - accuracy: 0.9244 - val_loss: 0.3299 - val_accuracy: 0.8684 - 14s/epoch - 7ms/step
Epoch 6/10
1876/1876 - 15s - loss: 0.1639 - accuracy: 0.9337 - val_loss: 0.3710 - val_accuracy: 0.8740 - 15s/epoch - 8ms/step
Epoch 7/10
1876/1876 - 13s - loss: 0.1463 - accuracy: 0.9407 - val_loss: 0.4059 - val_accuracy: 0.8586 - 13s/epoch - 7ms/step
Epoch 8/10
1876/1876 - 13s - loss: 0.1276 - accuracy: 0.9486 - val_loss: 0.3996 - val_accuracy: 0.8634 - 13s/epoch -

# Bidirectional LSTM


In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the data from CSV file
data = pd.read_csv('/content/drive/MyDrive/New/Clean/Combined_clean.csv')

# Split data into statements and labels.  Cast to str so a non-string cell
# (e.g. an empty statement read back as NaN) cannot crash the tokenizer.
statements = data['statement'].astype(str)
labels = data['label']

# Tokenize statements.  No num_words cap: the Embedding below is sized to the
# full vocabulary (len(word_index) + 1, the +1 covering the padding id 0).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(statements)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(statements)

# Pad (or truncate) every sequence to exactly max_len ids.
max_len = 100
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Split data into training and testing sets; the seed makes the split, and
# therefore the reported metrics, repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42)

# Define model architecture: embedding -> two stacked bidirectional LSTMs
# (the first returns the full sequence so the second can consume it) -> sigmoid.
model = Sequential()
model.add(Embedding(len(word_index) + 1, 128, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line after the table).
model.summary()

# Train model.
# NOTE(review): the test split doubles as validation data here.
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

# Evaluate the model: threshold the sigmoid output at 0.5 and flatten the
# (n, 1) prediction column to match the 1-D label vector.
y_pred = (model.predict(x_test) > 0.5).astype(int).ravel()

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Save the model
model.save('/content/drive/MyDrive/Colab Notebooks/weights/Combined/bilstm_combined.h5')

Found 34902 unique tokens.
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 128)          4467584   
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 4,607,681
Trainable params: 4,607,681
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
E

# Hybrid CNN + BiLSTM


In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv(r'/content/drive/MyDrive/New/Clean/Combined_clean.csv')

# Hyper-parameters for this cell
MAX_NB_WORDS = 50000        # Vocabulary cap shared by the tokenizer and the Embedding layer
MAX_SEQUENCE_LENGTH = 300   # Number of tokens kept per news statement
EMBEDDING_DIM = 100         # Dimension of the word embedding
VALIDATION_SPLIT = 0.2      # Fraction of the data held out for evaluation
BATCH_SIZE = 128
EPOCHS = 10

# Cast to str so a non-string cell (e.g. NaN) cannot crash the tokenizer.
statements = df['statement'].astype(str)

# Tokenize the news statements
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(statements)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert the news statements to sequences of integers
sequences = tokenizer.texts_to_sequences(statements)

# Pad the sequences to a fixed number of *tokens*.  Measuring len() of the raw
# statement strings counts characters, which made every row as wide as the
# longest statement's character count (29569 columns in the saved output).
# pad_sequences truncates from the front by default, so for statements longer
# than Max_Len the last Max_Len tokens are the ones kept.
Max_Len = MAX_SEQUENCE_LENGTH
padded = pad_sequences(sequences, maxlen=Max_Len)

print('Shape of data tensor:', padded.shape)

# Labels are passed through unchanged (no one-hot encoding is applied); the
# model ends in a single sigmoid unit trained with binary cross-entropy.
y = df['label']
print('Shape of label tensor:', y.shape)

# Split data into training and testing sets; the seed makes the split, and
# therefore the reported metrics, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded, y.values, test_size=VALIDATION_SPLIT, random_state=42)

# Build the model: embedding -> two Conv1D/MaxPooling stages -> BiLSTM -> sigmoid.
model = Sequential()
model.add(Embedding(input_dim=MAX_NB_WORDS, output_dim=EMBEDDING_DIM, input_length=Max_Len))
model.add(Conv1D(128, 5, activation='relu'))
model.add(Dropout(0.3))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train the model.
# NOTE(review): the test split doubles as validation data here.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=EPOCHS, batch_size=BATCH_SIZE)

# Evaluate the model: threshold the sigmoid output at 0.5 and flatten the
# (n, 1) prediction column to match the 1-D label vector.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Save the model
model.save('/content/drive/MyDrive/Colab Notebooks/weights/Combined/hybrid_combined.h5')

# To reload it later, pass the same path to keras.models.load_model.


Found 34902 unique tokens.
Shape of data tensor: (75023, 29569)
Shape of label tensor: (75023,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 29569, 100)        5000000   
                                                                 
 conv1d (Conv1D)             (None, 29565, 128)        64128     
                                                                 
 dropout (Dropout)           (None, 29565, 128)        0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 14782, 128)       0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 14778, 128)        82048     
                                                                 
 max_pooling1d_1 (MaxPooli

# Hybrid CNN + BiLSTM with GloVe embeddings


In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv(r'/content/drive/MyDrive/New/Clean/Combined_clean.csv')

# Hyper-parameters for this cell
MAX_NUM_WORDS = 20000       # Vocabulary cap shared by the tokenizer and the embedding matrix
MAX_SEQUENCE_LENGTH = 300   # Number of tokens kept per news statement
VALIDATION_SPLIT = 0.2      # Fraction of the data held out for evaluation
embedding_dim = 300         # Width of the glove.6B.300d vectors

# Cast to str so a non-string cell (e.g. NaN) cannot crash the tokenizer.
statements = df['statement'].astype(str)

# Tokenize the news statements.  The tokenizer cap must equal the number of
# rows in the embedding matrix: with a larger cap texts_to_sequences emits ids
# that have no row in the Embedding layer.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(statements)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert the news statements to sequences of integers
sequences = tokenizer.texts_to_sequences(statements)

# Pad the sequences to a fixed number of *tokens*.  Measuring len() of the raw
# statement strings counts characters, which made every row as wide as the
# longest statement's character count (29569 columns in the saved output).
Max_Len = MAX_SEQUENCE_LENGTH
padded = pad_sequences(sequences, maxlen=Max_Len)

print('Shape of data tensor:', padded.shape)

# Labels are passed through unchanged (no one-hot encoding is applied).
y = df['label']
print('Shape of label tensor:', y.shape)

# Split data into training and testing sets; the seed makes the split, and
# therefore the reported metrics, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded, y.values, test_size=VALIDATION_SPLIT, random_state=42)

# Rows in the embedding matrix: ids 0..num_words-1 (0 is the padding id).
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
vocabulary = {word for word, i in word_index.items() if i < num_words}

# Load the GloVe embeddings, keeping only words the model can actually see so
# the whole pre-trained table is never held in memory.  Each line is
# "<token> <v1> ... <v300>"; splitting from the right keeps the token intact.
embeddings_index = {}
with open('/content/drive/MyDrive/Colab Notebooks/weights/glove.6B.300d.txt', encoding='utf8') as f:
    for line in f:
        word, *vector = line.rstrip().rsplit(' ', embedding_dim)
        if word in vocabulary:
            embeddings_index[word] = np.asarray(vector, dtype='float32')

# Create an embedding matrix; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, coefs in embeddings_index.items():
    embedding_matrix[word_index[word]] = coefs

# Build the model: frozen GloVe embedding -> two Conv1D/MaxPooling stages -> BiLSTM -> sigmoid.
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=Max_Len, weights=[embedding_matrix], trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(Dropout(0.3))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Bidirectional(LSTM(32)))

model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train the model.
# NOTE(review): the test split doubles as validation data here.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

# Evaluate the model: threshold the sigmoid output at 0.5 and flatten the
# (n, 1) prediction column to match the 1-D label vector.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Save the model
model.save('/content/drive/MyDrive/Colab Notebooks/weights/Combined/glove6b300_combined.h5')

# To reload it later, pass the same path to keras.models.load_model.


Found 34902 unique tokens.
Shape of data tensor: (75023, 29569)
Shape of label tensor: (75023,)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 29569, 300)        6000000   
                                                                 
 conv1d_2 (Conv1D)           (None, 29565, 128)        192128    
                                                                 
 dropout_1 (Dropout)         (None, 29565, 128)        0         
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 14782, 128)       0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 14778, 128)        82048     
                                                                 
 max_pooling1d_3 (MaxPoo

In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv(r'/content/drive/MyDrive/New/Clean/Combined_clean.csv')

# Hyper-parameters for this cell
MAX_NUM_WORDS = 20000       # Vocabulary cap shared by the tokenizer and the embedding matrix
MAX_SEQUENCE_LENGTH = 300   # Number of tokens kept per news statement
VALIDATION_SPLIT = 0.2      # Fraction of the data held out for evaluation
embedding_dim = 300         # Width of the glove.42B.300d vectors

# Cast to str so a non-string cell (e.g. NaN) cannot crash the tokenizer.
statements = df['statement'].astype(str)

# Tokenize the news statements.  The tokenizer cap must equal the number of
# rows in the embedding matrix: with a larger cap texts_to_sequences emits ids
# that have no row in the Embedding layer.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(statements)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert the news statements to sequences of integers
sequences = tokenizer.texts_to_sequences(statements)

# Pad the sequences to a fixed number of *tokens*.  Measuring len() of the raw
# statement strings counts characters, which made every row as wide as the
# longest statement's character count (29569 columns in the saved output).
Max_Len = MAX_SEQUENCE_LENGTH
padded = pad_sequences(sequences, maxlen=Max_Len)

print('Shape of data tensor:', padded.shape)

# Labels are passed through unchanged (no one-hot encoding is applied).
y = df['label']
print('Shape of label tensor:', y.shape)

# Split data into training and testing sets; the seed makes the split, and
# therefore the reported metrics, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded, y.values, test_size=VALIDATION_SPLIT, random_state=42)

# Rows in the embedding matrix: ids 0..num_words-1 (0 is the padding id).
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
vocabulary = {word for word, i in word_index.items() if i < num_words}

# Load the GloVe embeddings, keeping only words the model can actually see so
# the whole pre-trained table is never held in memory.  Each line is
# "<token> <v1> ... <v300>"; splitting from the right keeps the token intact.
embeddings_index = {}
with open('/content/drive/MyDrive/Colab Notebooks/weights/glove.42B.300d.txt', encoding='utf8') as f:
    for line in f:
        word, *vector = line.rstrip().rsplit(' ', embedding_dim)
        if word in vocabulary:
            embeddings_index[word] = np.asarray(vector, dtype='float32')

# Create an embedding matrix; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, coefs in embeddings_index.items():
    embedding_matrix[word_index[word]] = coefs

# Build the model: frozen GloVe embedding -> two Conv1D/MaxPooling stages -> BiLSTM -> sigmoid.
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=Max_Len, weights=[embedding_matrix], trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(Dropout(0.3))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Bidirectional(LSTM(32)))

model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train the model.
# NOTE(review): the test split doubles as validation data here.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

# Evaluate the model: threshold the sigmoid output at 0.5 and flatten the
# (n, 1) prediction column to match the 1-D label vector.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Save the model
model.save('/content/drive/MyDrive/Colab Notebooks/weights/Combined/glove42b_combined.h5')

# To reload it later, pass the same path to keras.models.load_model.


Found 34902 unique tokens.
Shape of data tensor: (75023, 29569)
Shape of label tensor: (75023,)
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 29569, 300)        6000000   
                                                                 
 conv1d_6 (Conv1D)           (None, 29565, 128)        192128    
                                                                 
 dropout_3 (Dropout)         (None, 29565, 128)        0         
                                                                 
 max_pooling1d_6 (MaxPooling  (None, 14782, 128)       0         
 1D)                                                             
                                                                 
 conv1d_7 (Conv1D)           (None, 14778, 128)        82048     
                                                                 
 max_pooling1d_7 (MaxPoo

# Word2Vec

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


df = pd.read_csv(r'/content/drive/MyDrive/New/Clean/Combined_clean.csv')

# Hyper-parameters for this cell
MAX_NUM_WORDS = 20000       # Vocabulary cap shared by the tokenizer and the embedding matrix
MAX_SEQUENCE_LENGTH = 300   # Number of tokens kept per news statement
VALIDATION_SPLIT = 0.2      # Fraction of the data held out for evaluation
embedding_dim = 300         # Width of the word2vec-google-news-300 vectors

# Cast to str so a non-string cell (e.g. NaN) cannot crash the tokenizer.
statements = df['statement'].astype(str)

# Tokenize the news statements.  The tokenizer cap must equal the number of
# rows in the embedding matrix: with a larger cap texts_to_sequences emits ids
# that have no row in the Embedding layer.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(statements)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert the news statements to sequences of integers
sequences = tokenizer.texts_to_sequences(statements)

# Pad the sequences to a fixed number of *tokens*.  Measuring len() of the raw
# statement strings counts characters, which made every row as wide as the
# longest statement's character count (29569 columns in the saved output).
Max_Len = MAX_SEQUENCE_LENGTH
padded = pad_sequences(sequences, maxlen=Max_Len)

print('Shape of data tensor:', padded.shape)

# Labels are passed through unchanged (no one-hot encoding is applied).
y = df['label']
print('Shape of label tensor:', y.shape)

# Split data into training and testing sets; the seed makes the split, and
# therefore the reported metrics, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded, y.values, test_size=VALIDATION_SPLIT, random_state=42)

# Load the Word2Vec embeddings
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

# Load the pre-trained Word2Vec model (fetched through gensim's downloader).
w2v_model = api.load('word2vec-google-news-300')

# Create an embedding matrix with one row per id 0..num_words-1 (0 is the
# padding id); words missing from the pre-trained model keep an all-zero row.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i < num_words and word in w2v_model:
        embedding_matrix[i] = w2v_model[word]

# Build the model: frozen Word2Vec embedding -> two Conv1D/MaxPooling stages -> BiLSTM -> sigmoid.
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=Max_Len, weights=[embedding_matrix], trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(Dropout(0.3))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Bidirectional(LSTM(32)))

model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train the model.
# NOTE(review): the test split doubles as validation data here.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

# Evaluate the model: threshold the sigmoid output at 0.5 and flatten the
# (n, 1) prediction column to match the 1-D label vector.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Save the model
model.save('/content/drive/MyDrive/Colab Notebooks/weights/Combined/hybrid_word2vec_combined.h5')

# To reload it later, pass the same path to keras.models.load_model.

Found 34902 unique tokens.
Shape of data tensor: (75023, 29569)
Shape of label tensor: (75023,)
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 29569, 300)        6000000   
                                                                 
 conv1d_8 (Conv1D)           (None, 29565, 128)        192128    
                                                                 
 dropout_4 (Dropout)         (None, 29565, 128)        0         
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 14782, 128)       0         
 1D)                                                             
                                                                 
 conv1d_9 (Conv1D)           (None, 14778, 128)        82048     
                                                                 
 max_pooling1d_9 (MaxPoo

# FastText

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the data from CSV file
df = pd.read_csv('/content/drive/MyDrive/New/Clean/Combined_clean.csv')

# Hyper-parameters for this cell
MAX_NUM_WORDS = 20000       # Vocabulary cap shared by the tokenizer and the embedding matrix
MAX_SEQUENCE_LENGTH = 300   # Number of tokens kept per news statement
VALIDATION_SPLIT = 0.2      # Fraction of the data held out for evaluation
embedding_dim = 300         # Width of the fasttext-wiki-news-subwords-300 vectors

# Split data into statements and labels
statements = df['statement'].astype(str)
labels = df['label']

# Tokenize statements.  The tokenizer cap must equal the number of rows in the
# embedding matrix: an uncapped tokenizer emits an id for every distinct word,
# and ids >= MAX_NUM_WORDS have no row in the Embedding layer.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(statements)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(statements)

# Pad sequences to a fixed number of *tokens*.  Measuring len() of the raw
# statement strings counts characters, which made every row as wide as the
# longest statement's character count (29569 columns in the saved output).
Max_Len = MAX_SEQUENCE_LENGTH
padded_sequences = pad_sequences(sequences, maxlen=Max_Len)

print('Shape of data tensor:', padded_sequences.shape)

# Labels are passed through unchanged (no one-hot encoding is applied).
y = df['label']
print('Shape of label tensor:', y.shape)

# Split data into training and testing sets; the seed makes the split, and
# therefore the reported metrics, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, y.values, test_size=VALIDATION_SPLIT, random_state=42)

# Load the FastText embeddings
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

# Load the pre-trained FastText vectors (fetched through gensim's downloader).
ft_model = api.load('fasttext-wiki-news-subwords-300')

# Create an embedding matrix with one row per id 0..num_words-1 (0 is the
# padding id); words missing from the pre-trained model keep an all-zero row.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i < num_words and word in ft_model:
        embedding_matrix[i] = ft_model[word]

# Build the model: frozen FastText embedding -> two Conv1D/MaxPooling stages -> BiLSTM -> sigmoid.
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=Max_Len, weights=[embedding_matrix], trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(Dropout(0.3))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train the model.
# NOTE(review): the test split doubles as validation data here.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

# Evaluate the model: threshold the sigmoid output at 0.5 and flatten the
# (n, 1) prediction column to match the 1-D label vector.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Save the model
model.save('/content/drive/MyDrive/Colab Notebooks/weights/Combined/ft_combined.h5')

# To reload it later, pass the same path to keras.models.load_model.


Shape of data tensor: (75023, 29569)
Shape of label tensor: (75023,)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 29569, 300)        6000000   
                                                                 
 conv1d_4 (Conv1D)           (None, 29565, 128)        192128    
                                                                 
 dropout_2 (Dropout)         (None, 29565, 128)        0         
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 14782, 128)       0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 14778, 128)        82048     
                                                                 
 max_pooling1d_5 (MaxPooling  (None, 7389, 128)    