# In [None]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Load data
# Both CSVs are read with header=None, so the first row is treated as data and
# the two columns are named 'label' and 'text' here.
# Forward slashes keep the relative paths valid on Windows as well as on
# Linux/macOS; backslash separators would only resolve on Windows.
train_data = pd.read_csv('../raw_data/raw_data/fulltrain.csv', header=None, names=['label', 'text'])
test_data = pd.read_csv('../raw_data/raw_data/balancedtest.csv', header=None, names=['label', 'text'])

# Pull the text and label columns out of each frame as arrays
X_train = train_data['text'].values
y_train = train_data['label'].values
X_test = test_data['text'].values
y_test = test_data['label'].values

# Encode labels
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer id mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# One-hot encode with an explicit width. Without num_classes, to_categorical
# infers the width from the largest id present in its input, so a split that
# happens to lack the highest class would get fewer columns than the other
# split and than the softmax layer sized from label_encoder.classes_.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)

# Tokenize text
# The vocabulary is fitted on the training texts only; the same tokenizer then
# converts both splits to integer sequences (train first, then test).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Fixed sequence length used for every padded input
max_length = 1000  # Adjust based on your dataset distribution

# Bring every sequence to exactly max_length entries. truncating='post' is
# passed explicitly; the padding side is left at the library default.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_length, truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

# Define the CNN-LSTM model
# Size the embedding table from the tokenizer rather than repeating its
# num_words literal, so the two cannot drift apart and trigger out-of-range
# embedding lookups. If the tokenizer was built without a cap (num_words is
# None), fall back to the full vocabulary plus one slot for the padding id 0.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

model = Sequential([
    # Token ids -> 100-dimensional vectors
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length),
    # Convolutional layer: 64 filters over windows of 5 timesteps
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    # Max pooling over windows of 4 timesteps
    MaxPooling1D(pool_size=4),
    # LSTM layer with 64 units
    LSTM(64),
    # Output layer: one softmax unit per class known to the label encoder
    Dense(units=len(label_encoder.classes_), activation='softmax'),
])

# Compile the model
# categorical_crossentropy matches the one-hot targets built with to_categorical
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data here. That is harmless
# while it is only used for per-epoch reporting, but any tuning based on these
# numbers (epochs, architecture, early stopping) would leak test information;
# consider carving a validation split out of the training data instead.
model.fit(X_train_pad, y_train_categorical, epochs=5, validation_data=(X_test_pad, y_test_categorical), batch_size=32)
model.summary()