In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load data
# Both CSV files are read with header=None, so their first row is treated as
# data and the two columns are named explicitly: label first, text second.
# The relative paths use forward slashes: these resolve on Windows as well as
# on POSIX systems, whereas a backslash-separated path is interpreted as one
# single file name outside Windows and the read fails there.
csv_columns = ['label', 'text']
train_data = pd.read_csv('../raw_data/raw_data/fulltrain.csv', header=None, names=csv_columns)
test_data = pd.read_csv('../raw_data/raw_data/balancedtest.csv', header=None, names=csv_columns)

# Split each frame into its raw texts (model input) and labels (target)
X_train_texts = train_data['text'].values
y_train = train_data['label'].values
X_test_texts = test_data['text'].values
y_test = test_data['label'].values

# Encode labels
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer id mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# One-hot encode the integer ids. The width is passed explicitly: when left
# out, to_categorical infers it from the largest id present in its input, so a
# split that happens to lack the highest class would come out narrower than the
# model's output layer.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)

# Length (in tokens) that every sequence is brought to before entering the model
max_length = 1000

# Build the vocabulary from the training texts only, limited via num_words=5000
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_texts)

# Turn every text into a list of integer word indices (train first, then test)
X_train_seq, X_test_seq = map(
    tokenizer.texts_to_sequences,
    (X_train_texts, X_test_texts),
)

# Pad or cut every sequence to max_length so both splits share one input width;
# sequences that are too long are cut at their end (truncating='post')
pad_options = dict(maxlen=max_length, truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Placeholder for LIWC-like features: 10 random columns per document.
# The global NumPy seed is fixed first for reproducibility; the train matrix is
# drawn before the test matrix, and that order determines the values produced.
np.random.seed(42)
additional_features_train = np.random.rand(len(X_train_texts), 10)
additional_features_test = np.random.rand(len(X_test_texts), 10)

# Recursive feature elimination around a seeded random forest: remove one
# feature per step until 8 of the 10 columns remain
rfe_estimator = RandomForestClassifier(n_estimators=100, random_state=42)
selector = RFE(rfe_estimator, n_features_to_select=8, step=1)

# Fit the selector on the training split and reduce it in one call, then apply
# the same fitted selection to the test split
additional_features_train_selected = selector.fit_transform(
    additional_features_train, y_train_encoded
)
additional_features_test_selected = selector.transform(additional_features_test)

# Min-max scale the kept columns; the scaler learns its ranges from the
# training split only and is reused unchanged for the test split
scaler = MinMaxScaler().fit(additional_features_train_selected)
additional_features_train_scaled = scaler.transform(additional_features_train_selected)
additional_features_test_scaled = scaler.transform(additional_features_test_selected)

# Define model architecture with the Functional API to handle two inputs:
# the padded token-id sequences and the scaled additional features.
# Layer sizes are read from the objects that produced the data instead of being
# repeated as literals, so changing the tokenizer's vocabulary limit or the
# number of selected features cannot silently desynchronise the model.
vocab_size = tokenizer.num_words  # token ids emitted by the tokenizer stay below this limit
num_additional_features = additional_features_train_scaled.shape[1]

text_input = Input(shape=(max_length,), dtype='int32', name='text_input')
additional_input = Input(shape=(num_additional_features,), name='additional_input')

# Text branch: 100-dimensional embeddings followed by an LSTM with 64 units.
# The sequence length is already fixed by text_input, so the deprecated
# `input_length` argument of Embedding is not passed.
embedded_text = Embedding(input_dim=vocab_size, output_dim=100)(text_input)
lstm_out = LSTM(64)(embedded_text)

# Combine branches: the LSTM output is joined with the additional features
combined = concatenate([lstm_out, additional_input])

# Output layer: one softmax unit per class known to the label encoder
predictions = Dense(units=len(label_encoder.classes_), activation='softmax')(combined)

# Create model
model = Model(inputs=[text_input, additional_input], outputs=predictions)

# categorical_crossentropy matches the one-hot encoded targets used for training
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the two-input model: padded token ids plus the scaled additional features.
# The test split is supplied as validation_data, so Keras evaluates it alongside
# training; it is not used to update the weights.
model.fit(
    x=[X_train_pad, additional_features_train_scaled],
    y=y_train_categorical,
    batch_size=32,
    epochs=2,
    validation_data=(
        [X_test_pad, additional_features_test_scaled],
        y_test_categorical,
    ),
)

# Print the layer-by-layer overview of the architecture
model.summary()