In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Read the utterance/label table from disk.
df = pd.read_csv('fil_data.csv')

# Peek at the first rows exactly as loaded (before any cleaning).
print(df.head())

# Keep only the rows that have both an utterance and a label; rebinding the
# name instead of mutating in place yields the same frame.
df = df.dropna(subset=['utterance', 'label'])

             utterance  label
0            \talright      0
1  \tyou have fun okay      0
2               \tokay      1
3            \talright      0
4   \tdid you see this      0


In [4]:
def calculate_avg_sentence_length(text, terminators='.!?'):
    """Return the average number of words per sentence in ``text``.

    The text is cut at every character listed in ``terminators``; each
    resulting piece that contains at least one word counts as a sentence, and
    words are whitespace-separated tokens.

    Args:
        text: The utterance to measure. Anything that is not a ``str``
            (for example a NaN or None cell) is treated as having no sentences.
        terminators: Characters that end a sentence. Defaults to ``'.!?'`` so
            that questions and exclamations are counted as sentences too;
            pass ``'.'`` to split on periods only.

    Returns:
        float: Mean words per sentence, or ``0.0`` when there is no sentence.
    """
    # Missing / non-text cells would otherwise crash on ``.split``.
    if not isinstance(text, str):
        return 0.0

    # Successively split on each terminator character.
    pieces = [text]
    for mark in terminators:
        pieces = [part for piece in pieces for part in piece.split(mark)]

    # Words per piece; pieces with no words (e.g. between "..") are dropped.
    word_counts = [len(piece.split()) for piece in pieces]
    word_counts = [count for count in word_counts if count > 0]

    if not word_counts:
        return 0.0
    # True division keeps the return type a plain float in every branch.
    return sum(word_counts) / len(word_counts)

In [5]:
# Derive the linguistic feature (average sentence length) for every utterance.
df['avg_sentence_length'] = df['utterance'].apply(calculate_avg_sentence_length)

# Min-max scale the feature column.
# NOTE(review): the scaler is fitted on every row, including rows that later
# land in the test split, so test statistics leak into the scaling.
feature_frame = df[['avg_sentence_length']]
scaler = MinMaxScaler().fit(feature_frame)
df['avg_sentence_length'] = scaler.transform(feature_frame)

In [6]:
# --- Hyperparameters for the text pipeline ---

# Vocabulary: cap the number of words kept (smaller vocab -> faster training);
# anything outside the vocabulary is replaced by the OOV token.
vocab_size = 2000
oov_tok = "<OOV>"

# Width of each word-embedding vector.
embedding_dim = 100

# Every sequence is truncated or padded to `max_length` tokens, and both
# operations act on the end of the sequence.
max_length = 50
trunc_type = padding_type = 'post'

In [7]:
# Pull the model inputs and targets out of the frame as arrays:
#   texts               - the utterance column
#   linguistic_features - the scaled average sentence length
#   labels              - the label column
texts, linguistic_features, labels = (
    df[column].values
    for column in ('utterance', 'avg_sentence_length', 'label')
)

In [8]:
# Build the word index from the utterances, capped at `vocab_size` words and
# with `oov_tok` registered as the out-of-vocabulary token.
# NOTE(review): the tokenizer is fitted on all utterances, before the
# train/test split.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(texts)

# Encode each utterance as a list of integer ids ...
sequences = tokenizer.texts_to_sequences(texts)

# ... then truncate / pad every list to exactly `max_length` ids.
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [9]:
# Append the linguistic feature as one extra trailing column next to the
# padded token ids, so a single split keeps both inputs row-aligned.
X_combined = np.column_stack((padded_sequences, linguistic_features))

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
split_parts = train_test_split(
    X_combined,
    labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Utterance input branch: padded token ids -> learned word embeddings.
input_1 = Input(shape=(max_length,), name="utterance_input")
# The sequence length is already fixed by the Input layer above, so the
# `input_length` argument (deprecated in Keras 3, where it only triggers a
# warning) is not passed.
# NOTE(review): sequences are zero-padded but no mask is applied, so the LSTMs
# also read the padding positions - consider `mask_zero=True` and compare
# validation metrics before adopting it.
embedding = Embedding(vocab_size, embedding_dim)(input_1)

# Stacked bidirectional LSTM encoder. The first two layers return the full
# sequence so the next recurrent layer has a time axis to consume; the last
# one returns only its final output, i.e. one vector per utterance.
bilstm_out = Bidirectional(LSTM(128, return_sequences=True))(embedding)
bilstm_out = Dropout(0.2)(bilstm_out)
bilstm_out = Bidirectional(LSTM(64, return_sequences=True))(bilstm_out)  # Second Bidirectional LSTM layer
bilstm_out = Dropout(0.2)(bilstm_out)
bilstm_out = Bidirectional(LSTM(32))(bilstm_out)  # Final Bidirectional LSTM layer



In [11]:
# Linguistic feature input branch: one scalar per utterance, projected through
# a small dense layer.
input_2 = Input(name="linguistic_feature_input", shape=(1,))
dense_feat = Dense(units=16, activation='relu')(input_2)

# Fuse the utterance encoding with the feature projection.
combined = concatenate([bilstm_out, dense_feat])

# Classification head: dense layer with dropout, then a single sigmoid unit.
dense_out = Dense(units=64, activation='relu')(combined)
dense_out = Dropout(rate=0.2)(dense_out)
final_output = Dense(units=1, activation='sigmoid')(dense_out)

In [12]:
def _split_inputs(features):
    """Split a combined matrix into the model's two inputs.

    Args:
        features: 2-D array whose last column is the linguistic feature and
            whose remaining columns are the padded token ids.

    Returns:
        list: ``[token_ids, feature_column]``. The feature is sliced with
        ``-1:`` so it stays 2-D, matching ``Input(shape=(1,))`` explicitly.
    """
    return [features[:, :-1], features[:, -1:]]


# Assemble the two-input network and configure it for binary classification.
model = Model(inputs=[input_1, input_2], outputs=final_output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

num_epochs = 5
batch_size = 40

# Slice each split once and reuse it for fitting, validation and prediction.
train_inputs = _split_inputs(X_train)
test_inputs = _split_inputs(X_test)

# NOTE(review): the test split doubles as validation data here, so the
# accuracy reported below is not measured on data unseen during monitoring.
history = model.fit(
    train_inputs, y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(test_inputs, y_test),
)

# Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) predictions to
# (n,) so they have the same shape as y_test.
y_pred = (model.predict(test_inputs) > 0.5).astype("int32").ravel()
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Epoch 1/5
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 187ms/step - accuracy: 0.7699 - loss: 0.4805 - val_accuracy: 0.8105 - val_loss: 0.4244
Epoch 2/5
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 206ms/step - accuracy: 0.8199 - loss: 0.4040 - val_accuracy: 0.8136 - val_loss: 0.4115
Epoch 3/5
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 166ms/step - accuracy: 0.8217 - loss: 0.3871 - val_accuracy: 0.8162 - val_loss: 0.4154
Epoch 4/5
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 148ms/step - accuracy: 0.8320 - loss: 0.3696 - val_accuracy: 0.8165 - val_loss: 0.4123
Epoch 5/5
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 141ms/step - accuracy: 0.8390 - loss: 0.3554 - val_accuracy: 0.8225 - val_loss: 0.4103
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 46ms/step
Accuracy: 0.8225499231950845
              precision    recall  f1-score   support

    