In [3]:
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install tensorflow



In [15]:
from google.colab import drive

# Location of the dialogue CSV once Google Drive is mounted
file_path = '/content/drive/My Drive/colab_datasets/dialogueText_301.csv'

# Attach Google Drive to the Colab filesystem so the path above resolves
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Imports

In [12]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.mixed_precision import set_global_policy
from sklearn.model_selection import train_test_split

## 2. Function Definitions

In [20]:
# Set the global Keras mixed precision policy. This runs before any model is
# built by the functions below, so layers created afterwards pick it up.
# NOTE(review): under 'mixed_float16' layers are expected to compute in float16
# while keeping float32 variables — confirm against the Keras version in use.
set_global_policy('mixed_float16')

def load_data(file_path):
    """Read the CSV at ``file_path`` and keep only the dialogue columns.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        DataFrame holding the 'from', 'to' and 'text' columns, in that order.
    """
    wanted_columns = ['from', 'to', 'text']
    frame = pd.read_csv(file_path)
    return frame[wanted_columns]

def preprocess_text(data, max_len=50):
    """Turn the 'text' column into fixed-length integer sequences.

    Rows whose 'text' is missing are discarded, a fresh ``Tokenizer`` is fitted
    on the remaining texts, and each text is encoded and passed through
    ``pad_sequences`` with ``maxlen=max_len`` and ``padding='post'``.

    Args:
        data: DataFrame with a 'text' column.
        max_len: Length given to ``pad_sequences`` as ``maxlen``.

    Returns:
        Tuple ``(padded_sequences, tokenizer)``: the padded id array and the
        fitted tokenizer.
    """
    # Remove missing texts, then coerce what is left to str a single time
    texts = data.dropna(subset=['text'])['text'].astype(str)

    # Vocabulary is learned from exactly the texts that get encoded
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)

    encoded = tokenizer.texts_to_sequences(texts)
    return pad_sequences(encoded, maxlen=max_len, padding='post'), tokenizer

def build_model(vocab_size, max_len, mask_zero=True):
    """Build a two-layer LSTM that predicts a token id at every timestep.

    Args:
        vocab_size: Number of token ids, including the padding id 0; used as
            both the embedding input dimension and the softmax width.
        max_len: Number of timesteps per input sequence (forwarded to the
            Embedding layer as ``input_length``).
        mask_zero: When True (default), id 0 is treated as padding and masked
            by the Embedding layer, so padded timesteps do not contribute to
            the loss or accuracy. Pass False to count padding positions too.

    Returns:
        An uncompiled ``Sequential`` model whose output has one softmax
        distribution over ``vocab_size`` ids per input timestep.
    """
    model = Sequential()
    # Sequences are post-padded with 0; masking keeps the model from being
    # rewarded for predicting padding.
    model.add(Embedding(input_dim=vocab_size, output_dim=128,
                        input_length=max_len, mask_zero=mask_zero))
    model.add(LSTM(64, return_sequences=True))  # keep the time axis for the next LSTM
    model.add(Dropout(0.2))
    model.add(LSTM(64, return_sequences=True))  # keep the time axis for per-step prediction
    # The output layer is pinned to float32 so the softmax is not computed in
    # float16 under the global mixed precision policy.
    model.add(Dense(vocab_size, activation='softmax', dtype='float32'))
    return model

## 3. Model Selection

In [16]:
# Load and preprocess data
data = load_data(file_path)
padded_sequences, tokenizer = preprocess_text(data)

## 4. Training Model (Train/Validate/Test)

In [None]:
# Prepare input/output data for next-token prediction: the target is the input
# shifted left by one position, so X and y both have one timestep fewer than
# padded_sequences.
X, y = padded_sequences[:, :-1], padded_sequences[:, 1:]

# Hold out 20% of the data as the final test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Carve a validation set out of the remaining training data (12.5% of 80% =
# 10% of the total) so checkpointing, early stopping and LR scheduling never
# look at the test set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42)

# Convert data to tf.data.Dataset for better performance. Batching happens
# here, so no batch_size is passed to model.fit below.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(128).prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(128).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(128).prefetch(tf.data.AUTOTUNE)

# Build model
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding id 0
max_len = X.shape[1]  # timesteps actually fed to the model (padded length minus one)

model = build_model(vocab_size, max_len)

# Compile model; targets are integer ids, hence the sparse loss
optimizer = Adam(learning_rate=1e-3)  # Initial learning rate
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Callbacks for checkpointing, early stopping, and learning rate adjustment,
# all driven by the validation loss
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-6)  # Reduces LR on plateau

# Train model with callbacks. The datasets are already batched, and Keras
# rejects a batch_size argument for tf.data.Dataset inputs.
history = model.fit(train_dataset, validation_data=val_dataset, epochs=10, callbacks=[checkpoint, early_stopping, reduce_lr])

# The held-out test set is evaluated once, in the Evaluation section below.

## 5. Evaluation

In [None]:
# Evaluate model
results = model.evaluate(test_dataset)
print(f"Test Loss: {results[0]}")
print(f"Test Accuracy: {results[1]}")

## 6. Hyperparameter Tuning

In [None]:
# Hyperparameter tuning example (adjusting LSTM units and dropout rates)
def hyperparameter_tuning():
    best_accuracy = 0
    for units in [64, 128]:  # Exploring different LSTM units
        for dropout in [0.2, 0.3]:  # Exploring different dropout rates
            model = Sequential([
                Embedding(input_dim=vocab_size, output_dim=128),
                LSTM(units, return_sequences=True),
                Dropout(dropout),
                LSTM(units),
                Dense(vocab_size, activation='softmax')
            ])
            model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
            history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)

            # Extract and compare validation accuracy
            accuracy = history.history['val_accuracy'][-1]
            if accuracy > best_accuracy:
                best_accuracy = accuracy
    return best_accuracy

## 7. Evaluate and Compare Results

In [None]:
# Compare results after tuning hyperparameters
best_accuracy = hyperparameter_tuning()
print(f"Best Validation Accuracy after tuning: {best_accuracy}")