In [None]:
pip install numpy
pip install pandas
pip install scikit-learn
pip install tensorflow

In [4]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Access dataset
file_path = '/content/drive/My Drive/colab_datasets/dialogueText_301.csv'

Mounted at /content/drive


## 1. Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf

## 2. Function Definitions

In [2]:
def load_data(file_path):
    data = pd.read_csv(file_path)
    data = data[['from', 'to', 'text']]  # Select relevant columns
    return data

def preprocess_text(data, max_len=50):
    # Drop rows where 'text' is NaN or missing
    data = data.dropna(subset=['text'])

    # Initialize tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data['text'].astype(str))  # Convert 'text' to string

    sequences = tokenizer.texts_to_sequences(data['text'].astype(str))
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

    return padded_sequences, tokenizer

def build_model(vocab_size, max_len):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128))  # Remove input_length
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dense(vocab_size, activation='softmax'))
    return model

## 3. Model Selection

In [3]:
# Load and preprocess data
data = load_data('dialogueText_301.csv')  # Adjust file name if needed
padded_sequences, tokenizer = preprocess_text(data)

# Get vocabulary size and max sequence length
vocab_size = len(tokenizer.word_index) + 1
max_len = 50

FileNotFoundError: [Errno 2] No such file or directory: 'dialogueText_301.csv'

## 4. Training Model (Train/Validate/Test)

In [None]:
# Assuming `padded_sequences` is already defined and shaped (num_samples, sequence_length)
X, y = padded_sequences[:, :-1], padded_sequences[:, 1:]

# Print shapes to debug
print(f"Shape of X: {X.shape}")  # Should be (num_samples, sequence_length - 1)
print(f"Shape of y: {y.shape}")  # Should be (num_samples, sequence_length - 1)

# Now split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# y_train and y_test should match the first dimension of X_train and X_test
y_train = y_train[:, -1]  # Take the last column for the target
y_test = y_test[:, -1]    # Take the last column for the target

# Print shapes after splitting to debug
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

# Convert data to tf.data.Dataset for better performance
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(64).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(64).prefetch(tf.data.AUTOTUNE)

# Build model
model = build_model(vocab_size, max_len)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Add callbacks for model checkpointing and early stopping
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train model with callbacks and dataset
history = model.fit(train_dataset, validation_data=test_dataset, epochs=10, callbacks=[checkpoint, early_stopping])


Shape of X: (16586545, 49)
Shape of y: (16586545, 49)
Shape of X_train: (13269236, 49)
Shape of y_train: (13269236,)
Shape of X_test: (3317309, 49)
Shape of y_test: (3317309,)
Epoch 1/10




[1m101250/207332[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m20:27:15[0m 694ms/step - accuracy: 0.9925 - loss: 0.1156

KeyboardInterrupt: 

## 5. Evaluation

In [None]:
# Evaluate model
results = model.evaluate(test_dataset)
print(f"Test Loss: {results[0]}")
print(f"Test Accuracy: {results[1]}")

## 6. Hyperparameter Tuning

In [None]:
# Hyperparameter tuning example (adjusting LSTM units and dropout rates)
def hyperparameter_tuning():
    best_accuracy = 0
    for units in [64, 128]:  # Exploring different LSTM units
        for dropout in [0.2, 0.3]:  # Exploring different dropout rates
            model = Sequential([
                Embedding(input_dim=vocab_size, output_dim=128),
                LSTM(units, return_sequences=True),
                Dropout(dropout),
                LSTM(units),
                Dense(vocab_size, activation='softmax')
            ])
            model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
            history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)

            # Extract and compare validation accuracy
            accuracy = history.history['val_accuracy'][-1]
            if accuracy > best_accuracy:
                best_accuracy = accuracy
    return best_accuracy

## 7. Evaluate and Compare Results

In [None]:
# Compare results after tuning hyperparameters
best_accuracy = hyperparameter_tuning()
print(f"Best Validation Accuracy after tuning: {best_accuracy}")