In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
from datetime import datetime
from sklearn.metrics import confusion_matrix  # Import the confusion_matrix function


# Function to load and split data into train, validation, and test sets

def load_and_split_data(data_path):
    """Load the pickled dataset, set aside a 1% sample, and split the rest.

    A random 1% of the rows (seeded with 42) is removed from the dataset and
    the ``text`` column of those rows is written to ``test_text.txt`` in the
    current working directory. The remaining rows are split into roughly
    80% train / 10% validation / 10% test, also seeded with 42.

    Args:
        data_path: Path to a pickle file readable by ``pandas.read_pickle``.
            The loaded object is used as a DataFrame with a ``text`` column.

    Returns:
        dict: ``{'train': ..., 'val': ..., 'test': ...}`` holding the splits.
    """
    dataset = pd.read_pickle(data_path)

    # Define the percentage to set aside for sampling (1% in this case)
    sample_percentage = 0.01
    total_samples = int(len(dataset) * sample_percentage)

    # Pick the held-out rows by *position*. Dropping by index label (as in
    # ``dataset.drop(sample.index)``) removes every row that shares a label,
    # so a non-unique index would silently discard more than the sample.
    # Sampling a positional Series with the same length and seed selects the
    # same rows that ``dataset.sample(n=..., random_state=42)`` would.
    sampled_positions = (
        pd.Series(np.arange(len(dataset)))
        .sample(n=total_samples, random_state=42)
        .to_numpy()
    )
    sampled_data = dataset.iloc[sampled_positions]

    # Save the "text" column from the sampled data to a file
    sampled_data['text'].to_csv('test_text.txt', index=False, header=False)

    # Remove exactly the sampled rows, keeping the original row order
    keep_mask = np.ones(len(dataset), dtype=bool)
    keep_mask[sampled_positions] = False
    dataset = dataset.iloc[keep_mask]

    # Split the remaining data into train, validation, and test sets
    train_ratio = 0.8
    val_ratio = 0.1
    test_ratio = 0.1

    train_val_df, test = train_test_split(dataset, test_size=test_ratio, random_state=42)
    # The validation share is rescaled because it is taken from the
    # train+validation remainder rather than from the full dataset.
    train, val = train_test_split(
        train_val_df,
        test_size=val_ratio / (train_ratio + val_ratio),
        random_state=42,
    )

    return {
        'train': train,
        'val': val,
        'test': test,
    }


# Function to get tweets and labels from data
def get_tweets(data):
    """Return the ``text`` and ``emotions`` columns of *data* as two lists.

    Args:
        data: Mapping-like table (e.g. a DataFrame) whose ``'text'`` and
            ``'emotions'`` entries expose a ``tolist()`` method.

    Returns:
        tuple: ``(tweets, labels)``, both plain Python lists.
    """
    text_column, emotion_column = data['text'], data['emotions']
    return text_column.tolist(), emotion_column.tolist()

# Function to tokenize tweets and save the tokenizer
def tokenize_tweets(tweets):
    """Fit a Keras ``Tokenizer`` on *tweets* and persist it to disk.

    The tokenizer is created with ``num_words=10000`` and ``'<UNK>'`` as the
    out-of-vocabulary token, then pickled to ``tokenizer.pkl`` in the current
    working directory so the same vocabulary can be reloaded later.

    Args:
        tweets: Texts to fit the tokenizer on.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted_tokenizer = Tokenizer(oov_token='<UNK>', num_words=10000)
    fitted_tokenizer.fit_on_texts(tweets)

    with open('tokenizer.pkl', 'wb') as pickle_file:
        pickle.dump(fitted_tokenizer, pickle_file)

    return fitted_tokenizer

# Function to preprocess sequences
def preprocess_sequences(tokenizer, tweets):
    """Convert *tweets* to integer sequences of fixed length 50.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        tweets: Texts to encode.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=50``, padding and
        truncating at the end ('post') of each sequence.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(tweets),
        maxlen=50,
        padding='post',
        truncating='post',
    )

# Function to map emotion labels to numeric IDs
def map_labels_to_ids(labels):
    """Map emotion names to their integer class IDs.

    IDs follow the fixed order
    ``['anger', 'fear', 'sadness', 'surprise', 'joy', 'love']`` (0 to 5).

    Args:
        labels: Iterable of emotion names.

    Returns:
        numpy.ndarray: One class ID per input label, in input order.

    Raises:
        ValueError: If any label is not one of the six known emotions.
            Previously such labels became ``None`` and produced an object
            array that only failed later, inside training or evaluation.
    """
    emotion_sequence = ['anger', 'fear', 'sadness', 'surprise', 'joy', 'love']
    classes_to_index = {c: i for i, c in enumerate(emotion_sequence)}

    ids = []
    unknown = set()
    # Single pass so that one-shot iterables (e.g. generators) still work.
    for label in labels:
        class_id = classes_to_index.get(label)
        if class_id is None:
            unknown.add(label)
        else:
            ids.append(class_id)

    if unknown:
        # repr() keeps the message sortable even for mixed types such as NaN.
        raise ValueError(
            f"Unknown emotion label(s): {sorted(map(repr, unknown))}; "
            f"expected one of {emotion_sequence}"
        )

    return np.array(ids)

# Function to create the model
def create_model(input_length, num_classes):
    """Build and compile the bidirectional-LSTM emotion classifier.

    The network is an embedding (vocabulary 10000, 16 dimensions) followed by
    two bidirectional LSTM layers of 20 units each and a softmax output.

    Args:
        input_length: Length of each input sequence, passed to the embedding.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``tf.keras`` Sequential model (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    layers = tf.keras.layers

    layer_stack = [
        layers.Embedding(10000, 16, input_length=input_length),
        # The first recurrent layer returns the full sequence so the second
        # one receives one vector per time step.
        layers.Bidirectional(layers.LSTM(20, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(20)),
        layers.Dense(num_classes, activation='softmax'),
    ]
    network = tf.keras.models.Sequential(layer_stack)

    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Function to train the model
def train_model(model, train_data, val_data, epochs=20):
    """Fit *model* with early stopping and best-model checkpointing.

    Args:
        model: Compiled Keras model to train.
        train_data: Dict with ``'padded_sequences'`` and ``'labels'``.
        val_data: Dict with the same two keys, used as validation data.
        epochs: Maximum number of epochs to train for.

    Returns:
        The history object returned by ``model.fit``.
    """
    # Both callbacks watch validation accuracy: one stops training after
    # 5 epochs of patience, the other writes the best model to best_model.h5.
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)
    keep_best = tf.keras.callbacks.ModelCheckpoint(
        'best_model.h5',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=2,
    )

    validation_pair = (val_data['padded_sequences'], val_data['labels'])
    return model.fit(
        train_data['padded_sequences'],
        train_data['labels'],
        validation_data=validation_pair,
        epochs=epochs,
        callbacks=[stop_early, keep_best],
    )

def evaluate_model(model, test_data, class_names=None):
    """Print test loss/accuracy and plot the confusion matrix as a heatmap.

    Args:
        model: Trained Keras model exposing ``evaluate`` and ``predict``.
        test_data: Dict with ``'padded_sequences'`` (model inputs) and
            ``'labels'`` (integer class IDs).
        class_names: Optional tick labels, ordered by class ID. Defaults to
            the six emotions in the order used by ``map_labels_to_ids``.
    """
    # Function-scope import: seaborn is not among the module's top-level
    # imports, so the original reference to ``sns`` raised NameError.
    import seaborn as sns

    if class_names is None:
        # Must match the ID order in map_labels_to_ids; the original code
        # referenced that function's local list, which is not visible here.
        class_names = ['anger', 'fear', 'sadness', 'surprise', 'joy', 'love']

    # Evaluate the model
    loss, accuracy = model.evaluate(test_data['padded_sequences'], test_data['labels'])
    print(f"Test Loss: {loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    # Predict labels for test data: pick the highest-probability class
    predicted_labels = model.predict(test_data['padded_sequences'])
    predicted_labels = np.argmax(predicted_labels, axis=1)

    # Calculate the confusion matrix. Passing the full list of class IDs keeps
    # the matrix aligned with the tick labels even when a class is absent
    # from both the true and the predicted labels.
    true_labels = test_data['labels']
    confusion = confusion_matrix(
        true_labels,
        predicted_labels,
        labels=list(range(len(class_names))),
    )

    # Create a heatmap for the confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        confusion,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()

# Function to show training and validation history
def show_history(h):
    """Plot training and validation accuracy and loss side by side.

    Args:
        h: Object whose ``history`` dict holds per-epoch lists under
            ``'loss'``, ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'``.
    """
    metrics = h.history
    epoch_axis = range(len(metrics['loss']))

    # (subplot position, metric key, y-axis label, fixed y-limits or None)
    panels = (
        (1, 'accuracy', 'Accuracy', [0., 1.]),
        (2, 'loss', 'Loss', None),
    )

    plt.figure(figsize=(16, 6))
    for position, key, y_label, y_limits in panels:
        plt.subplot(1, 2, position)
        plt.plot(epoch_axis, metrics.get(key), label='Training')
        plt.plot(epoch_axis, metrics.get('val_' + key), label='Validation')
        if y_limits is not None:
            plt.ylim(y_limits)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
    plt.show()

# Function for the main preprocessing and training process
class _PhaseTimer:
    """Print a phase banner on entry and the elapsed time on clean exit."""

    def __init__(self, number, title):
        self.number = number
        self.title = title
        self.started_at = None

    def __enter__(self):
        print(f"Phase {self.number}: {self.title}")
        self.started_at = datetime.now()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Report completion only when the phase body did not raise.
        if exc_type is None:
            elapsed_time = datetime.now() - self.started_at
            print(f"Phase {self.number} completed in {elapsed_time}")
        return False


def preprocess_and_train(data_path):
    """Run the full pipeline: load, preprocess, train, evaluate, log, plot.

    Side effects, in order: the helper functions write ``test_text.txt``,
    ``tokenizer.pkl`` and ``best_model.h5``; the trained model is saved to
    ``emotion_model_trained.h5``; a summary is appended to
    ``model_perf_log.txt``; the confusion matrix and training curves are shown.

    Args:
        data_path: Path to the pickled dataset, passed to
            ``load_and_split_data``.
    """
    with _PhaseTimer(1, "Data Loading and Splitting"):
        dataframes = load_and_split_data(data_path)
        tweets, labels = get_tweets(dataframes['train'])

    # Phase numbering skips 2; the printed phase numbers are kept unchanged.
    with _PhaseTimer(3, "Tokenization"):
        tokenizer = tokenize_tweets(tweets)

    with _PhaseTimer(4, "Sequence Padding"):
        padded_sequences = preprocess_sequences(tokenizer, tweets)

    with _PhaseTimer(5, "Label Mapping"):
        train_labels = map_labels_to_ids(labels)

    with _PhaseTimer(6, "Model Creation"):
        # NOTE(review): this counts distinct labels in the training split.
        # It matches the label IDs only if every emotion occurs there;
        # confirm for small or filtered datasets.
        num_classes = len(set(labels))
        model = create_model(input_length=50, num_classes=num_classes)

    with _PhaseTimer(7, "Validation Data Preprocessing"):
        val_tweets, val_labels = get_tweets(dataframes['val'])
        val_sequences = preprocess_sequences(tokenizer, val_tweets)
        val_labels = map_labels_to_ids(val_labels)

    with _PhaseTimer(8, "Model Training"):
        h = train_model(
            model,
            {'padded_sequences': padded_sequences, 'labels': train_labels},
            {'padded_sequences': val_sequences, 'labels': val_labels},
            epochs=20,
        )

    # Save the model as it stands after the final training epoch.
    model.save("emotion_model_trained.h5")

    with _PhaseTimer(9, "Model Saving an Evaluation"):
        test_data = dataframes['test']
        test_tweets, test_labels = get_tweets(test_data)
        test_sequences = preprocess_sequences(tokenizer, test_tweets)
        test_labels = map_labels_to_ids(test_labels)
        evaluate_model(model, {'padded_sequences': test_sequences, 'labels': test_labels})

    with _PhaseTimer(10, "Logging and Saving"):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # The context manager closes the log even if a write or the summary
        # raises. UTF-8 is set explicitly because the summary text may
        # contain non-ASCII characters in some Keras versions, which a
        # platform-default encoding may be unable to write.
        with open("model_perf_log.txt", "a", encoding="utf-8") as log_file:
            log_file.write(f"Timestamp: {timestamp}\n")
            log_file.write("Model Architecture:\n")
            model.summary(print_fn=lambda x: log_file.write(x + '\n'))

            log_file.write("Model Performance:\n")
            log_file.write(f"Max Validation Accuracy: {max(h.history['val_accuracy']):.4f}\n")
            log_file.write(f"Min Validation Loss: {min(h.history['val_loss']):.4f}\n")
            log_file.write(f"Final Validation Accuracy: {h.history['val_accuracy'][-1]:.4f}\n")
            log_file.write(f"Final Validation Loss: {h.history['val_loss'][-1]:.4f}\n")
            log_file.write("\n")

    # Phase 11 is not timed: plt.show() may block until the window is closed.
    print("Phase 11: Visualization")
    show_history(h)


In [None]:

# Main execution
def main():
    """Run preprocessing and training on the default dataset file."""
    # Function-scope import keeps this change self-contained.
    import os

    # Build the path from its components so the separator matches the host
    # OS. The previous hard-coded backslash only resolved on Windows; there,
    # this yields the same string as before.
    data_path = os.path.join("dataset", "merged_training.pkl")
    preprocess_and_train(data_path)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


In [None]:
# import tensorflow as tf
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics import confusion_matrix  # Import the confusion_matrix function
# import pickle

# # Load the saved tokenizer
# tokenizer = None
# with open('tokenizer.pkl', 'rb') as f:
#     tokenizer = pickle.load(f)

# # Load the saved model
# model = tf.keras.models.load_model("emotion_model_trained.h5")

# # Load the original data and split it using the same function
# data_path = r"dataset\merged_training.pkl"
# dataframes = load_and_split_data(data_path)

# # Extract the test data (the randomly selected 10% test split, not the last 10% of rows)
# test_data = dataframes['test']

# # Preprocess the test data as you did during training
# test_tweets, test_labels = get_tweets(test_data)
# test_sequences = preprocess_sequences(tokenizer, test_tweets)
# test_labels = map_labels_to_ids(test_labels)

# # Evaluate the model on the test data
# loss, accuracy = model.evaluate(test_sequences, test_labels)
# print(f"Test Loss: {loss:.4f}")
# print(f"Test Accuracy: {accuracy:.4f}")

# # Predict labels for test data
# predicted_labels = model.predict(test_sequences)
# predicted_labels = np.argmax(predicted_labels, axis=1)

# # Calculate the confusion matrix
# true_labels = test_labels
# confusion = confusion_matrix(true_labels, predicted_labels)

# # Create a heatmap for the confusion matrix
# plt.figure(figsize=(8, 6))
# sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues', xticklabels=emotion_sequence, yticklabels=emotion_sequence)
# plt.xlabel('Predicted Labels')
# plt.ylabel('True Labels')
# plt.title('Confusion Matrix')
# plt.show()
