<a href="https://colab.research.google.com/github/UncleSamTech/301Lab1WalkThrough/blob/main/mrr_100_samples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
import numpy as np
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score,confusion_matrix, classification_report
import pickle
import time
from sklearn.utils.class_weight import compute_class_weight
from random import sample
import seaborn as sns
from concurrent.futures import ProcessPoolExecutor
import heapq

In [None]:
# Module-level placeholders for notebook state. None of the functions visible
# in this notebook read them: each one receives or creates its own
# `tokenizer`, `total_words`, `encompass`, `token_list` and `model`.
data = tokenizer = None
token_list, input_sequences, ne_input_sequences, encompass = [], [], [], []
total_words = 0
model = keras.Sequential()

In [None]:
def tokenize_data_inp_seq(file_name, result_path):
    """Fit a tokenizer on a training file and build n-gram prefix sequences.

    The file is read line by line, the lines are shuffled, the characters
    ``_``, ``>`` and ``<`` are replaced by the words UNDERSCORE / RIGHTANG /
    LEFTANG (presumably so the tokenizer's default punctuation filters do not
    drop them -- confirm), and everything is lower-cased.

    Side effects:
        Pickles the fitted tokenizer to
        ``f"{result_path}tokenized_file_50embedtime1.pickle"`` (plain string
        concatenation, so ``result_path`` is expected to end with a path
        separator). The target directory is created if it is missing.

    Args:
        file_name: Path of the UTF-8 training text file, one sample per line.
        result_path: Directory prefix for the tokenizer pickle.

    Returns:
        Tuple ``(encompass, total_words, tokenizer)`` where ``encompass`` is a
        list of token-id prefixes (every prefix of length >= 2 of every line),
        ``total_words`` is ``len(tokenizer.word_index) + 1`` and ``tokenizer``
        is the fitted Keras ``Tokenizer``.
    """
    with open(file_name, "r", encoding="utf-8") as rf:
        lines = rf.readlines()

    # Shuffle the training lines on every run.
    random.shuffle(lines)
    lines = [
        line.replace("_", "UNDERSCORE").replace(">", "RIGHTANG").replace("<", "LEFTANG").lower()
        for line in lines
    ]
    # Print a summary only: dumping every line floods the notebook output.
    print(f"Loaded {len(lines)} training lines; sample: {lines[:3]}")

    tokenizer = Tokenizer(oov_token='<oov>')
    tokenizer.fit_on_texts(lines)

    # Persist the tokenizer so the evaluation functions can reload it.
    tokenizer_path = f"{result_path}tokenized_file_50embedtime1.pickle"
    output_dir = os.path.dirname(tokenizer_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(tokenizer_path, "wb") as tk:
        pickle.dump(tokenizer, tk, protocol=pickle.HIGHEST_PROTOCOL)

    # +1 because index 0 is reserved (used for padding) and never assigned to
    # a word; the <oov> token already has its own entry in word_index.
    total_words = len(tokenizer.word_index) + 1
    print(f"Total words (vocabulary size): {total_words}")

    # Every prefix of length >= 2 becomes one sample: the last id is the
    # label, the ids before it are the context.
    encompass = []
    for each_line in lines:
        token_list = tokenizer.texts_to_sequences([each_line.strip()])[0]
        for i in range(1, len(token_list)):
            encompass.append(token_list[:i + 1])

    return encompass, total_words, tokenizer


In [None]:
def quick_iterate(list_words):
    """Map each word to its length and report the longest word.

    Args:
        list_words: A list of strings (or other sized, hashable items).

    Returns:
        Tuple ``(word_lengths, max_word_dict)``. ``word_lengths`` maps every
        word to ``len(word)``; ``max_word_dict`` holds the single longest word
        and its length (the first one encountered wins on ties). Both dicts
        are empty when ``list_words`` is not a list or is empty, instead of
        ``max()`` failing on an empty mapping.
    """
    # Validate once up front rather than once per element.
    if not isinstance(list_words, list) or not list_words:
        return {}, {}

    word_lengths = {word: len(word) for word in list_words}
    max_word = max(word_lengths, key=word_lengths.get)
    return word_lengths, {max_word: word_lengths[max_word]}

In [None]:
def pad_sequ(input_seq):
    """Pad all sequences to the length of the longest one.

    Padding is delegated to ``pad_sequences`` with ``padding='pre'``, i.e. the
    fill values are placed in front of shorter sequences.

    Args:
        input_seq: Non-empty collection of token-id sequences.

    Returns:
        Tuple ``(padded_array, longest)``: the padded sequences as a NumPy
        array and the length every row was padded to.
    """
    longest = max(map(len, input_seq))
    padded = pad_sequences(input_seq, maxlen=longest, padding='pre')
    return np.array(padded), longest

In [None]:
def prep_seq_labels(padded_seq, total_words):
    """Split padded n-gram sequences into model inputs and one-hot labels.

    Args:
        padded_seq: 2-D integer array; the last column is the label id and
            all preceding columns are the context.
        total_words: Expected number of classes (vocabulary size).

    Returns:
        Tuple ``(xs, ys, labels)``: the context columns, the one-hot encoded
        labels and the raw integer label column.

    Raises:
        ValueError: If ``padded_seq`` has no rows.
    """
    xs, labels = padded_seq[:, :-1], padded_seq[:, -1]
    if labels.size == 0:
        raise ValueError("padded_seq contains no sequences to build labels from")

    # Widen the class count when a label would not fit. After this step every
    # label is < total_words, so no further range check is needed.
    # NOTE(review): the widened value is not returned, so a caller that sizes
    # the model's output layer from its own total_words will disagree with
    # the width of `ys` whenever this branch fires.
    max_label_index = np.max(labels)
    if max_label_index >= total_words:
        print(f"Adjusting total_words from {total_words} to {max_label_index + 1} based on labels.")
        total_words = max_label_index + 1

    ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
    return xs, ys, labels

In [None]:
def predict_token_score(context, token, tokenz, model, maxlen):
    """Return the model's score for `token` being the next word after `context`.

    The context alone is encoded and fed to the model; the score is the entry
    of the model output at ``tokenz.word_index[token]``. (Appending the
    candidate to the context and reading the last output entry would score
    the last vocabulary index instead of the candidate.) This assumes the
    model emits one score per vocabulary index, as the model built in
    ``train_model_five_runs`` does.

    Args:
        context: Space-separated context string.
        token: Candidate next word.
        tokenz: Fitted tokenizer exposing ``word_index`` and
            ``texts_to_sequences``.
        model: Object with ``predict(array, verbose=0)``.
        maxlen: Padded n-gram length used in training; the model input width
            is ``maxlen - 1``.

    Returns:
        The score of `token`, or ``-1`` when the token is out of vocabulary or
        its index lies outside the model output.
    """
    token_index = tokenz.word_index.get(token)
    if token_index is None:
        return -1  # Out-of-vocabulary candidates get the lowest score.

    window = maxlen - 1
    context_ids = tokenz.texts_to_sequences([context])[0]
    if len(context_ids) < window:
        context_ids = pad_sequences([context_ids], maxlen=window, padding="pre")[0]
    else:
        # Keep only the most recent `window` ids.
        context_ids = context_ids[-window:]

    probabilities = model.predict(np.array([context_ids]), verbose=0)[0]
    if token_index >= len(probabilities):
        return -1  # Tokenizer and model disagree on the vocabulary size.
    return probabilities[token_index]

In [None]:
def evaluate_bilstm_mrr_chunked(test_data, maxlen, model, result_path, proj_number, chunk_size=4000):
    """Compute the top-10 mean reciprocal rank of next-token predictions.

    For every test line the last word is the target and the preceding words
    are the context. The model is run once per line and vocabulary words are
    ranked by predicted score; a target ranked r <= 10 contributes 1/r, any
    other target contributes 0.

    Side effects:
        Appends ``MRR,Evaluation_Time`` rows to
        ``bilstm_mrr_metrics_{proj_number}.txt`` inside ``result_path`` and
        prints progress after every processed chunk.

    Args:
        test_data: Path of the UTF-8 test file, one sample per line.
        maxlen: Padded n-gram length used in training (model input width is
            ``maxlen - 1``).
        model: Loaded model exposing ``predict``.
        result_path: Directory holding the tokenizer pickle and the metrics.
        proj_number: Identifier embedded in the metrics file name.
        chunk_size: Number of lines buffered before they are evaluated.

    Returns:
        The mean reciprocal rank (0 when no line could be evaluated).
    """
    top_k = 10

    # NOTE: pickle.load can execute arbitrary code; only load tokenizer
    # pickles that were produced by this notebook.
    with open(os.path.join(result_path, "tokenized_file_50embedtime1.pickle"), "rb") as tk:
        tokenz = pickle.load(tk)

    index_to_word = {index: word for word, index in tokenz.word_index.items()}
    total_cumulative_rr = 0
    total_count = 0

    start_time = time.time()
    current_chunk = []

    def process_chunk(chunk):
        nonlocal total_cumulative_rr, total_count
        for line in chunk:
            # Strip first so the trailing newline never sticks to the target.
            line = line.strip()
            if not line:
                continue

            line = line.replace("_", "UNDERSCORE").replace(">", "RIGHTANG").replace("<", "LEFTANG").lower()
            sentence_tokens = line.split()
            if len(sentence_tokens) < 2:
                continue

            context = " ".join(sentence_tokens[:-1])
            true_next_word = sentence_tokens[-1]

            # One forward pass yields the score of every vocabulary index.
            context_ids = tokenz.texts_to_sequences([context])[0]
            padded = pad_sequences([context_ids], maxlen=maxlen - 1, padding="pre")
            probabilities = model.predict(padded, verbose=0)[0]

            # Walk indices from best to worst, skipping any index without a
            # word (e.g. the padding index 0), until top_k words are found.
            ranked_words = []
            for index in np.argsort(probabilities)[::-1]:
                word = index_to_word.get(int(index))
                if word is None:
                    continue
                ranked_words.append(word)
                if len(ranked_words) == top_k:
                    break

            if true_next_word in ranked_words:
                total_cumulative_rr += 1 / (ranked_words.index(true_next_word) + 1)
            total_count += 1

    # Read and process the file in chunks.
    with open(test_data, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            current_chunk.append(line)
            if len(current_chunk) >= chunk_size:
                process_chunk(current_chunk)
                current_chunk = []
                print(f"Processed {i + 1} lines so far.")

        # Process any remaining lines in the last chunk.
        if current_chunk:
            process_chunk(current_chunk)

    mrr = total_cumulative_rr / total_count if total_count > 0 else 0
    print(f"Total MRR: {mrr}")
    time_spent = time.time() - start_time

    metrics_file = os.path.join(result_path, f"bilstm_mrr_metrics_{proj_number}.txt")
    os.makedirs(result_path, exist_ok=True)
    with open(metrics_file, "a") as blm:
        # Write the header only when the file is new or empty.
        if os.path.getsize(metrics_file) == 0:
            blm.write("MRR,Evaluation_Time\n")
        blm.write(f"{mrr},{time_spent:.2f}\n")

    return mrr

In [None]:
def compute_confusion_matrix(y_true, y_pred, result_path, proj_number, top_k=10):
    """Derive per-class and aggregate TP/FP/FN/TN counts and plot the totals.

    The multi-class confusion matrix (rows = true label, columns = predicted
    label) is reduced one-vs-rest per class:
    TP = diagonal entry, FP = column sum - TP, FN = row sum - TP,
    TN = grand total - (TP + FP + FN).

    Side effects:
        Writes ``tp_fp_fn_tn_label_val.txt`` (per class) and
        ``total_results_bilstm_tp_tn_fp_fn.txt`` (totals) into
        ``result_path`` and saves a 2x2 heat map of the totals as
        ``confusion_matrix_run_an_bilstm_tp_tn_fp_fn{proj_number}.pdf``.

    Args:
        y_true: Sequence of true labels.
        y_pred: Sequence of predicted labels, same length as ``y_true``.
        result_path: Output directory (created if missing).
        proj_number: Identifier appended to the PDF file name.
        top_k: Currently unused; kept so existing callers keep working.

    Returns:
        None.
    """
    if y_true is None or y_pred is None or len(y_true) == 0:
        print("No predictions available; skipping confusion matrix.")
        return

    # Sorted union of labels; passing it explicitly pins the row/column order
    # of the matrix to the order of `labels`.
    labels = np.unique(np.concatenate((y_true, y_pred)))

    print("\nComputing Confusion Matrix...")
    conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
    num_classes = conf_matrix.shape[0]
    print(f" number of classes {num_classes}")

    tp = np.diag(conf_matrix)
    # Column i holds everything predicted as class i, so it must be indexed
    # by the class itself rather than by a fixed column.
    fp = conf_matrix.sum(axis=0) - tp
    fn = conf_matrix.sum(axis=1) - tp
    tn = conf_matrix.sum() - (tp + fp + fn)

    total_tp, total_fp = int(tp.sum()), int(fp.sum())
    total_fn, total_tn = int(fn.sum()), int(tn.sum())

    os.makedirs(result_path, exist_ok=True)

    with open(f"{result_path}/tp_fp_fn_tn_label_val.txt", "w") as af:
        af.write("Class,TP,FP,FN,TN\n")  # Header
        for i, label in enumerate(labels):
            af.write(f"{str(label)},{tp[i]},{fp[i]},{fn[i]},{tn[i]}\n")

    with open(f"{result_path}/total_results_bilstm_tp_tn_fp_fn.txt", "w") as tot:
        tot.write("total_tn,total_fp,total_fn,total_tp,no_of_classes\n")
        tot.write(f"{total_tn},{total_fp},{total_fn},{total_tp},{num_classes}")

    # Aggregate 2x2 view: rows = actual, columns = predicted.
    totals_matrix = np.array([[total_tp, total_fn],
                              [total_fp, total_tn]])

    plt.figure(figsize=(6, 4))
    sns.heatmap(totals_matrix, annot=True, fmt='g', cmap='Blues', cbar=False,
                xticklabels=['Predicted Positive', 'Predicted Negative'],
                yticklabels=['Actual Positive', 'Actual Negative'])

    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig(f"{result_path}/confusion_matrix_run_an_bilstm_tp_tn_fp_fn{proj_number}.pdf")
    plt.close()


In [None]:
def predict_token(context, tokenz, load_mod, maxseqlen, v):
    """Predict the most likely next word for `context`.

    Args:
        context: Space-separated context string.
        tokenz: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (and optionally ``index_word``).
        load_mod: Loaded model exposing ``predict``.
        maxseqlen: Padded n-gram length used in training; the context is
            padded/truncated to ``maxseqlen - 1`` ids.
        v: Verbose flag; when truthy the predicted word is printed.

    Returns:
        The predicted word, ``""`` when the predicted index has no word in
        the vocabulary, or ``None`` when the context yields no token ids.
    """
    token_list = tokenz.texts_to_sequences([context.strip()])
    if not token_list or len(token_list[0]) == 0:
        print("Empty token list, unable to predict token.")
        return None

    padded_in_seq = pad_sequences([token_list[0]], maxlen=maxseqlen - 1, padding='pre')
    # Ensure input is a tensor with consistent shape.
    padded_in_seq = tf.convert_to_tensor(padded_in_seq)

    # verbose=0: this runs once per test line, so a progress bar per call
    # would flood the notebook output.
    predicted = load_mod.predict(padded_in_seq, verbose=0)
    pred_token_index = int(np.argmax(predicted, axis=-1)[0])

    # Prefer the tokenizer's own reverse map; otherwise invert word_index.
    index_word = getattr(tokenz, "index_word", None)
    if not index_word:
        index_word = {index: word for word, index in tokenz.word_index.items()}

    output_word = index_word.get(pred_token_index, "")
    if v and output_word:
        print(output_word)
    return output_word


In [None]:
def evaluate_bilstm(test_data, maxlen, model, result_path, proj_number, train_time, verbose=False):
    """Evaluate next-token prediction on a test file and log the metrics.

    For every test line the last word is the target and the preceding words
    form the context passed to ``predict_token``. Lines with fewer than two
    words, or whose context yields no prediction, are skipped.

    Side effects:
        Appends one row (accuracy, weighted precision/recall/F1, training
        time, evaluation time) to
        ``f"{result_path}bilstmmetrics_150embedtime1_{proj_number}_projects.txt"``,
        writing the header first when the file is new or empty.

    Args:
        test_data: Path of the UTF-8 test file, one sample per line.
        maxlen: Padded n-gram length used in training.
        model: Either a model file name relative to ``result_path`` (loaded
            with ``load_model``) or an already loaded model object.
        result_path: Directory prefix (string concatenation, so it is
            expected to end with a path separator).
        proj_number: Identifier embedded in the metrics file name.
        train_time: Training time recorded verbatim in the metrics row.
        verbose: Forwarded to ``predict_token``.

    Returns:
        Tuple ``(y_true, y_pred)`` of equally long lists, or ``(None, None)``
        when no prediction could be made.
    """
    y_true = []
    y_pred = []

    # Accept a file name (original behaviour) or an in-memory model.
    if isinstance(model, (str, os.PathLike)):
        loaded_model = load_model(f"{result_path}{model}", compile=False)
    else:
        loaded_model = model

    # NOTE: pickle.load can execute arbitrary code; only load tokenizer
    # pickles that were produced by this notebook.
    with open(f"{result_path}tokenized_file_50embedtime1.pickle", "rb") as tk:
        tokenz = pickle.load(tk)

    # Start the evaluation timer.
    start_time = time.time()

    with open(test_data, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for i, raw_line in enumerate(lines):
        line = raw_line.replace("_", "UNDERSCORE").replace(">", "RIGHTANG").replace("<", "LEFTANG").lower().strip()
        sentence_tokens = line.split(" ")

        # A target needs at least one context word in front of it.
        if len(sentence_tokens) >= 2:
            context = ' '.join(sentence_tokens[:-1])
            true_next_word = sentence_tokens[-1]

            predicted_next_word = predict_token(context, tokenz, loaded_model, maxlen, verbose)
            if predicted_next_word is not None:
                y_true.append(true_next_word)
                y_pred.append(predicted_next_word)

        if i % 500 == 0:
            print(f"Progress: {i} lines processed.")

    if not y_true or not y_pred:
        print("No valid predictions made.")
        # Two values, matching the success path and every caller's unpacking.
        return None, None

    time_spent = time.time() - start_time
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1score = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    metrics_file = f"{result_path}bilstmmetrics_150embedtime1_{proj_number}_projects.txt"
    needs_header = not os.path.exists(metrics_file) or os.path.getsize(metrics_file) == 0
    with open(metrics_file, "a") as blm:
        if needs_header:
            blm.write("accuracy,precision,recall,f1score,training_time,evaluation_time\n")
        blm.write(f"{accuracy},{precision},{recall},{f1score},{train_time},{time_spent:.2f}\n")

    return y_true, y_pred



In [None]:
def train_model_five_runs(total_words, max_seq, xs, ys, result_path, test_data, proj_number):
    """Train the BiLSTM next-token model, save it, then evaluate it.

    Each run builds Embedding -> Bidirectional(LSTM) -> softmax, trains it on
    ``(xs, ys)``, saves the model and its training history under
    ``result_path``, evaluates it with ``evaluate_bilstm`` and writes the
    confusion-matrix artefacts with ``compute_confusion_matrix``.

    Args:
        total_words: Vocabulary size; sizes the embedding and output layers.
        max_seq: Padded n-gram length; the model input width is
            ``max_seq - 1``.
        xs: Training inputs.
        ys: One-hot training labels.
        result_path: Directory prefix for all outputs (string concatenation,
            so it is expected to end with a path separator).
        test_data: Path of the test file passed to ``evaluate_bilstm``.
        proj_number: Identifier embedded in output file names.

    Returns:
        None.
    """
    print(tf.__version__)
    print("max length", max_seq)

    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        print(f"Default GPU device: {gpus[0]}")
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"Using GPU: {tf.test.gpu_device_name()}")
        except RuntimeError as e:
            # Raised when the GPUs were already initialised (e.g. the cell is
            # re-run in the same kernel). Training can still proceed with the
            # existing allocation policy, so do not abort.
            print(f"Error setting up GPU: {e}")
    else:
        print("No GPU available. Running on CPU.")

    model_file = f"main_bilstm_scratch_model_150embedtime_{proj_number}.keras"
    file_name = f"{result_path}{model_file}"

    # range(1, 2) yields a single run (run == 1).
    for run in range(1, 2):
        print(f"\nStarting run {run}...\n")
        start_time = time.time()

        # Fresh callbacks per run so no patience counter or best-weights
        # state carries over between runs.
        lr_scheduler = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=5, verbose=1)
        early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

        model = Sequential([
            Embedding(total_words, 100, input_shape=(max_seq - 1,)),
            Bidirectional(LSTM(150)),
            Dense(total_words, activation='softmax')
        ])
        adam = Adam(learning_rate=0.01)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

        if os.path.exists(file_name):
            os.remove(file_name)

        # Fit the model.
        history = model.fit(xs, ys, epochs=50, verbose=2, callbacks=[lr_scheduler, early_stopping])

        # Save the history.
        with open(f"{result_path}main_historyrec_150embedtime{run}.pickle", "wb") as hs:
            pickle.dump(history.history, hs)

        model.save(file_name)

        time_spent = time.time() - start_time
        print(f"Run {run} complete. Training time: {time_spent:.2f} seconds")

        # evaluate_bilstm builds the model path as result_path + name, so it
        # must receive the saved file's name, not the model object.
        results = evaluate_bilstm(test_data, max_seq, model_file, result_path, proj_number, time_spent)
        ytrue, ypred = results[0], results[1]
        if ytrue is None or ypred is None:
            print("Skipping confusion matrix: evaluation produced no predictions.")
            continue

        # The fourth argument is the project identifier used in the PDF name.
        compute_confusion_matrix(ytrue, ypred, result_path, proj_number)

In [None]:
def consolidate_data_train(filepath, result_path, test_data, proj_number, model_name):
    """Tokenize the training data and evaluate a previously saved model.

    The training file is tokenized only to obtain the padded sequence length
    the model expects; the saved model ``model_name`` is then evaluated on
    ``test_data`` and the confusion-matrix artefacts are written.

    NOTE(review): ``tokenize_data_inp_seq`` re-fits the tokenizer on shuffled
    lines and overwrites the pickle that ``evaluate_bilstm`` loads. Confirm
    that the resulting word indices match the ones the saved model was
    trained with.

    Args:
        filepath: Path of the training text file.
        result_path: Directory prefix holding the model and receiving outputs.
        test_data: Path of the test text file.
        proj_number: Identifier embedded in output file names.
        model_name: File name of the saved model, relative to ``result_path``.

    Returns:
        None.
    """
    input_seq, _total_words, _tokenizer = tokenize_data_inp_seq(filepath, result_path)
    _padded_seq, max_len = pad_sequ(input_seq)

    # One-hot labels (prep_seq_labels) are only needed for training, which
    # this entry point does not run, so they are not materialised here.
    results = evaluate_bilstm(test_data, max_len, model_name, result_path, proj_number, "0")
    ytrue, ypred = results[0], results[1]
    if ytrue is None or ypred is None:
        print("Skipping confusion matrix: evaluation produced no predictions.")
        return

    # The fourth argument is the project identifier used in the PDF name.
    compute_confusion_matrix(ytrue, ypred, result_path, proj_number)

In [None]:
# Evaluate the saved 500-project model on the 20-project test split.
consolidate_data_train(
    filepath="/content/datasets/scratch_train_data_500_projects.txt",
    result_path="/content/result_path/",
    test_data="/content/datasets/scratch_test_data_20.txt",
    proj_number="500",
    model_name="main_bilstm_scratch_model_150embedtime_500.keras",
)

Buffered data was truncated after reaching the output size limit.