<a href="https://colab.research.google.com/github/SZAftabi/User-Oriented-RAG-CQA/blob/main/SE-PQA/Evaluations_on_SE_PQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive under /content/drive; every dataset path in this notebook
# points below /content/drive/MyDrive/SE-PQA.
# force_remount=True is passed to the Colab helper (presumably to refresh an
# already existing mount — see the google.colab documentation).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Data Preparation


In [None]:
import re
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Load the table of links between posts and preview its first rows.
# The following cells read its 'LinkType', 'PostId' and 'RelatedPostId' columns.
PostLinks = pd.read_csv('/content/drive/MyDrive/SE-PQA/postlinks.csv')
PostLinks.head(3)

In [None]:
# Keep only the links whose LinkType is 'duplicated'; these rows are the
# positive (duplicate-question) pairs used by the rest of the notebook.
is_duplicate_link = PostLinks['LinkType'].eq('duplicated')
DuplicatePairIDs = PostLinks.loc[is_duplicate_link]
display(DuplicatePairIDs.head(3))
print("Total number of duplicate pairs = ", len(DuplicatePairIDs))

In [None]:
# Load the answers, restricted to the three columns used below:
# 'ParentId' (joined against question ids), 'Id' and 'Text'.
columns = ['ParentId', 'Id', 'Text']
Answers = pd.read_csv('/content/drive/MyDrive/SE-PQA/answers.csv', usecols=columns)
Answers.head(3)

In [None]:
# Keep the duplicate pairs for which BOTH questions appear as the ParentId of
# at least one row in Answers. (Restricting to *accepted* answers happens later,
# once the question table has been joined in.)
answered_question_ids = Answers['ParentId']
post_has_answer = DuplicatePairIDs['PostId'].isin(answered_question_ids)
related_has_answer = DuplicatePairIDs['RelatedPostId'].isin(answered_question_ids)
Pairs_with_AcceptedAnswers = DuplicatePairIDs[post_has_answer & related_has_answer]
len(Pairs_with_AcceptedAnswers)

In [None]:
# Attach the answers of the first question of each pair (PostId == Answers.ParentId).
# A question with several answers yields one row per answer.
# NOTE(review): the 'Id' column kept below is assumed to be the answer id coming from
# Answers. If postlinks.csv also carries an 'Id' column, suffixes=('', '_Post') would
# leave the PostLinks value under 'Id' and the answer id under 'Id_Post' — TODO confirm.
Pairs_with_AcceptedAnswers = Pairs_with_AcceptedAnswers.merge(
    Answers, left_on='PostId', right_on='ParentId', suffixes=('', '_Post')
)
display(Pairs_with_AcceptedAnswers.head(3))
# Drop the join helper columns; only the pair ids, link type and answer id/text remain.
Pairs_with_AcceptedAnswers = Pairs_with_AcceptedAnswers[['PostId', 'RelatedPostId', 'LinkType', 'Id', 'Text']]

In [None]:
# Attach the answers of the second question of each pair (RelatedPostId == Answers.ParentId).
# 'Id' and 'Text' now exist on both sides, so the suffixes turn them into
# Id_Post/Text_Post (first question) and Id_Related/Text_Related (second question).
Pairs_with_AcceptedAnswers = Pairs_with_AcceptedAnswers.merge(
    Answers, left_on='RelatedPostId', right_on='ParentId', suffixes=('_Post', '_Related')
)
Pairs_with_AcceptedAnswers = Pairs_with_AcceptedAnswers[['PostId', 'RelatedPostId', 'LinkType', 'Id_Post', 'Text_Post', 'Id_Related', 'Text_Related']]
Pairs_with_AcceptedAnswers.head(3)

In [None]:
# Switch to the Q1/Q2 naming used from here on: the link type becomes the
# 'entailment' label and each question gets its answer id and answer body.
pair_column_names = {
    'PostId': 'id_Q1',
    'RelatedPostId': 'id_Q2',
    'LinkType': 'entailment',
    'Id_Post': 'answer_id_Q1',
    'Text_Post': 'answer_body_Q1',
    'Id_Related': 'answer_id_Q2',
    'Text_Related': 'answer_body_Q2',
}
Pairs_with_AcceptedAnswers = Pairs_with_AcceptedAnswers.rename(columns=pair_column_names)
Pairs_with_AcceptedAnswers.head(3)

In [None]:
# Load the question table, restricted to the columns joined onto the pairs below.
columns = ['Id', 'AcceptedAnswerId', 'CreationDate', 'Body', 'Tags', 'Title', 'AccountId']      # 'Community'
Questions = pd.read_csv('/content/drive/MyDrive/SE-PQA/questions_with_answer.csv', usecols=columns)
display(Questions.head(3))
print("Total number of questions with accepted answer = ", len(Questions))

In [None]:
# Join the metadata of the first question (id_Q1) onto every pair. The inner
# join drops pairs whose first question is missing from Questions.
columns_after_q1_join = [
    'id_Q1', 'id_Q2', 'entailment',
    'answer_id_Q1', 'answer_body_Q1',
    'answer_id_Q2', 'answer_body_Q2',
    'CreationDate', 'Body', 'Title',
    'Tags', 'AcceptedAnswerId', 'AccountId',
]
MyData = Pairs_with_AcceptedAnswers.merge(
    Questions,
    how='inner',
    left_on='id_Q1',
    right_on='Id',
    suffixes=('', '_Q1'),
)[columns_after_q1_join]
MyData.head(3)

In [None]:
# Join the metadata of the second question (id_Q2). The question columns now
# exist on both sides, so the suffixes split them into *_Q1 and *_Q2.
columns_after_q2_join = [
    'id_Q1', 'id_Q2', 'entailment',
    'answer_id_Q1', 'answer_body_Q1',
    'answer_id_Q2', 'answer_body_Q2',
    'CreationDate_Q1', 'Body_Q1', 'Title_Q1',
    'Tags_Q1', 'AcceptedAnswerId_Q1', 'AccountId_Q1',
    'CreationDate_Q2', 'Body_Q2', 'Title_Q2',
    'Tags_Q2', 'AcceptedAnswerId_Q2', 'AccountId_Q2',
]
MyData = MyData.merge(
    Questions,
    how='inner',
    left_on='id_Q2',
    right_on='Id',
    suffixes=('_Q1', '_Q2'),
)[columns_after_q2_join]
MyData.head(3)

In [None]:
# Lower-case the question columns and rename AccountId to userid, once for
# each side of the pair.
question_column_names = {
    # first question
    'CreationDate_Q1': 'creationDate_Q1',
    'Body_Q1': 'body_Q1',
    'Title_Q1': 'title_Q1',
    'Tags_Q1': 'tags_Q1',
    'AccountId_Q1': 'userid_Q1',
    'AcceptedAnswerId_Q1': 'acceptedAnswerId_Q1',
    # second question
    'CreationDate_Q2': 'creationDate_Q2',
    'Body_Q2': 'body_Q2',
    'Title_Q2': 'title_Q2',
    'Tags_Q2': 'tags_Q2',
    'AccountId_Q2': 'userid_Q2',
    'AcceptedAnswerId_Q2': 'acceptedAnswerId_Q2',
}
MyData = MyData.rename(columns=question_column_names)
MyData.head(3)

In [None]:
# Keep only the rows in which the joined answer IS the accepted answer, for
# both questions of the pair.
# .copy() makes the result an independent frame: later cells assign into it,
# and writing into a boolean-filtered view triggers SettingWithCopyWarning.
MyData = MyData[
    (MyData["acceptedAnswerId_Q1"] == MyData["answer_id_Q1"]) &
    (MyData["acceptedAnswerId_Q2"] == MyData["answer_id_Q2"])
].copy()
MyData.head(4)

In [None]:
# Checkpoint: persist the joined (not yet cleaned) pair table to Drive.
MyData.to_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_All.pkl")
print("The number of rows in the dataset = ", len(MyData))

In [None]:
def remove_html_tags(text):
    """Return the text content of an HTML fragment, with all markup removed."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def clean_text(text):
    """Strip HTML markup and URL-like tokens from ``text``.

    Tokens starting with ``http``/``https`` or ``www`` (up to the next
    whitespace) are deleted, then leading and trailing whitespace is trimmed.
    """
    plain = remove_html_tags(text)
    without_links = re.sub(r'http\S+|www\S+|https\S+', '', plain, flags=re.MULTILINE)
    return without_links.strip()

In [None]:
# Clean both question bodies. clean_text() already strips the HTML markup, so
# it is applied exactly once: running a second HTML parse over already-decoded
# text would treat literal angle brackets (e.g. "&lt;div&gt;" decoded to
# "<div>") as tags and silently delete them.
# assign() builds a new frame instead of writing into a filtered one, which
# avoids SettingWithCopyWarning and keeps the cell re-runnable.
MyData = MyData.assign(
    body_Q1=MyData['body_Q1'].apply(clean_text),
    body_Q2=MyData['body_Q2'].apply(clean_text),
)

In [None]:
# Checkpoint: persist the cleaned pair table to Drive.
MyData.to_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_All_cleaned.pkl")
display(MyData.head(3))
print("The number of rows in the dataset = ", len(MyData))

In [None]:
# Resume point: reload the cleaned pair table written by the previous cell.
MyData = pd.read_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_All_cleaned.pkl")
display(MyData.head(3))
print("The number of rows in the dataset = ", len(MyData))

In [None]:
# Keep only the pairs in which both question bodies have fewer than 500
# whitespace-separated words.
q1_is_short = MyData['body_Q1'].str.split().apply(len) < 500
MyData2 = MyData[q1_is_short]
q2_is_short = MyData2['body_Q2'].str.split().apply(len) < 500
MyData2 = MyData2[q2_is_short]
display(MyData2.head(3))
print("The number of rows in the dataset = ", len(MyData2))

In [None]:
# Checkpoint: persist the length-filtered pair table to Drive.
MyData2.to_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_All_cleaned_Len500.pkl")
display(MyData2.head(3))
print("The number of rows in the dataset = ", len(MyData2))

In [None]:
# Resume point: reload the length-filtered pair table written by the previous cell.
MyData2 = pd.read_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_All_cleaned_Len500.pkl")
display(MyData2.head(3))
print("The number of rows in the dataset = ", len(MyData2))

In [None]:
# Every remaining row is a duplicate pair: relabel 'duplicated' as the 'positive' class.
MyData2['entailment'] = MyData2['entailment'].replace('duplicated', 'positive')
MyData2.head(3)

In [None]:
# Draw 5000 positive pairs; random_state fixes the draw so re-runs select the same rows.
# sample() raises if MyData2 holds fewer than 5000 rows.
positive_samples = MyData2.sample(n=5000, random_state=42)
positive_samples.head(3)

In [None]:
# Pairs already known to be duplicates (taken from the full cleaned table).
existing_pairs = MyData[['id_Q1', 'id_Q2']]

# Candidate negatives: every (Q1, Q2) combination of the questions appearing
# in the sampled positives, minus a question paired with itself.
q1_ids = positive_samples['id_Q1'].unique()
q2_ids = positive_samples['id_Q2'].unique()
candidate_index = pd.MultiIndex.from_product([q1_ids, q2_ids], names=['id_Q1', 'id_Q2'])
all_pairs = candidate_index.to_frame(index=False)
all_pairs = all_pairs.loc[all_pairs['id_Q1'].ne(all_pairs['id_Q2'])]

all_pairs.head(3)

In [None]:
# Restrict the known duplicate pairs to those whose two ids both occur among
# the candidate pairs; all other rows cannot match any candidate.
q1_is_candidate = existing_pairs['id_Q1'].isin(all_pairs['id_Q1'])
q2_is_candidate = existing_pairs['id_Q2'].isin(all_pairs['id_Q2'])
existing_pairs = existing_pairs[q1_is_candidate & q2_is_candidate]

In [None]:
# Draw 10000 candidate pairs (seeded), then left-join them with the known
# duplicate pairs. indicator=True adds a '_merge' column whose value is
# 'left_only' for candidates that are NOT known duplicates.
all_pairs = all_pairs.sample(n=10000, random_state=42)
merged = all_pairs.merge(existing_pairs, how='left', indicator=True)

In [None]:
merged.head(3)

In [None]:
len(merged[merged['_merge'] == 'left_only'])

In [None]:
# Negative pairs = sampled candidates that are not known duplicates; keep 5000.
is_unlinked = merged['_merge'] == 'left_only'
negative_samples = (
    merged[is_unlinked]
    .drop(columns=['_merge'])
    .sample(n=5000, random_state=42)
)

# Attach the question/answer fields of each side. One row per question id is
# taken from the full table so the joins cannot multiply rows.
cols_Q1 = ['id_Q1', 'answer_id_Q1', 'answer_body_Q1', 'creationDate_Q1',
           'body_Q1', 'title_Q1', 'tags_Q1', 'acceptedAnswerId_Q1', 'userid_Q1']
q1_info = MyData[cols_Q1].drop_duplicates(subset='id_Q1')
negative_samples = negative_samples.merge(q1_info, on='id_Q1', how='left')

cols_Q2 = ['id_Q2', 'answer_id_Q2', 'answer_body_Q2', 'creationDate_Q2',
           'body_Q2', 'title_Q2', 'tags_Q2', 'acceptedAnswerId_Q2', 'userid_Q2']
q2_info = MyData[cols_Q2].drop_duplicates(subset='id_Q2')
negative_samples = negative_samples.merge(q2_info, on='id_Q2', how='left')

negative_samples['entailment'] = 'negative'

In [None]:
display(negative_samples.head(3))

In [None]:
# Put the negative rows in the same column order as the positives, then stack
# both classes into one table with a fresh 0..n-1 index.
negative_samples = negative_samples[positive_samples.columns]
MyData3 = pd.concat([negative_samples, positive_samples], ignore_index=True)
display(MyData3.head(3))

In [None]:
# Shuffle the rows (seeded) so the two classes are interleaved, then persist the final dataset.
MyData3 = MyData3.sample(frac=1, random_state=42).reset_index(drop=True)
MyData3.to_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500.pkl")
display(MyData3.head(10))
print("The number of samples in the final data = ", len(MyData3))

In [None]:
# Resume point: reload the final balanced dataset written by the previous cell.
MyData3 = pd.read_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500.pkl")
display(MyData3.head(10))
print("The number of samples in the final data = ", len(MyData3))

# Tag Clustering

In [None]:
!pip install -q networkx

In [None]:
import pandas as pd
import re
from collections import Counter
from collections import OrderedDict
import networkx as nx
import matplotlib.pyplot as plt
import pickle

In [None]:
# Load only the 'Tags' column of the question table; it is the sole input of the tag clustering.
columns = ['Tags']
Tags = pd.read_csv('/content/drive/MyDrive/SE-PQA/questions_with_answer.csv', usecols=columns)
display(Tags.head(3))
print("Total number of questions with accepted answer = ", len(Tags))

In [None]:
# Each Tags cell is a string of tags wrapped in angle brackets; the regex pulls
# out the text between every '<' and '>' pair, giving one list of tags per post.
tags_lists = Tags['Tags'].str.findall(r"<(.*?)>")
all_posts = tags_lists.tolist()
print("Number of posts : ", len(all_posts))

In [None]:
# Preview a few posts only: echoing the complete list would write one line per
# post into the cell output and bloat the saved notebook.
all_posts[:5]

In [None]:
# Collect every distinct tag in order of first appearance. The resulting list
# order defines the row/column indices of the co-occurrence matrix.
first_seen_tags = OrderedDict()
for post in all_posts:
    for tag in post:
        first_seen_tags.setdefault(tag)
unique_tags = list(first_seen_tags)
print("Number of unique tags: ", len(unique_tags))

In [None]:
# Checkpoint: persist the raw tag column to Drive.
Tags.to_pickle("/content/drive/MyDrive/SE-PQA/tags_QwAs.pkl")

In [None]:
# Resume point: reload the raw tag column written by the previous cell.
# Note that all_posts / unique_tags are NOT rebuilt here; re-run the cells that derive them.
Tags = pd.read_pickle("/content/drive/MyDrive/SE-PQA/tags_QwAs.pkl")

In [None]:
import numpy as np

In [None]:
# Build the symmetric tag co-occurrence matrix; row/column k is unique_tags[k].
# A dict gives the position of a tag in O(1); unique_tags.index() inside the
# innermost loop scanned the whole tag list for every tag pair of every post.
tag_to_index = {tag: position for position, tag in enumerate(unique_tags)}
co_occurrence_matrix = np.zeros((len(unique_tags), len(unique_tags)))

for post in all_posts:
    post_indices = [tag_to_index[tag] for tag in set(post)]                    # distinct tags of this post
    for index1 in post_indices:
        for index2 in post_indices:
            if index1 != index2:
                # Both orderings of a pair are visited and each visit updates
                # both cells, so one shared post adds 2 to each cell. This is
                # kept on purpose so the stored matrix values stay unchanged.
                co_occurrence_matrix[index1, index2] += 1
                co_occurrence_matrix[index2, index1] += 1

In [None]:
# Preview the top-left corner of the matrix and its shape, then persist the
# matrix to Drive (wrapped in a DataFrame so it can be pickled via pandas).
display(co_occurrence_matrix[0:10, 0:10])
print(co_occurrence_matrix.shape)

co_occurrence_df = pd.DataFrame(co_occurrence_matrix)
co_occurrence_matrix_file = "/content/drive/MyDrive/SE-PQA/co_occurrence_matrix.pkl"
co_occurrence_df.to_pickle(co_occurrence_matrix_file)

In [None]:
# Resume point: reload the co-occurrence matrix as a NumPy array.
# The row/column order is only meaningful together with the same unique_tags
# list that was used when the matrix was built.
co_occurrence_matrix_file = "/content/drive/MyDrive/SE-PQA/co_occurrence_matrix.pkl"
co_occurrence_df = pd.read_pickle(co_occurrence_matrix_file)
co_occurrence_matrix = co_occurrence_df.to_numpy()

In [None]:
# Build the weighted, undirected tag graph: one edge per pair of tags with a
# positive co-occurrence count. Tags that never co-occur with another tag get
# no edge and are therefore absent from the graph.
# For each row only the columns to the right of the diagonal are searched, in
# one vectorised call, instead of testing every (i, j) pair in Python. Edges
# are still inserted in the same (i ascending, j ascending) order, so the node
# order of G — which later fixes the embedding row order — is unchanged.
G = nx.Graph()
num_tags = len(unique_tags)
for i in range(num_tags):
    row = co_occurrence_matrix[i]
    for offset in np.flatnonzero(row[i + 1:num_tags] > 0):
        j = i + 1 + int(offset)
        G.add_edge(unique_tags[i], unique_tags[j], weight=row[j])

In [None]:
# Checkpoint: persist the tag graph to Drive.
with open("/content/drive/MyDrive/SE-PQA/tag_graph.pkl", "wb") as f:
    pickle.dump(G, f)

In [None]:
# Resume point: reload the tag graph. pickle.load executes arbitrary code from
# the file, so only load a file this notebook wrote itself.
with open("/content/drive/MyDrive/SE-PQA/tag_graph.pkl", "rb") as f:
    G = pickle.load(f)

In [None]:
# Per-tag degree in the co-occurrence graph, with summary statistics.
degrees = dict(G.degree())
num_nodes = len(G.nodes)

degree_values = list(degrees.values())
average_degree = sum(degree_values) / len(degrees)
max_degree = max(degree_values)
min_degree = min(degree_values)

print(f"Number of Nodes: {num_nodes}")
print(f"Average Degree: {average_degree}")
print(f"Maximum Degree: {max_degree}")
print(f"Minimum Degree: {min_degree}")

plt.figure(figsize=(8.5, 4))

# From here on `degrees` holds the sorted distinct degree values and `counts`
# the number of nodes having each of them.
degree_counts = Counter(degrees.values())
degrees, counts = zip(*sorted(degree_counts.items()))

# Two stacked panels: the full distribution on top and, below it, a closer
# look at the 80 smallest degree values.
# Each entry: (subplot position, number of leading bars or None for all, x label)
panels = [(1, None, None), (2, 80, 'Degree')]
for position, bar_limit, x_label in panels:
    axis = plt.subplot(2, 1, position)
    axis.bar(degrees[:bar_limit], counts[:bar_limit], alpha=0.8, color="cyan")
    if x_label is not None:
        axis.set_xlabel(x_label)
    axis.set_ylabel("Frequency")
    for side in ('top', 'right'):
        axis.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig('/content/drive/MyDrive/SE-PQA/Graph_Degree_Distribution.tiff', dpi=300)
plt.show()

In [None]:
!pip install --upgrade node2vec

In [None]:
from node2vec import Node2Vec

In [None]:
# node2vec / Word2Vec hyper-parameters. The first group (walk_length, num_walks,
# dimensions, num_workers, p, q, seed) is passed to the Node2Vec constructor;
# the second group (window, min_count, sg, epochs, alpha) is passed to .fit().
# Trailing "#..." values are alternatives left in place by the author.
walk_length = 10 #80
num_walks = 30 #60 #50 #20...
dimensions = 128
window = 10
min_count = 1
num_workers = 2
sg = 1
epochs = 20
alpha = 1e-3
p = 1
q = 0.5 #16
seed = 42

In [None]:
# Set up node2vec on the tag graph; weight_key='weight' names the edge attribute
# that add_edge() stored the co-occurrence count under.
node2vec_instance = Node2Vec(
    G,
    dimensions=dimensions,
    walk_length=walk_length ,
    num_walks=num_walks,
    workers=num_workers,
    p = p,
    q = q,
    seed = seed,
    weight_key = 'weight'
  )
print("Done!")

In [None]:
# Train the embedding model on the node2vec instance.
# NOTE(review): the keyword arguments are presumably forwarded to gensim's
# Word2Vec (window, min_count, sg, epochs, alpha, batch_words) — confirm in
# the node2vec documentation. batch_words=4 is a very small batch size.
model = node2vec_instance.fit(
    window=window,
    min_count=min_count,
    sg=sg,
    # compute_loss=True,
    epochs = epochs,
    alpha = alpha,
    batch_words=4,
  )

In [None]:
# Map every graph node (tag) to its learned vector. The dict preserves the
# iteration order of G.nodes(), so tag_names[k] belongs to the k-th embedding.
tag_embeddings = {node: model.wv[node] for node in G.nodes()}
tag_names = list(tag_embeddings.keys())

In [None]:
# Persist the embeddings (word2vec text format), the trained model and the tag name order.
model.wv.save_word2vec_format(f"/content/drive/MyDrive/SE-PQA/n2v_tag_embeddings.txt")
model.save(f"/content/drive/MyDrive/SE-PQA/Node2vec_model.txt")


# The tag names are stored as a one-column DataFrame so pandas can pickle them.
tag_names_pckl = pd.DataFrame(tag_names)
tag_names_file_name = f"/content/drive/MyDrive/SE-PQA/n2v_tag_names.pkl"
tag_names_pckl.to_pickle(tag_names_file_name)

In [None]:
# Stack the per-tag vectors into one array (row k = k-th tag of tag_embeddings) and persist it.
Node2vec_embeddings = np.array(list(tag_embeddings.values()))

Node2vec_embeddings_pckl = pd.DataFrame(Node2vec_embeddings)
Node2vec_file_name = f"/content/drive/MyDrive/SE-PQA/n2v_embeddings.pkl"
Node2vec_embeddings_pckl.to_pickle(Node2vec_file_name)

In [None]:
print("Done!")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    precision_recall_fscore_support
    )

In [None]:
# Build the samples for the edge-prediction sanity check of the embeddings:
# every existing edge is a positive (label 1) and every ordered pair of
# DISTINCT tags that are not connected is a negative (label 0).
# A tag paired with itself is excluded: it is trivially "not an edge" and
# would add degenerate negatives made of one embedding repeated twice.
# NOTE: positive_samples / negative_samples reuse the names of the DataFrames
# from the data-preparation section and overwrite them.
graph_nodes = list(G.nodes())                                                   # built once, not per outer iteration
edges = list(G.edges())
non_edges = [(i, j) for i in graph_nodes for j in graph_nodes if i != j and not G.has_edge(i, j)]
positive_samples = [(tag_embeddings[i], tag_embeddings[j], 1) for i, j in edges]
negative_samples = [(tag_embeddings[i], tag_embeddings[j], 0) for i, j in non_edges]

In [None]:
# Edge-prediction sanity check: can a logistic regression tell connected tag
# pairs from unconnected ones, given the two concatenated embeddings?

# Number of negatives to use. NOTE(review): presumably chosen to match the
# number of positive edges — confirm against len(positive_samples).
NUM_NEGATIVE_SAMPLES = 296748
sampling_rng = np.random.default_rng(42)

# Draw the negatives uniformly at random. negative_samples is ordered by the
# first node of each pair, so taking its first N entries would only cover
# pairs that start at the first few nodes of the graph.
num_negatives = min(NUM_NEGATIVE_SAMPLES, len(negative_samples))
chosen_negatives = sampling_rng.choice(len(negative_samples), size=num_negatives, replace=False)
all_samples2 = positive_samples + [negative_samples[k] for k in chosen_negatives]

# Seeded shuffle (the previous global np.random.shuffle call was unseeded, so
# results changed between runs).
shuffled_order = sampling_rng.permutation(len(all_samples2))
all_samples2 = [all_samples2[k] for k in shuffled_order]

X = np.array([(np.concatenate((i, j))) for i, j, _ in all_samples2])
y = np.array([label for _, _, label in all_samples2])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy for edge prediction: {accuracy:.2f}")

# Per-class precision, recall, F-score and support.
print(precision_recall_fscore_support(y_test, predictions))

In [None]:
# Resume point: reload the embedding matrix saved earlier; node_embeddings is
# the name used by the clustering cells below.
Node2vec_file_name = f"/content/drive/MyDrive/SE-PQA/n2v_embeddings.pkl"
Node2vec_embeddings_pckl = pd.read_pickle(Node2vec_file_name)
Node2vec_embeddings = Node2vec_embeddings_pckl.to_numpy()
node_embeddings = Node2vec_embeddings
node_embeddings.shape

In [None]:
!git clone -q https://github.com/rapidsai/rapidsai-csp-utils.git                   # Fast t-SNE
!python rapidsai-csp-utils/colab/pip-install.py
import cudf
import cuml
import copy
cuml.__version__
from sklearn.cluster import AgglomerativeClustering
from cuml.manifold import TSNE
from scipy.spatial.distance import cdist
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score
    )
from scipy.cluster.hierarchy import (
    dendrogram,
    linkage,
    fcluster
    )

In [None]:
def hierarchical_clustering(embeddings, n_clusters, metric, dist_threshold, linkage, full_tree):
    """Run agglomerative clustering and return ``(labels, fitted_model)``.

    Parameters
    ----------
    embeddings : array-like
        Samples to cluster, passed unchanged to ``AgglomerativeClustering.fit``.
    n_clusters : int or None
        Number of clusters. Only used when ``dist_threshold`` is None; when a
        threshold is given the threshold decides where the tree is cut, as
        before (the estimator is then built with ``n_clusters=None``).
    metric : str
        Distance metric passed to the estimator.
    dist_threshold : float or None
        Linkage distance at or above which clusters are not merged.
    linkage : str
        Linkage criterion passed to the estimator.
    full_tree : bool or 'auto'
        Passed as ``compute_full_tree``.

    Returns
    -------
    tuple
        ``(labels, model)`` where ``labels`` is ``model.labels_`` and ``model``
        is the fitted estimator.
    """
    agglomerative_cluster = AgglomerativeClustering(
        # The two stopping criteria are alternatives, so n_clusters is only
        # forwarded when no distance threshold was requested.
        n_clusters=n_clusters if dist_threshold is None else None,
        metric=metric,
        distance_threshold=dist_threshold,
        linkage=linkage,
        compute_full_tree=full_tree,
    )
    # Fit once. The previous fit_predict() followed by fit() built the same
    # tree twice; labels_ of the fitted model is what fit_predict() returns.
    model = agglomerative_cluster.fit(embeddings)
    return model.labels_, model

In [None]:
def perform_tsne(embeddings, perplexity):
    """Project ``embeddings`` to two dimensions with t-SNE (Barnes-Hut method).

    Only ``perplexity`` is configurable; every other setting is fixed below,
    including ``random_state=42``.
    """
    tsne_options = dict(
        n_components=2,
        random_state=42,
        perplexity=perplexity,
        n_iter=10000,
        learning_rate=300.0,
        n_iter_without_progress=3000,
        early_exaggeration=50,
        method='barnes_hut',
    )
    reducer = TSNE(**tsne_options)
    return reducer.fit_transform(embeddings)

In [None]:
def extract_cluster_statistics(embeddings_2d, cluster_labels, tags, tag_frequency_dict):
    """Summarise every cluster and score the clustering on the 2-D projection.

    Parameters
    ----------
    embeddings_2d : array
        One 2-D point per tag, in the same order as ``tags``; indexed with a
        boolean mask, so it must support NumPy-style masking.
    cluster_labels : array
        Cluster label of every tag (compared element-wise with ``==``).
    tags : sequence of str
        Tag names, aligned with ``embeddings_2d`` and ``cluster_labels``.
    tag_frequency_dict : dict
        Tag -> number of occurrences; missing tags count as 0.

    Returns
    -------
    tuple
        ``(cluster_stats, silhouette, calinski_harabasz, davies_bouldin, data,
        n_clusters)``. ``cluster_stats`` is indexed by cluster label and holds
        the tag nearest to the cluster centre, the three most frequent tags,
        the mean/variance of the distances to the nearest-to-centre tag and
        the cluster size. ``data`` has one row per tag (X, Y, Cluster, Tag,
        Indexes), where ``Cluster`` is the cluster's most frequent tag.

    Notes
    -----
    A cluster with fewer than three tags repeats its last available tag for
    the missing 2nd/3rd rank (this case used to raise ``ValueError``).
    """
    unique_labels = np.unique(cluster_labels)                                   # computed once, reused below
    cluster_means, cluster_variances, cluster_sizes = [], [], []
    representative_samples, representative_samples_freq = {}, {}
    representative_samples_freq_2nd = {}
    representative_samples_freq_3nd = {}
    cluster_frames = []                                                         # one frame per cluster, concatenated once at the end

    for cluster_label in unique_labels:
        indexes = [i for i, lbl in enumerate(cluster_labels) if lbl == cluster_label]
        samples_tags = [tags[i] for i in indexes]
        samples_2d = embeddings_2d[cluster_labels == cluster_label]

        # Representative by geometry: the tag closest to the cluster centre.
        cluster_center = np.mean(samples_2d, axis=0)
        distances = cdist(samples_2d, [cluster_center])
        nearest_sample_idx = np.argmin(distances)
        representative_samples[cluster_label] = samples_tags[nearest_sample_idx]

        # Representatives by popularity: the three most frequent tags. The
        # sort is stable, so ties keep their original order — the same result
        # as repeatedly taking max() and removing the winner.
        ranked_tags = sorted(
            samples_tags,
            key=lambda tag: tag_frequency_dict.get(tag, 0),
            reverse=True,
        )
        top_tags = ranked_tags[:3]
        while len(top_tags) < 3:                                                # small cluster: reuse the last available tag
            top_tags.append(top_tags[-1])
        most_frequent_tag, second_tag, third_tag = top_tags
        representative_samples_freq[cluster_label] = most_frequent_tag
        representative_samples_freq_2nd[cluster_label] = second_tag
        representative_samples_freq_3nd[cluster_label] = third_tag

        # Spread of the cluster around its nearest-to-centre tag.
        distances_to_representative = cdist(samples_2d, [samples_2d[nearest_sample_idx]])
        cluster_means.append(np.mean(distances_to_representative))
        cluster_variances.append(np.var(distances_to_representative))
        cluster_sizes.append(len(samples_tags))

        cluster_frames.append(pd.DataFrame(
            {
                'X': samples_2d[:, 0],
                'Y': samples_2d[:, 1],
                'Cluster': most_frequent_tag,
                'Tag': samples_tags,
                'Indexes': indexes
            }
        ))

    # Single concat instead of growing the frame inside the loop; the
    # per-cluster row indices are kept as they were.
    if cluster_frames:
        data = pd.concat(cluster_frames)
    else:
        data = pd.DataFrame(columns=['X', 'Y', 'Cluster', 'Tag'])

    cluster_stats = pd.DataFrame({
        'Cluster': unique_labels,
        'Representative sample': [representative_samples[cluster] for cluster in unique_labels],
        'representative_samples_freq': [representative_samples_freq[cluster] for cluster in unique_labels],
        'representative_samples_freq_2nd': [representative_samples_freq_2nd[cluster] for cluster in unique_labels],
        'representative_samples_freq_3nd': [representative_samples_freq_3nd[cluster] for cluster in unique_labels],
        'Mean_Distance': cluster_means,
        'Variance_Distance': cluster_variances,
        'Cluster_Size': cluster_sizes,
    }).set_index('Cluster', drop=True)

    # Cluster quality metrics, all computed on the 2-D projection.
    silhouette_metric = silhouette_score(embeddings_2d, cluster_labels)
    calinski_harabasz_metric = calinski_harabasz_score(embeddings_2d, cluster_labels)
    davies_bouldin_metric = davies_bouldin_score(embeddings_2d, cluster_labels)
    return cluster_stats, silhouette_metric, calinski_harabasz_metric, davies_bouldin_metric, data, len(unique_labels)

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from matplotlib import cm
from matplotlib.colors import to_hex

# Generate a distinct color palette
def generate_distinct_colors(n_clusters):
    """Return hex colour strings from 'tab20' resampled to ``n_clusters`` entries.

    The list has ``cmap.N`` entries, one per entry of the resampled colormap.
    """
    try:
        # cm.get_cmap() is deprecated and removed in recent Matplotlib
        # releases; the colormap registry is the supported replacement.
        from matplotlib import colormaps
        cmap = colormaps['tab20'].resampled(n_clusters)
    except (ImportError, AttributeError):
        # Older Matplotlib without the registry or without .resampled().
        cmap = cm.get_cmap('tab20', n_clusters)  # 'tab20' or 'tab20c' are good options for distinct colors
    return [to_hex(cmap(i)) for i in range(cmap.N)]

In [None]:
def visualize_clusters(data, representative_samples, custom_color_scale):
    """Build an interactive scatter plot of the clustered tags.

    Parameters
    ----------
    data : DataFrame
        One row per tag with the columns 'X', 'Y', 'Cluster' and 'Tag'.
    representative_samples : dict
        Cluster label -> tag name; each named tag gets a white marker outlined
        in a colour from ``custom_color_scale``.
    custom_color_scale : sequence
        Colours used for the clusters and, cyclically, for the outlines.

    Returns
    -------
    The plotly figure (not shown and not saved here).
    """
    axis_titles = {'X': 'Dimension 1', 'Y': 'Dimension 2'}
    fig = px.scatter(
        data,
        x='X',
        y='Y',
        color='Cluster',
        hover_data=['Tag'],
        labels=axis_titles,
        color_discrete_sequence=custom_color_scale,
    )
    # NOTE(review): this only affects traces whose mode is 'markers+text';
    # verify that the scatter traces created above actually match it.
    fig.update_traces(marker=dict(size=2), selector=dict(mode='markers+text'))
    fig.update_layout(showlegend=False)

    # Highlight the representative tag of every cluster.
    for position, sample_tag in enumerate(representative_samples.values()):
        matching_rows = data[data['Tag'] == sample_tag]
        outline_color = custom_color_scale[position % len(custom_color_scale)]
        highlight = go.Scatter(
            x=[matching_rows['X'].values[0]],
            y=[matching_rows['Y'].values[0]],
            mode="markers",
            marker=dict(size=5, color='white', line=dict(width=2, color=outline_color)),
            showlegend=False,
            hoverinfo="text",
        )
        fig.add_trace(highlight)

    fig.update_layout(width=1000, height=800, plot_bgcolor='rgba(255,255,255,255)')
    grid_style = {'gridcolor': 'lightgray', 'zerolinecolor': 'lightgray'}
    fig.update_xaxes(dict(grid_style))
    fig.update_yaxes(dict(grid_style))

    return fig

In [None]:
# Settings passed to hierarchical_clustering() below. n_clusters stays None
# because the tree is cut by distance thresholds instead of a fixed count.
n_clusters=None
metric= 'euclidean'  #  'cosine'
linkage_= 'ward'       #'complete' 'average'
full_tree = True

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted agglomerative model and return its linkage matrix.

    ``model`` must expose ``children_``, ``distances_`` and ``labels_``.
    Extra keyword arguments are forwarded to ``dendrogram``. The returned
    matrix has one row per merge: the two merged node ids, the merge distance
    and the number of original samples below the new node.
    """
    n_samples = len(model.labels_)
    merges = model.children_

    # Number of original samples under each merge. A child id below n_samples
    # is a leaf (one sample); any other id refers to an earlier merge, whose
    # count has already been filled in.
    counts = np.zeros(merges.shape[0])
    for merge_idx, children in enumerate(merges):
        counts[merge_idx] = sum(
            1 if child < n_samples else counts[child - n_samples]
            for child in children
        )

    linkage_matrix = np.column_stack(
        [merges, model.distances_, counts]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

    return linkage_matrix

In [None]:
# Build the complete merge tree over the tag embeddings, draw its dendrogram
# and keep the linkage matrix; the next section cuts that matrix at several heights.
_, model_Agg = hierarchical_clustering(                                             # Perform hierarchical clustering
        embeddings = node_embeddings,
        # embeddings = embeddings,
        n_clusters = n_clusters,
        metric = metric,
        dist_threshold = 0,                                                     # setting distance_threshold=0 ensures we compute the full tree.
        linkage = linkage_,
        full_tree = full_tree
    )

plt.figure(figsize=(10, 8))
plt.title("Hierarchical Clustering Dendrogram")
linkage_matrix = plot_dendrogram(model_Agg)
# Hide the x tick labels and axis label of the dendrogram.
plt.xticks([])
plt.xlabel('')
plt.savefig(f"/content/drive/MyDrive/SE-PQA/n2v_old_Dendogram.tiff", dpi=300)
plt.show()

In [None]:
# Make sure node_embeddings is a NumPy array, then echo it as the cell output.
node_embeddings=np.array(node_embeddings)
node_embeddings

In [None]:
tag_frequency = Counter(tag for post in all_posts for tag in post)                  # Step 1: Count how often each tag occurs across all posts
tag_frequency_dict = dict(tag_frequency)                                        # Step 2: Store the counts in a plain dict (tag -> frequency)

In [None]:
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
pd.options.display.max_rows = 200
thresholds = [60, 35, 17]                                                       # Dendrogram cut heights, one per hierarchy level (coarse to fine)

all_cluster_labels, representative_samples, Whole_data = [], {}, []             # Per-level cluster labels, (level, cluster id) -> name, per-level point tables

embeddings_2d = perform_tsne(embeddings=node_embeddings, perplexity=30)         # 2-D projection shared by all levels

# When the most frequent tag of a cluster already names a cluster of a coarser
# level, the name is taken from this column instead (levels without an entry
# always use the most frequent tag).
fallback_columns = {
    1: 'representative_samples_freq_2nd',
    2: 'representative_samples_freq_3nd',
}

for level, threshold in enumerate(thresholds):
    print(f"Threshold: {threshold}")

    # Cut the dendrogram into clusters at the current threshold
    cluster_labels = fcluster(linkage_matrix, t=threshold, criterion='distance')
    all_cluster_labels.append(cluster_labels)

    (cluster_stats, silhouette_metric,
     calinski_harabasz_metric, davies_bouldin_metric,
     data, num_classes) = extract_cluster_statistics(
        embeddings_2d=embeddings_2d,
        cluster_labels=cluster_labels,
        tags=list(G.nodes()), #<==
        tag_frequency_dict=tag_frequency_dict
    )
    Whole_data.append(data)

    # Names already given to clusters of the coarser levels.
    names_used_above = {
        value
        for (key_level, _), value in representative_samples.items()
        if key_level < level
    }
    fallback_column = fallback_columns.get(level)
    for c_id, rep_name in cluster_stats['representative_samples_freq'].items():
        if fallback_column is not None and rep_name in names_used_above:
            representative_samples[(level, c_id)] = cluster_stats[fallback_column][c_id]
        else:
            representative_samples[(level, c_id)] = rep_name

    print("Cluster Statistics:")
    display(cluster_stats)
    print("Silhouette Metric:", silhouette_metric)
    print("Calinski Harabasz Metric:", calinski_harabasz_metric)
    print("Davies Bouldin Metric:", davies_bouldin_metric)

    # Visualize clusters three times, each with a different palette. The
    # palettes are built lazily, right before the plot that uses them.
    palette_builders = [
        ("", lambda: px.colors.sample_colorscale("Turbo", [n/num_classes for n in range(num_classes)])),
        ("-b", lambda: px.colors.sequential.Turbo),
        ("-c", lambda: generate_distinct_colors(num_classes)),
    ]
    for file_suffix, build_palette in palette_builders:
        custom_color_scale = build_palette()
        fig = visualize_clusters(
            data=data,
            representative_samples=cluster_stats['representative_samples_freq'].to_dict(),
            custom_color_scale=custom_color_scale
        )
        file_path = f"/content/drive/MyDrive/SE-PQA/n2v_old_clustering_level{level}{file_suffix}.html"
        pio.write_html(fig, file_path)
        fig.show()

In [None]:
# One row per tag: the names of the clusters it belongs to at the three
# levels, followed by the tag itself. A (level, cluster id) pair without a
# stored name yields None.
hierarchical_data = [
    [
        representative_samples.get((level, all_cluster_labels[level][tag_idx]))
        for level in range(3)
    ] + [tag]
    for tag_idx, tag in enumerate(G.nodes())
]

print("Hierarchical DataFrame:")
hierarchical_df = pd.DataFrame(hierarchical_data, columns=["Level 1", "Level 2", "Level 3", "Tag"])
display(hierarchical_df)

In [None]:
# Checkpoint: persist the tag -> (Level 1, Level 2, Level 3) table to Drive.
hierarchical_df_file_name = f"/content/drive/MyDrive/SE-PQA/n2v_old_hierarchical_df.pkl"
hierarchical_df.to_pickle(hierarchical_df_file_name)

In [None]:
# Resume point: reload the tag hierarchy table written by the previous cell and display it.
hierarchical_df_file_name = f"/content/drive/MyDrive/SE-PQA/n2v_old_hierarchical_df.pkl"
hierarchical_df = pd.read_pickle(hierarchical_df_file_name)
hierarchical_df

In [None]:
!pip install --upgrade numpy

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the final balanced pair dataset produced in the data-preparation section.
data_path_LLama = f"/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500.pkl"
MyData_LLama = pd.read_pickle(data_path_LLama)
MyData_LLama.head(3)

In [None]:
# Tag-generation table: the body of the first question as 'text' and its tags
# as one ', '-separated string (the angle brackets of the raw field are dropped).
TG_Data = (
    MyData_LLama[['body_Q1', 'tags_Q1']]
    .rename(columns={'body_Q1': 'text', 'tags_Q1': 'tags'})
)
TG_Data['tags'] = TG_Data['tags'].str.findall(r"<(.*?)>").apply(', '.join)
display(TG_Data)

In [None]:
# Reload the tag hierarchy table so this section can run without re-clustering.
hierarchical_df_file_name = f"/content/drive/MyDrive/SE-PQA/n2v_old_hierarchical_df.pkl"
hierarchical_df = pd.read_pickle(hierarchical_df_file_name)

In [None]:
# Step 1: Map every tag to the "Level 1, Level 2, Level 3" names of its clusters
tag_mapping = {
    tag: ', '.join([level_1, level_2, level_3])
    for level_1, level_2, level_3, tag in zip(
        hierarchical_df['Level 1'],
        hierarchical_df['Level 2'],
        hierarchical_df['Level 3'],
        hierarchical_df['Tag'],
    )
}


def _map_tag(tag):
    """Return the hierarchy string of ``tag``; unknown tags (or an empty mapping) keep the tag itself."""
    mapped = tag_mapping.get(tag, tag)
    return mapped if len(mapped) != 0 else tag


def _order_newtags(row):
    """Deduplicate the mapped tags of one row, most frequent first; ties keep first-appearance order."""
    tokens = row['newtags'].split(', ')
    frequencies = row['tag_frequencies']
    ranked = sorted(
        set(tokens),
        key=lambda tag: (frequencies.get(tag, 0), -tokens.index(tag)),
        reverse=True,
    )
    return ', '.join(ranked)


# Step 2: Replace every original tag of every post by its hierarchy string
oldtags_list = []
result_tags_list = []
for tags_str in TG_Data['tags']:
    tags = tags_str.split(', ')
    oldtags_list.append(', '.join(tags))
    result_tags_list.append(', '.join(_map_tag(tag) for tag in tags))
tags_with_duplicates_list = list(result_tags_list)                              # same strings, kept before deduplication

# Step 3: Assemble the result table
result_dataframe = pd.DataFrame({
    'text': TG_Data['text'],
    'oldtags': oldtags_list,
    'newtags': result_tags_list,
    'tags_with_duplicates': tags_with_duplicates_list
})

# How often each mapped tag occurs within a post
tag_frequencies_list = [dict(Counter(tags.split(', '))) for tags in result_tags_list]
result_dataframe['tag_frequencies'] = tag_frequencies_list

# Deduplicate and order the mapped tags of every post
result_dataframe['newtags'] = result_dataframe.apply(_order_newtags, axis=1)

# Reorder the columns as per the desired output
result_dataframe = result_dataframe[['text', 'oldtags', 'newtags', 'tags_with_duplicates', 'tag_frequencies']]

# Save and display the result
TG_Data_After_HieClustering_file_name = f"/content/drive/MyDrive/SE-PQA/n2v_old_TG_Data_After_HieClustering.pkl"
result_dataframe.to_pickle(TG_Data_After_HieClustering_file_name)
display(result_dataframe)


In [None]:
from sklearn.model_selection import train_test_split

# Reload the clustered tag data saved on Drive and split it 80/20 with a fixed seed.
MyData = pd.read_pickle(f"/content/drive/MyDrive/SE-PQA/n2v_old_TG_Data_After_HieClustering.pkl")
len_tr = int(0.8 * MyData.shape[0])  # not passed to train_test_split below
len_te = int(0.2 * MyData.shape[0])  # absolute number of test rows
train_data, test_data = train_test_split(MyData,
                                          test_size=len_te,
                                          random_state=42)
# Alternative kept for reference: use the whole dataset for both splits.
# train_data = MyData
# test_data = MyData

In [None]:
# Load the tokenizer from the local model directory on Drive, padding on the left.
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/llama-2-7b-chat-hf",
    padding_side='left'
)
# Use token id 0 as the padding id.
# NOTE(review): id 0 is presumably the <unk> token of this vocabulary — confirm.
tokenizer.pad_token_id = 0

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcol
import numpy as np

# RGB colours (components scaled to 0-1) for the train and test series in the plots.
color_train = (255 / 255, 0 / 255, 0 / 255)
color_test = (0 / 255, 176 / 255, 240 / 255)

def count_tokens_in_tags(tags):
    """Split a ', '-separated tag string into a list of tags.

    Despite its name, this returns the list of tags, not a count.
    """
    return tags.split(', ')

def y_axis_formatter(x, pos):
    """Format a y tick value in units of one hundred, truncated toward zero.

    `pos` is accepted for the tick-formatter call signature and is not used.
    """
    hundreds = int(x / 100)
    return str(hundreds)

def prepare_data(data, tokenizer, column='oldtags'):
    """Compute per-sample tag and token counts for one tag column.

    Two helper columns are added to `data` in place: `token_<column>` (number of
    tokens `tokenizer.tokenize` yields for the tag string) and `num_<column>`
    (number of ', '-separated tags).

    Args:
        data: DataFrame holding the tag strings in `column`.
        tokenizer: object exposing `tokenize(str) -> sequence`.
        column: name of the tag column to analyse (default 'oldtags').

    Returns:
        (tag_count_distribution, token_counts): a Series mapping "number of
        tags" to "number of samples", sorted by index, and the per-sample
        token-count Series.
    """
    token_col = f'token_{column}'
    count_col = f'num_{column}'

    data[token_col] = data[column].apply(lambda tag_str: len(tokenizer.tokenize(tag_str)))
    data[count_col] = data[column].apply(lambda tag_str: len(tag_str.split(', ')))

    tag_count_distribution = data[count_col].value_counts().sort_index()
    return tag_count_distribution, data[token_col]


def plot_combined(train_data, test_data, tokenizer):
    """Plot tag-count and token-count statistics of the train and test splits.

    Left panel: grouped bars showing how many samples have each number of tags,
    each bar annotated with its share of its own split. Right panel: notched
    horizontal box plots of the per-sample token counts with the means marked.
    The figure is saved as a 300-dpi TIFF on Drive and then shown.

    Args:
        train_data: training DataFrame (passed to `prepare_data`, which adds columns).
        test_data: test DataFrame (same treatment).
        tokenizer: tokenizer handed to `prepare_data`.
    """
    train_oldtags, train_oldtags_tokens = prepare_data(train_data, tokenizer)
    test_oldtags, test_oldtags_tokens = prepare_data(test_data, tokenizer)
    fig, axes = plt.subplots(1, 2, figsize=(10, 3), sharey=False)
    bar_width = 0.5
    overlap_offset = bar_width * 0.25

    # Align the test counts to the train x positions. reindex() fills tag counts
    # missing from the test split with 0, whereas .loc with a missing label
    # raises KeyError. Counts that occur only in the test split are not drawn.
    aligned_test_oldtags = test_oldtags.reindex(train_oldtags.index, fill_value=0)
    x_train = train_oldtags.index - overlap_offset
    x_test = train_oldtags.index + overlap_offset

    bars_train = axes[0].bar(
        x_train,
        train_oldtags.values,
        width=bar_width,
        fill=False,
        hatch="///",
        edgecolor=color_train,
        alpha=0.8,
        label="Train"
    )
    bars_test = axes[0].bar(
        x_test,
        aligned_test_oldtags.values,
        width=bar_width,
        hatch="..",
        color='white',
        edgecolor=color_test,
        label="Test"
    )
    axes[0].set_xlabel('Number of tags')
    axes[0].set_ylabel('Number of samples')

    # The y ticks are printed in hundreds (see y_axis_formatter), hence the x10^2 note.
    axes[0].annotate(
        r"$\times 10^2$",
        xy=(0, 1),
        xycoords='axes fraction',
        xytext=(-10, 5),
        textcoords='offset points',
        ha='left',
        va='center',
        fontsize=9
    )

    # Percentage labels: iterate the same (aligned) values the bars were drawn
    # from so every label sits on its own bar; shares are relative to the full split.
    label_lift = 40  # vertical gap between bar top and label, in data units
    labelled_series = [
        (bars_train, train_oldtags, train_oldtags.sum(), color_train, 0),
        (bars_test, aligned_test_oldtags, test_oldtags.sum(), color_test, 0.15),
    ]
    for bars, counts, total, color, offset in labelled_series:
        for bar, value in zip(bars, counts.values):
            percentage = value / total if total else 0.0
            axes[0].text(
                bar.get_x() + bar.get_width() / 2 + offset,
                bar.get_height() + label_lift,
                f"{percentage:.1%}",
                ha='center',
                va='bottom',
                fontsize=9,
                color=color,
                rotation=90
            )

    token_data = [train_oldtags_tokens.values, test_oldtags_tokens.values]
    box = axes[1].boxplot(
        token_data,
        vert=False,
        patch_artist=True,
        notch=True,
        medianprops=dict(color="black", linewidth=1.5),
        whiskerprops=dict(color=color_train),
        capprops=dict(color=color_train)
    )
    hatches = ['///', '..']
    colors = [color_train, color_test]

    for patch, hatch, color in zip(box['boxes'], hatches, colors):
        patch.set_facecolor('white')
        patch.set_edgecolor(color)
        patch.set_hatch(hatch)

    # Each box owns two whiskers and two caps; recolour them per split.
    for i, color in enumerate(colors):
        box['whiskers'][2 * i].set_color(color)
        box['whiskers'][2 * i + 1].set_color(color)
        box['caps'][2 * i].set_color(color)
        box['caps'][2 * i + 1].set_color(color)

    means = [np.mean(train_oldtags_tokens.values), np.mean(test_oldtags_tokens.values)]
    for y_position, (mean, color) in enumerate(zip(means, colors), start=1):
        # Dashed vertical line from the bottom of the axes up to this split's row.
        axes[1].axvline(
            mean,
            color=color,
            linestyle="--",
            linewidth=1,
            ymin=0,
            ymax=y_position / len(token_data)
        )

        axes[1].text(
            mean + 0.3,
            y_position + 0.2,
            f"Mean: {mean:.1f}",
            va="center",
            ha="left",
            fontsize=9,
            color=color
        )

    axes[1].set_xlabel('Number of tokens')

    fig.legend(
        loc='upper center',
        bbox_to_anchor=(0.5, 1.05),
        ncol=2,
        fontsize=10,
        edgecolor='black'
    )

    axes[0].yaxis.set_major_formatter(y_axis_formatter)
    for ax in axes:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

    # Draw small arrow heads at the ends of the remaining left and bottom spines.
    for ax in axes:
        x_min, x_max = ax.get_xlim()
        y_min, y_max = ax.get_ylim()
        ax.annotate('', xy=(x_min, y_max), xytext=(x_min, y_max - (y_max - y_min) * 0.02),
                    arrowprops=dict(facecolor='black', arrowstyle='-|>'))
        ax.annotate('', xy=(x_max, y_min), xytext=(x_max - (x_max - x_min) * 0.02, y_min),
                    arrowprops=dict(facecolor='black', arrowstyle='-|>'))

    plt.tight_layout(rect=[0, 0, 1, 0.98])
    plt.savefig(
        "/content/drive/MyDrive/SE-PQA/Data_statistics(Combined).tiff",
        format='tiff', dpi=300, bbox_inches='tight'
    )
    plt.show()

# Render and save the statistics figure for the train/test splits created above.
plot_combined(train_data, test_data, tokenizer)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcol
import numpy as np

# RGB colours (components scaled to 0-1) for the train and test series in the plots.
color_train = (255 / 255, 0 / 255, 0 / 255)
color_test = (0 / 255, 176 / 255, 240 / 255)

def count_tokens_in_tags(tags):
    """Split a ', '-separated tag string into a list of tags.

    Despite its name, this returns the list of tags, not a count.
    """
    return tags.split(', ')


def y_axis_formatter(x, pos):
    """Format a y tick value in units of one hundred, truncated toward zero.

    `pos` is accepted for the tick-formatter call signature and is not used.
    """
    hundreds = int(x / 100)
    return str(hundreds)

def prepare_data(data, tokenizer, column='newtags'):
    """Compute per-sample tag and token counts for one tag column.

    Two helper columns are added to `data` in place: `token_<column>` (number of
    tokens `tokenizer.tokenize` yields for the tag string) and `num_<column>`
    (number of ', '-separated tags).

    Args:
        data: DataFrame holding the tag strings in `column`.
        tokenizer: object exposing `tokenize(str) -> sequence`.
        column: name of the tag column to analyse (default 'newtags').

    Returns:
        (tag_count_distribution, token_counts): a Series mapping "number of
        tags" to "number of samples", sorted by index, and the per-sample
        token-count Series.
    """
    token_col = f'token_{column}'
    count_col = f'num_{column}'

    data[token_col] = data[column].apply(lambda tag_str: len(tokenizer.tokenize(tag_str)))
    data[count_col] = data[column].apply(lambda tag_str: len(tag_str.split(', ')))

    tag_count_distribution = data[count_col].value_counts().sort_index()
    return tag_count_distribution, data[token_col]


def plot_combined(train_data, test_data, tokenizer):
    """Plot tag-count and token-count statistics of the train and test splits.

    Left panel: grouped bars showing how many samples have each number of tags,
    each bar annotated with its share of its own split. Right panel: notched
    horizontal box plots of the per-sample token counts with the means marked.
    The figure is saved as a 300-dpi TIFF on Drive and then shown.

    Args:
        train_data: training DataFrame (passed to `prepare_data`, which adds columns).
        test_data: test DataFrame (same treatment).
        tokenizer: tokenizer handed to `prepare_data`.
    """
    train_newtags, train_newtags_tokens = prepare_data(train_data, tokenizer)
    test_newtags, test_newtags_tokens = prepare_data(test_data, tokenizer)
    fig, axes = plt.subplots(1, 2, figsize=(10, 3), sharey=False)
    bar_width = 0.5
    overlap_offset = bar_width * 0.25

    # Align the test counts to the train x positions; tag counts missing from the
    # test split become 0. Counts that occur only in the test split are not drawn.
    aligned_test_newtags = test_newtags.reindex(train_newtags.index, fill_value=0)
    x_train = train_newtags.index - overlap_offset
    x_test = train_newtags.index + overlap_offset

    bars_train = axes[0].bar(
        x_train,
        train_newtags.values,
        width=bar_width,
        fill=False,
        hatch="///",
        edgecolor=color_train,
        alpha=0.8,
        label="Train"
    )
    bars_test = axes[0].bar(
        x_test,
        aligned_test_newtags.values,
        width=bar_width,
        hatch="..",
        color='white',
        edgecolor=color_test,
        label="Test"
    )
    axes[0].set_xlabel('Number of tags')
    axes[0].set_ylabel('Number of samples')
    axes[0].set_xticks(np.arange(1, 13))
    axes[0].set_xticklabels(np.arange(1, 13), ha='right')

    # The y ticks are printed in hundreds (see y_axis_formatter), hence the x10^2 note.
    axes[0].annotate(
        r"$\times 10^2$",
        xy=(0, 1),
        xycoords='axes fraction',
        xytext=(-10, 5),
        textcoords='offset points',
        ha='left',
        va='center',
        fontsize=9
    )

    # Percentage labels: iterate the same (aligned) values the bars were drawn
    # from so every label sits on its own bar; shares are relative to the full split.
    label_lift = 150  # vertical gap between bar top and label, in data units
    labelled_series = [
        (bars_train, train_newtags, train_newtags.sum(), color_train, 0),
        (bars_test, aligned_test_newtags, test_newtags.sum(), color_test, 0.3),
    ]
    for bars, counts, total, color, offset in labelled_series:
        for bar, value in zip(bars, counts.values):
            percentage = value / total if total else 0.0
            axes[0].text(
                bar.get_x() + bar.get_width() / 2 + offset,
                bar.get_height() + label_lift,
                f"{percentage:.1%}",
                ha='center',
                va='bottom',
                fontsize=9,
                color=color,
                rotation=90
            )

    token_data = [train_newtags_tokens.values, test_newtags_tokens.values]
    box = axes[1].boxplot(
        token_data,
        vert=False,
        patch_artist=True,
        notch=True,
        medianprops=dict(color="black", linewidth=1.5),
        whiskerprops=dict(color=color_train),
        capprops=dict(color=color_train)
    )
    hatches = ['///', '..']
    colors = [color_train, color_test]

    for patch, hatch, color in zip(box['boxes'], hatches, colors):
        patch.set_facecolor('white')
        patch.set_edgecolor(color)
        patch.set_hatch(hatch)

    # Each box owns two whiskers and two caps; recolour them per split.
    for i, color in enumerate(colors):
        box['whiskers'][2 * i].set_color(color)
        box['whiskers'][2 * i + 1].set_color(color)
        box['caps'][2 * i].set_color(color)
        box['caps'][2 * i + 1].set_color(color)

    means = [np.mean(train_newtags_tokens.values), np.mean(test_newtags_tokens.values)]
    for y_position, (mean, color) in enumerate(zip(means, colors), start=1):
        # Dashed vertical line from the bottom of the axes up to this split's row.
        axes[1].axvline(
            mean,
            color=color,
            linestyle="--",
            linewidth=1,
            ymin=0,
            ymax=y_position / len(token_data)
        )
        axes[1].text(
            mean + 0.2,
            y_position + 0.2,
            f"Mean: {mean:.1f}",
            va="center",
            ha="left",
            fontsize=9,
            color=color
        )

    axes[1].set_xlabel('Number of tokens')

    fig.legend(
        loc='upper center',
        bbox_to_anchor=(0.5, 1.05),
        ncol=2,
        fontsize=10,
        edgecolor='black'
    )

    axes[0].yaxis.set_major_formatter(y_axis_formatter)

    for ax in axes:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

    # Draw small arrow heads at the ends of the remaining left and bottom spines.
    for ax in axes:
        x_min, x_max = ax.get_xlim()
        y_min, y_max = ax.get_ylim()
        ax.annotate('', xy=(x_min, y_max), xytext=(x_min, y_max - (y_max - y_min) * 0.02),
                    arrowprops=dict(facecolor='black', arrowstyle='-|>'))
        ax.annotate('', xy=(x_max, y_min), xytext=(x_max - (x_max - x_min) * 0.02, y_min),
                    arrowprops=dict(facecolor='black', arrowstyle='-|>'))

    plt.tight_layout(rect=[0, 0, 1, 0.98])
    plt.savefig(
        "/content/drive/MyDrive/SE-PQA/Data_statistics(Combined)2.tiff",
        format='tiff', dpi=300, bbox_inches='tight'
    )
    plt.show()
# Render and save the statistics figure for the train/test splits.
plot_combined(train_data, test_data, tokenizer)


# Train LLaMA-2 (TG)


In [None]:
# Install the training/evaluation dependencies into the Colab runtime.
# The trailing "# ==x.y.z" notes record versions but are not applied as pins.
!pip install -q -U transformers         # ==4.31.0
!pip install -q torchmetrics
!pip install -q pytorch_lightning
!pip install -q bitsandbytes
!pip install -q -U peft                    # ==0.4.0
!pip install -q accelerate              # ==0.21.0
!pip install -q trl
!pip install -q tensorboard
!pip install -q datasets
!pip install -q rouge
!pip install -q bert-score

In [None]:
import os
import gc
import re
import torch
import warnings
import nltk
import json
import time
import requests
# Fetch the NLTK 'punkt' resource (presumably needed by word_tokenize, imported below).
nltk.download('punkt')

import numpy as np
import pandas as pd
import bitsandbytes as bnb
import pytorch_lightning as pl
import matplotlib.pyplot as plt

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Callback
from tensorboard import notebook

from torchmetrics import MetricCollection
from torchmetrics.text.bert import BERTScore
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.classification import (
    BinaryAccuracy,
    BinaryPrecision,
    BinaryRecall,
    BinaryF1Score
    )

from peft import (
    TaskType,
    PeftModel,
    PeftConfig,
    LoraConfig,
    get_peft_model,
    AutoPeftModelForCausalLM,
    prepare_model_for_kbit_training,
    )

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    HfArgumentParser,
    TrainingArguments,
    )

from dataclasses import dataclass, field
from nltk.tokenize import word_tokenize
from typing import Optional
from tqdm import tqdm
from bert_score import BERTScorer
from rouge import Rouge
from statistics import mean
from sklearn.model_selection import train_test_split

# Register tqdm's pandas integration and silence all Python warnings for the session.
tqdm.pandas()
warnings.filterwarnings('ignore')
# Print the installed transformers version for the record.
import transformers
print(transformers.__version__)

In [None]:
# Opening/closing markers that get_tg_prompt wraps around the system prompt
# (B_SYS/E_SYS) and around the whole instruction (B_INST/E_INST).
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
B_INST, E_INST = "[INST]", "[/INST]"

In [None]:
def get_tg_prompt(_question, _tags = None):
  """Build the tag-generation prompt for one question.

  The system and user messages are wrapped in the B_INST/B_SYS/E_SYS/E_INST
  markers. When `_tags` is truthy, the tags followed by '</s>' are appended so
  the result can serve as a full training example; otherwise the prompt ends
  after the instruction block.
  """
  system_message = 'You are a Tag Generator. Respond only with a list of tags; do not include any additional text or explanations.'
  user_message = f'''Please generate at least 5 tags for the provided question. Tags can include multi-word phrases if appropriate and should help hierarchically categorize the question's topics.
### Question:
{_question}
### Tags:
'''
  pieces = [f"{B_INST} {B_SYS}{system_message}{E_SYS}{user_message} {E_INST}\n\n"]
  if _tags:
    pieces.append(f'{_tags}</s>')
  return ''.join(pieces)

In [None]:
# Number of token ids the fixed part of the prompt (without any question text) occupies.
# NOTE(review): MyModel is only created in a later cell, so this cell fails on a
# fresh top-to-bottom run — run it after the model has been instantiated.
len(MyModel.tokenizer('''You are a Tag Generator. Respond only with a list of tags; do not include any additional text or explanations.
  Please generate at least 5 tags for the provided question. Tags can include multi-word phrases if appropriate and should help hierarchically categorize the question's topics.
### Question:
### Tags:
''')['input_ids'])

In [None]:
def get_response_index(_input_ids, _task, _marker_token_id = 29937):
  """Locate where the model response starts inside a tokenized prompt.

  The prompt is scanned for occurrences of `_marker_token_id`; depending on the
  task, the n-th occurrence is selected and a fixed number of tokens after it
  is skipped.

  Args:
    _input_ids: iterable of token ids (list or 1-D tensor).
    _task: one of 'RQE', 'SUM', 'TG'.
    _marker_token_id: token id to search for (default 29937).
      # presumably the '#' piece of the "###" section headers — verify against the tokenizer

  Returns:
    Position of the selected marker plus the task's skip count. When the prompt
    holds too few markers: 0 for 'RQE', -1 for the other tasks.

  Raises:
    ValueError: if `_task` is not a known task name.
  """
  # task -> (which marker occurrence to use, tokens to skip after it)
  task_settings = {
      'RQE': (2, 10),
      'SUM': (1, 11),
      'TG': (1, 10),
  }
  if _task not in task_settings:
    raise ValueError(f"Unknown task {_task!r}; expected one of {sorted(task_settings)}")
  _index, _skip_tokens = task_settings[_task]

  # Single pass over the ids; the positions are reused for the lookup below.
  marker_positions = [i for i, n in enumerate(_input_ids) if n == _marker_token_id]
  if len(marker_positions) > _index:
    return marker_positions[_index] + _skip_tokens
  if _task == 'RQE':
    return 0
  return -1

In [None]:
def generate_prompt(data, tokenizer, is_eval, max_question_tokens = 380):
  """Build the tag-generation prompt for one data row.

  The question text is tokenized, cut to its first `max_question_tokens` token
  ids and decoded back to text before it is placed in the prompt.

  Args:
    data: row-like object with a 'text' entry (and 'tags' when not evaluating).
    tokenizer: tokenizer used to truncate the question.
    is_eval: when true, the prompt ends after the instruction; otherwise the
      row's tags are appended as the expected answer.
    max_question_tokens: cap on the number of question token ids kept (default 380).

  Returns:
    The prompt string produced by `get_tg_prompt`.
  """
  question_ids = tokenizer(data['text'])['input_ids'][:max_question_tokens]
  q1 = tokenizer.decode(question_ids,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=True)
  if is_eval:
    return get_tg_prompt(q1)
  return get_tg_prompt(q1, data['tags'])

In [None]:
@dataclass
class ScriptArguments:
    """Configuration for fine-tuning and evaluating the tag-generation model.

    The annotations matter: the class is handed to HfArgumentParser, which
    derives the command-line argument types from them.
    """
    # ##########################################################################
    #                             Configuration
    # ##########################################################################
    model_name: Optional[str] = field(
        default = f"/content/drive/MyDrive/llama-2-7b-chat-hf",
        metadata = {"help": "The model that you want to train from the Hugging Face hub."}
      )
    adapter_name: Optional[str] = field(
        default = "LLama-TG",
        metadata = {"help": "The adapter name saved in the HuggingFace hub."}
      )
    save_to: Optional[str] = field(
        default = "Drive",                                                      # Save to "Hub", or "Drive", or "Both"
        metadata = {"help": "Determine where to save Adapters"}
      )
    # ##########################################################################
    #                         Logs and Checkpoints
    # ##########################################################################
    logging_steps: Optional[int] = field(
        default = 1,
        metadata = {"help": "log every X update steps"}
      )
    # No trailing slash: callers append suffixes such as 'logs' directly to this string.
    output_dir: Optional[str] = field(
        default = "/content/SE-PQA",
        metadata = {"help": "the output directory for both logs and checkpoints"}
      )
    every_n_epochs : Optional[int] = field(
        default = 1,
        metadata = {"help": "Save checkpoints every X epochs"}
      )
    save_on_train_epoch_end: Optional[bool] = field(
        default = None,
        metadata = {"help": "Whether to run checkpointing at the end of training epochs or validation"}
      )
    total_num_samples: Optional[str] = field(
        default = 'All',                                                        # Use {your desired number of samples} or 'All'
        metadata = {"help": "Number of samples to be selected from the whole dataset"}
      )
    # ##########################################################################
    #                             Hyper-parameters
    # ##########################################################################
    max_epochs: Optional[int] = field(
        default = 10,
        metadata = {"help": "maximum number of training epochs."}
      )
    learning_rate: Optional[float] = field(
        default = 1e-4,
        metadata = {"help": "the learning rate"}
      )
    gradient_accumulation_steps: Optional[int] = field(
        default = 8,
        metadata = {"help": "the number of gradient accumulation steps"}
      )
    gradient_checkpointing: Optional[bool] = field(
        default = True,
        metadata = {"help": "Enables gradient checkpointing."}
      )
    per_device_train_batch_size: Optional[int] = field(
        default = 4,
        metadata = {"help": "batch_size of training (per device)"}
      )
    per_device_eval_batch_size: Optional[int] = field(
        default = 1,
        metadata = {"help": "batch_size of validation (per device)"}
      )
    max_seq_length: Optional[int] = field(
        default = 512,
        metadata = {"help": "maximum input sequence length"}
      )
    trust_remote_code: Optional[bool] = field(
        default = True,
        metadata = {"help": '''Enable `trust_remote_code` so that it
        will execute code present on the Hub on your local machine'''}
      )
    # NOTE(review): the default is a 3-tuple although the annotation says float;
    # left unchanged because the argument parser's handling of tuple types is unverified.
    split_ratio: Optional[float] = field(
        default = (0.8, 0.2, 0),
        metadata = {"help": "train/test/validation splits"}
      )
    precision: Optional[int] = field(
        default = 16,
        metadata = {"help": "train with 16/32/bf16 precision."}
      )
    # A count of batches, so it is an int (it was annotated as float).
    num_sanity_val_steps: Optional[int] = field(
        default = 0,
        metadata = {"help": "number of validation batches before the first training epoch"}
      )
    max_new_tokens: Optional[int] = field(
        default = 30,
        metadata = {"help": "the maximum number of new tokens in the generated sequences (test step)"}
      )
    # ##########################################################################
    #                             Lora Configuration
    # ##########################################################################
    use_peft: Optional[bool] = field(
        default = True,
        metadata = {"help": "Wether to use PEFT or not to train adapters"}
      )
    lora_r: Optional[int] = field(
        default = 64,
        metadata = {"help": "the r parameter of the LoRA adapters"}
      )
    lora_alpha: Optional[int] = field(
        default = 64,
        metadata = {"help": "the alpha parameter of the LoRA adapters"}
      )
    # A rate such as 0.1, so it is a float (it was annotated as int).
    lora_dropout: Optional[float] = field(
        default = 0.1,
        metadata = {"help": "the dropout rate of the LoRA adapters"}
      )
    # ##########################################################################
    #                                 BitsAndBytes
    # ##########################################################################
    load_in_8bit: Optional[bool] = field(
        default = False,
        metadata = {"help": "load the model in 8 bits precision"}
      )
    load_in_4bit: Optional[bool] = field(
        default = False,
        metadata = {"help": "load the model in 4 bits precision"}
      )
    use_nested_quant: Optional[bool] = field(
        default = False,
        metadata = {"help": "Activate nested quantization for 4bit base models"}
      )
    bnb_4bit_compute_dtype: Optional[str] = field(
        default = "float16",
        metadata = {"help": "Compute dtype for 4bit base models"}
      )
    bnb_4bit_quant_type: Optional[str] = field(
        default = "nf4",
        metadata = {"help": "Quantization type fp4 or nf4"}
      )

# Build the arguments from the dataclass defaults (plus any recognised CLI flags).
# return_remaining_strings=True is passed and only the first returned element is
# kept — presumably the populated ScriptArguments instance.
parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)[0]
# Fix the random seed to 42 through Lightning's seeding helper.
pl.seed_everything(42)

In [None]:
class OverrideEpochStepCallback(Callback):
    """Log the 1-based epoch number under the key "step" at the end of every
    train, validation and test epoch."""

    def _log_step_as_current_epoch(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        # current_epoch is zero-based; the logged value starts at 1.
        epoch_number = trainer.current_epoch + 1
        pl_module.log("step", epoch_number)

    def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

    def on_validation_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

    def on_test_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

# Checkpoint callback configured with the epoch interval from script_args.every_n_epochs.
checkpoint_callback = ModelCheckpoint(every_n_epochs=script_args.every_n_epochs)

In [None]:
class TGModel(pl.LightningModule):
    """Lightning wrapper around a causal language model fine-tuned for tag generation.

    The base model is loaded in 4-bit, 8-bit or bfloat16 depending on the
    arguments, optionally wrapped with LoRA adapters, and its adapter weights
    are written to Drive at the end of every training epoch.
    """

    def __init__(self, script_args):
        super(TGModel, self).__init__()
        self.save_hyperparameters()
        self.Setup(script_args)
        self.rouge = ROUGEScore()
        self.adapter_name = script_args.adapter_name
        # Kept on the module so configure_optimizers uses the arguments this
        # model was built with rather than a notebook-level global.
        self.learning_rate = script_args.learning_rate
        self.epoch_n = 1  # 1-based epoch counter used in the adapter folder name

    def Setup(self, script_args):
        """Load the base model and tokenizer and attach LoRA adapters if requested.

        Raises:
            ValueError: if both 4-bit and 8-bit loading are requested.
        """
        if script_args.load_in_4bit and script_args.load_in_8bit:
            raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")
        elif script_args.load_in_4bit:
            # Imported here because the notebook's import cell does not bring
            # BitsAndBytesConfig into scope.
            from transformers import BitsAndBytesConfig

            compute_dtype = getattr(torch, script_args.bnb_4bit_compute_dtype)

            bnb_config = BitsAndBytesConfig(
                load_in_4bit = script_args.load_in_4bit,
                bnb_4bit_quant_type = script_args.bnb_4bit_quant_type,
                bnb_4bit_compute_dtype = compute_dtype,
                bnb_4bit_use_double_quant = script_args.use_nested_quant,
            )
            # NOTE(review): unlike the 8-bit branch, prepare_model_for_kbit_training
            # is not applied here — confirm whether that is intended.
            self.model = AutoModelForCausalLM.from_pretrained(
                script_args.model_name,
                quantization_config = bnb_config,
                device_map = {"": 0},
            )
        elif script_args.load_in_8bit:
            self.model = AutoModelForCausalLM.from_pretrained(
                script_args.model_name,
                load_in_8bit = True,
                torch_dtype = torch.float16,
                device_map = {"": 0},
            )
            self.model = prepare_model_for_kbit_training(self.model)
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                script_args.model_name,
                torch_dtype = torch.bfloat16,
                device_map = {"": 0},
            )

        if script_args.use_peft:
            lora_config = LoraConfig(
                task_type = TaskType.CAUSAL_LM,
                r = script_args.lora_r,
                lora_alpha = script_args.lora_alpha,
                lora_dropout = script_args.lora_dropout,
                bias = "none",
            )
            self.model = get_peft_model(self.model, lora_config)
            self.model.print_trainable_parameters()

        self.model.config.use_cache = False

        self.tokenizer = AutoTokenizer.from_pretrained(
            script_args.model_name,
            padding_side='left'
        )
        # Token id 0 serves as padding for both the tokenizer and the model config.
        self.tokenizer.pad_token_id = 0
        self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return `(loss, logits)`."""
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
            )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        """Compute and log the loss for one batch."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        loss, _ = self.forward(input_ids, attention_mask, labels)
        self.log('train_loss', loss.item(), on_epoch=True, on_step=True)
        return loss

    def on_train_epoch_end(self):
        """Save the current adapter to Drive in a folder suffixed with the epoch number."""
        out_dir = f"/content/drive/MyDrive/SE-PQA/TG-Adapters/"
        self.model.save_pretrained(out_dir + self.adapter_name + str(self.epoch_n))
        self.epoch_n += 1

    def generate(self, *args, **kwargs):
        """Forward all arguments to the wrapped model's `generate`."""
        return self.model.generate(*args, **kwargs)

    def configure_optimizers(self):
        """Return an AdamW optimizer over the model parameters."""
        return torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate)

In [None]:
# Instantiate the model; constructing TGModel runs its Setup, which loads the weights.
MyModel = TGModel(script_args)
# output_dir has no trailing slash, so the log directory is '<output_dir>logs'
# (e.g. /content/SE-PQAlogs with the default output_dir).
logger = TensorBoardLogger(script_args.output_dir + 'logs', name="TG")

# Print the module structure followed by the underlying model configuration.
print(MyModel)
print("#"*60, "\n\t\t\t Model Configuration\n", "#"*60)
print(MyModel.model.config)

In [None]:
# Load the clustered data, keeping only the question text and the new tags (renamed to 'tags').
MyData = (
    pd.read_pickle(f"/content/drive/MyDrive/SE-PQA/n2v_old_TG_Data_After_HieClustering.pkl")
    [['text', 'newtags']]
    .rename(columns = {'newtags': 'tags'})
)

# Text clean-up steps, applied in order: (pattern, replacement, is_regex).
# The first two strip a leading "Possible Duplicate(s): ..." banner, then every
# '#' is removed ("C#" becomes "C" first) and newlines become spaces.
text_cleanup_steps = [
    (r"^\s*Possible Duplicates?:\s+.*?\s{2,}.*?\s{2,}", "", True),
    (r"^\s*Possible Duplicates?:\s+.*?\s{2,}", "", True),
    ("C#", "C", False),
    (r"#", "", True),
    (r"\n", " ", True),
]
for pattern, replacement, is_regex in text_cleanup_steps:
    MyData["text"] = MyData["text"].str.replace(pattern, replacement, regex=is_regex)

# Optionally restrict the data to the first N rows.
sample_limit = script_args.total_num_samples
if sample_limit != 'All':
  MyData = MyData[:int(sample_limit)]

print(MyData.shape)
display(MyData[0:100])

In [None]:
class TGDataset(torch.utils.data.Dataset):
    """Dataset producing tokenized tag-generation prompts.

    In training mode (`is_eval=False`) each item contains the full prompt plus
    answer, and the labels mask everything before the answer with -100. In
    evaluation mode the input is the prompt only and the labels are the token
    ids of the reference tags followed by '</s>'.
    """

    def __init__(self, data, tokenizer, max_len, is_eval):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_eval = is_eval

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' for row `index`."""
        row_data = self.data.iloc[index]
        prompt = generate_prompt(row_data, self.tokenizer, self.is_eval)
        prompt_encoding = self.tokenizer(
            prompt,
            max_length = self.max_len,
            padding = 'max_length',
            truncation = True,
            add_special_tokens = True,
            return_tensors = 'pt',
        )
        input_ids = prompt_encoding['input_ids'].squeeze()
        attention_mask = prompt_encoding['attention_mask'].squeeze()

        if not self.is_eval:
            response_index = get_response_index(input_ids, 'TG')
            labels = input_ids.clone()
            # get_response_index signals "not found" with -1 (or 0), which must not
            # be used as a length: fall back to masking the whole sample instead.
            if response_index is not None and response_index > 0:
                labels[:response_index] = -100
            else:
                print('response_index not found')
                labels[:] = -100
        else:
            labels = self.tokenizer(
                row_data['tags'] + '</s>',
                add_special_tokens = False,
                return_tensors='pt',
            )
            labels = labels['input_ids'].squeeze()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [None]:
class TGDataModule(pl.LightningDataModule):
    """Data module that splits the data once and serves train/test loaders.

    The split is performed in the constructor (via `setup`) with a fixed random
    state, so repeated calls produce the same partition.
    """

    def __init__(self, data, tokenizer, script_args):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.per_device_train_batch_size = script_args.per_device_train_batch_size
        self.per_device_eval_batch_size = script_args.per_device_eval_batch_size
        self.max_len = script_args.max_seq_length
        # Stored so setup() uses the arguments given to this instance instead of
        # whatever the notebook-level `script_args` currently is.
        self.split_ratio = script_args.split_ratio
        self.setup()

    def setup(self, stage=None):
        """Split `self.data` and build the train (with answers) and test (prompt only) datasets."""
        # Only the test fraction is needed; the remainder becomes the training set.
        len_te = int(self.split_ratio[1] * self.data.shape[0])
        train_data, test_data = train_test_split(self.data,
                                                 test_size=len_te,
                                                 random_state=42)
        train_data = train_data.reset_index(drop=True)
        test_data = test_data.reset_index(drop=True)

        self.train_data = TGDataset(train_data, self.tokenizer, self.max_len, is_eval=False)
        self.test_data = TGDataset(test_data, self.tokenizer, self.max_len, is_eval=True)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return torch.utils.data.DataLoader(
            self.train_data,
            batch_size=self.per_device_train_batch_size,
            shuffle=True,
            num_workers=8,
        )

    def test_dataloader(self):
        """Sequential (unshuffled) loader over the test dataset."""
        return torch.utils.data.DataLoader(
            self.test_data,
            sampler = torch.utils.data.SequentialSampler(self.test_data,),
            batch_size= self.per_device_eval_batch_size,
            num_workers=8
        )

In [None]:
# Truncate over-long prompts from the left so the end of the sequence is kept.
MyModel.tokenizer.truncation_side = 'left'
# Build the data module with the model's own tokenizer; the split happens in its constructor.
DataModule = TGDataModule(
    MyData,
    MyModel.tokenizer,
    script_args
)
print("num train batches", len(DataModule.train_dataloader()))
print("num test batches", len(DataModule.test_dataloader()))

In [None]:
# Inspect the first sample of one training batch: decoded prompt, attention mask, label ids.
for sample in DataModule.train_dataloader():
  print(MyModel.tokenizer.batch_decode(sample['input_ids'])[0])
  # The attention mask holds 0/1 flags, not token ids, so it is printed raw
  # rather than run through the tokenizer's decoder.
  print(sample['attention_mask'][0].tolist())
  print(sample['labels'][0].tolist())
  break

In [None]:
# Inspect one test batch: the raw tensors, then the reference tags decoded from the labels.
for sample in DataModule.test_dataloader():
  print(sample)
  print(MyModel.tokenizer.batch_decode(sample['labels']))
  break

In [None]:
# Configure the Lightning trainer from script_args.
trainer = pl.Trainer(
    logger = logger,
    log_every_n_steps = script_args.logging_steps,
    max_epochs = script_args.max_epochs,
    accumulate_grad_batches = script_args.gradient_accumulation_steps,
    num_sanity_val_steps = script_args.num_sanity_val_steps,
    callbacks = [OverrideEpochStepCallback(), checkpoint_callback],                                  # epoch-number logging + periodic checkpoints
    # output_dir has no trailing slash, so this resolves to '<output_dir>Checkpoints'.
    default_root_dir= script_args.output_dir + 'Checkpoints',
    )

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/ReQuESTlogs

trainer.fit(
    MyModel,
    datamodule=DataModule,
    # ckpt_path = "/content/ReQuESTlogs/TG/version_0/checkpoints/epoch=0-step=1.ckpt"
)

In [None]:
# Copy the local log directory to Drive.
!cp -r /content/SE-PQAlogs /content/drive/MyDrive/SE-PQA/SE-PQAlogs_TG

In [None]:
# Reload the TensorBoard extension and open it on the local log directory.
%reload_ext tensorboard
%tensorboard --logdir /content/SE-PQAlogs

In [None]:
# Save the current model/adapter state to Drive.
# NOTE(review): this writes to .../SE-PQA/LLama-TG10, whereas the loading cell
# reads .../SE-PQA/TG-Adapters/LLama-TG10 (the per-epoch save location).
MyModel.model.save_pretrained(f"/content/drive/MyDrive/SE-PQA/LLama-TG10")

In [None]:
import gc
# Uncomment the objects to drop before reloading the model for inference.
# del tokenizer
# del trainer
# del MyModel
# del fModel
# del BaseModel
# Run the garbage collector and release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Load the base model onto device 0, with an offload folder for the state dict.
BaseModel= AutoModelForCausalLM.from_pretrained(
    f"/content/drive/MyDrive/llama-2-7b-chat-hf",
    device_map={"": 0},
    offload_folder="offload",
    offload_state_dict = True,
    # load_in_8bit = True,
    )

# Attach the adapter saved after epoch 10 and merge it into the base weights.
address = f"/content/drive/MyDrive/SE-PQA/TG-Adapters/LLama-TG10"
print("\n Loading model from ", address, "\n")
config = PeftConfig.from_pretrained(address)  # not referenced again in this section
fModel= PeftModel.from_pretrained(BaseModel,address,device_map={"": 0})
fModel = fModel.merge_and_unload()
print(fModel)
print(fModel.config)
print("\n Model successfully loded from ", address, "\n")


# Recreate the tokenizer with left padding, matching the training setup.
tokenizer = AutoTokenizer.from_pretrained(
    script_args.model_name,
    # add_special_tokens = False,
    padding_side='left'
    )

# Use token id 0 as padding for both tokenizer and model config.
tokenizer.pad_token_id = 0
fModel.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Rebuild the data module with the freshly loaded tokenizer.
# NOTE(review): truncation_side is not set on this tokenizer here, unlike the
# training setup where it was set to 'left' — confirm this is intended.
DataModule = TGDataModule(
    MyData,
    tokenizer,
    script_args
)
print("num train batches", len(DataModule.train_dataloader()))
print("num test batches", len(DataModule.test_dataloader()))

In [None]:
# Inspect the first sample of one test batch: decoded prompt, attention mask, label ids.
for sample in DataModule.test_dataloader():
  # print(sample['input_ids'][0].tolist())
  print(tokenizer.batch_decode(sample['input_ids'])[0])
  # The attention mask holds 0/1 flags, not token ids, so it is printed raw
  # rather than run through the tokenizer's decoder.
  print(sample['attention_mask'][0].tolist())
  print(sample['labels'][0].tolist())
  break

In [None]:
def test_step(test_dl):
    """Generate text for every batch of ``test_dl`` and return the decoded responses.

    Uses the notebook-level ``fModel``, ``tokenizer``, ``script_args`` and
    ``get_response_index``. The generated ids are squeezed and decoded as a
    single sequence, so the loader is expected to yield one sample per batch.
    Generation samples (``do_sample=True``), so outputs vary between runs.

    Args:
        test_dl: iterable of batches holding 'input_ids' and 'attention_mask' tensors.

    Returns:
        list[list[str]]: one single-element list per batch, containing the text
        decoded from the response start index to the end of the sequence.
    """
    results = []

    for batch in test_dl:
        # Only the prompt tensors go to the GPU; the reference labels are not
        # used during generation.
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()

        generated_txts_ids = fModel.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=script_args.max_new_tokens,
            do_sample=True,
            temperature=0.97,
        ).squeeze()

        # Drop the prompt part; get_response_index returns -1 when the marker
        # is missing, in which case only the last token is decoded.
        response_start_idx = get_response_index(generated_txts_ids, 'TG')
        single_generated_txt = tokenizer.decode(
            generated_txts_ids[response_start_idx:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )
        results.append([single_generated_txt])
    return results


In [None]:
# Switch to inference mode and generate outputs for the whole test split.
fModel.eval()
testOutputs = test_step(DataModule.test_dataloader())

In [None]:
# Show the first three generated outputs.
testOutputs[:3]

In [None]:
# Replace the in-memory outputs with a previously pickled copy.
# NOTE(review): no visible cell writes test_outputs10.pkl - this relies on a file
# from an earlier session and overwrites the `testOutputs` produced above.
testOutputs_file_name = f"/content/drive/MyDrive/SE-PQA/test_outputs10.pkl"
testOutputs = pd.read_pickle(testOutputs_file_name)
testOutputs

In [None]:
# Recreate the train/test split (same test size and random_state=42 as the data module)
# so the reference tags can be lined up with the generated outputs.
len_tr = int(script_args.split_ratio[0] * MyData.shape[0])  # computed but not used below
len_te = int(script_args.split_ratio[1] * MyData.shape[0])
train_data, test_data = train_test_split(MyData,
                                          test_size=len_te,
                                          random_state=42)
test_data['tags'].head(3)

In [None]:
# Store generated and reference tags side by side and pickle the result.
# NOTE(review): `testOutputs2` is not created in any cell shown here - it must
# already exist in the kernel, otherwise this cell raises NameError.
testOutputs_file_name = f"/content/drive/MyDrive/SE-PQA/test_outputs_TG.pkl"
testOutputs2['generated_tags'] = testOutputs
# Index is reset so the targets align positionally with the generated rows.
testOutputs2['target_tags'] = test_data['tags'].reset_index(drop=True)
testOutputs2.to_pickle(testOutputs_file_name)

In [None]:
# Display the generated/target table.
testOutputs2

In [None]:
# Score generated tags against the reference tags with BERTScore on the GPU.
scorer = BERTScorer(lang="en", device="cuda")
P, R, F1 = scorer.score(testOutputs2['generated_tags'].to_list(), testOutputs2['target_tags'].to_list(), verbose=False)
# Per-sample values first, then their means.
print(f"BERTScore Precision: {P}")
print(f"BERTScore Recall: {R}")
print(f"BERTScore F1: {F1}")
print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

In [None]:
# Average ROUGE-1 / ROUGE-2 / ROUGE-L between generated and reference tags.
rouge = Rouge()
# scores = rouge.get_scores(testOutputs['generated_tags'].to_list(), testOutputs['target_tags'].to_list())
# print(scores)
scores2 = rouge.get_scores(testOutputs2['generated_tags'].to_list(), testOutputs2['target_tags'].to_list(), avg=True)
print("rouge-1:", scores2['rouge-1'])
print("rouge-2:",scores2['rouge-2'])
print("rouge-l:",scores2['rouge-l'])

# Indexer -> Retriever

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re

In [None]:
# Read only the question columns needed for indexing and retrieval.
columns = ['Id', 'AcceptedAnswerId', 'CreationDate', 'Body']      # 'Community'
Questions = pd.read_csv('/content/drive/MyDrive/SE-PQA/questions_with_answer.csv', usecols=columns)
display(Questions.head(3))
print("Total number of questions with accepted answer = ", len(Questions))

In [None]:
# Keep questions whose body has fewer than 500 whitespace-separated words.
# `.str.len()` yields NaN for missing bodies (dropped by the comparison) instead of
# raising like `apply(len)`; `.copy()` makes the result an independent frame so the
# later in-place `.loc` assignments do not operate on a slice of the original.
Questions = Questions[Questions['Body'].str.split().str.len() < 500].copy()

In [None]:
# Read the answers: parent question id, answer id and answer text.
columns = ['ParentId', 'Id', 'Text']
Answers = pd.read_csv('/content/drive/MyDrive/SE-PQA/answers.csv', usecols=columns)
Answers.head(3)

In [None]:
# Report how many answer rows were loaded.
print("Total number of answers = ", len(Answers))

In [None]:
# Rename the answer key so it does not collide with the question 'Id' in the merge below.
Answers = Answers.rename(
    columns={'Id': 'Id_Answer'})

In [None]:
def remove_html_tags(text):
    """Return the text content of ``text`` after parsing it as HTML."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def clean_text(text):
    """Strip HTML markup and URL-like tokens from ``text``, then trim outer whitespace.

    A URL-like token is any run of non-space characters starting with
    ``http`` or ``www``.
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    without_links = re.sub(r'http\S+|www\S+|https\S+', '', plain_text, flags=re.MULTILINE)
    return without_links.strip()

In [None]:
# Clean the question bodies in place.
# NOTE(review): clean_text already strips HTML, so bodies are parsed as HTML twice.
# Text that was entity-escaped in the source (e.g. "&lt;T&gt;") becomes "<T>" after the
# first pass and may then be removed as a tag by the second - confirm this is intended.
Questions.loc[:, 'Body'] = Questions['Body'].apply(remove_html_tags)
Questions.loc[:, 'Body'] = Questions['Body'].apply(clean_text)

In [None]:
# Join each question to its accepted answer: the answer's parent must be the question
# and the answer id must equal the question's AcceptedAnswerId (inner join by default).
Questions_with_AcceptedAnswers = Questions.merge(
    Answers,
    left_on=['Id', 'AcceptedAnswerId'],
    right_on=['ParentId', 'Id_Answer'],
    suffixes=('', '_Answer')
)
Questions_with_AcceptedAnswers = Questions_with_AcceptedAnswers[['Id', 'Id_Answer', 'AcceptedAnswerId', 'Text', 'CreationDate', 'Body']]
# Keep one row per (question, answer) pair; the last occurrence wins.
Questions_with_AcceptedAnswers = Questions_with_AcceptedAnswers.drop_duplicates(subset=['Id', 'Id_Answer'], keep='last')
print(len(Questions_with_AcceptedAnswers))
display(Questions_with_AcceptedAnswers.head(3))

In [None]:
!pip install -q sentence-transformers

In [None]:
!pip install -q torch

In [None]:
from sentence_transformers import SentenceTransformer, util
import time
import torch
from IPython.display import display, HTML

In [None]:
# Free cached CUDA memory before loading the sentence encoder.
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [None]:
# Encode every question body with a sentence-transformer and persist the vectors.
batch_size = 512
model = SentenceTransformer('all-mpnet-base-v2').cuda()

def generate_embeddings(texts, batch_size=batch_size):
    """Encode ``texts`` with the notebook-level ``model`` and return the result as a tensor."""
    embeddings = model.encode(texts, convert_to_tensor=True, show_progress_bar=True, batch_size=batch_size)
    return embeddings

# Time the full encoding pass.
start_time = time.time()
embeddings = generate_embeddings(Questions_with_AcceptedAnswers['Body'].tolist(), batch_size=batch_size)
end_time = time.time()
vectorize_time = end_time - start_time

# NOTE(review): this writes under .../EMNLP/ while the other artifacts live under
# .../SE-PQA/ - confirm the directory exists and is the intended location.
address2 = f"/content/drive/MyDrive/EMNLP/Questions_with_AcceptedAnswers_mpnet.pt"
torch.save(embeddings, address2)

# Read the tensor back; later cells use this reloaded copy.
Questions_with_AcceptedAnswers_vec = torch.load(address2)
print(Questions_with_AcceptedAnswers_vec.shape)

# Convert the total time into an average per encoded question.
vectorize_time = vectorize_time/ len(embeddings)
display(HTML('<span style="color: red"> average vectorize time = </span><b>' + str(vectorize_time) + '</b>'))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Load the cleaned question-pair dataset and take a fixed 80/20 split (random_state=42).
MyData3 = pd.read_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500.pkl")
len_tr = int(0.8 * MyData3.shape[0])  # computed but not used below
len_te = int(0.2 * MyData3.shape[0])
train_data, test_data = train_test_split(MyData3,
                                          test_size=len_te,
                                          random_state=42)
test_data

In [None]:
# Evaluate retrieval on a fixed random subset of 148 test rows (seeded for repeatability).
test_data = test_data.sample(n=148, random_state=42)
test_data.head(3)

In [None]:
# Number of neighbours to retrieve per query.
n = 50
# Row-wise cosine similarity between the query vector and every candidate vector.
cosine_sim_func = torch.nn.CosineSimilarity(dim=1, eps=1e-08)
# Fresh 0..N-1 index, so DataFrame row positions can be used to address rows of the embedding tensor.
Questions_with_AcceptedAnswers2 = Questions_with_AcceptedAnswers.reset_index(drop=True)

def get_top_n_similar_questions(row, QsAccAnsVect, QsAccAnsBody, n):
    """Retrieve the ``n`` questions most similar to the query question ``row['id_Q1']``.

    Similarity is computed with the notebook-level ``cosine_sim_func`` between the
    query's embedding and the embeddings of all other questions.

    Args:
        row: mapping with key 'id_Q1', the Id of the query question.
        QsAccAnsVect: 2-D tensor of embeddings, one row per row of ``QsAccAnsBody``
            in the same order.
        QsAccAnsBody: DataFrame with an 'Id' column, aligned with ``QsAccAnsVect``.
        n: maximum number of neighbours to return.

    Returns:
        tuple: ``(top_n_indices, top_n_scores, similar_questions)``. The first two
        are NumPy arrays ordered from most to least similar; the indices address
        the candidate set (all questions except the query). The third is the
        matching DataFrame rows in the same order.

    Raises:
        KeyError: if the query Id is not present in ``QsAccAnsBody``.
    """
    # Build the query mask once and reuse it for both the tensor and the frame.
    is_query = (QsAccAnsBody['Id'] == row['id_Q1']).to_numpy()
    if not is_query.any():
        raise KeyError(f"question id {row['id_Q1']!r} has no embedding in the index")
    query_mask = torch.from_numpy(is_query).to(QsAccAnsVect.device)

    question_vector = QsAccAnsVect[query_mask]
    QSA = QsAccAnsVect[~query_mask]
    cosine_sim = cosine_sim_func(question_vector, QSA).cpu().numpy()

    # One ascending argsort provides both the indices and the scores, so the two
    # outputs always describe the same candidates.
    ascending_order = np.argsort(cosine_sim)
    top_n_indices = np.flip(ascending_order[-n:])
    top_n_scores = np.flip(cosine_sim[ascending_order][-n:])

    candidates = QsAccAnsBody[~is_query].reset_index(drop=True)
    similar_questions = candidates.iloc[top_n_indices]

    return top_n_indices, top_n_scores, similar_questions



# Retrieve the top-n neighbours for every sampled test question and time each lookup.
retrival_results = []
total_time = 0

for i, row in test_data.iterrows():
    start_time = time.time()
    top_n_indices, cosine_sim, similar_questions = get_top_n_similar_questions(row, Questions_with_AcceptedAnswers_vec, Questions_with_AcceptedAnswers2, n)

    end_time = time.time()
    retrieval_time = end_time - start_time
    total_time += retrieval_time

    # One record per query: the query itself plus the retrieved questions and answers.
    retrival_results.append({
        'RQE Question': row['body_Q1'],
        'RQE Answer': row['answer_body_Q1'],
        'Cosine similarities': cosine_sim,
        'Top n Similar Questions Body': similar_questions['Body'].tolist(),
        'Top n Candidate Answers': similar_questions['Text'].tolist(),
        'Top n Similar Questions (Cosine)': top_n_indices,
        'Top n Similar Questions (id)': similar_questions['Id'].tolist(),
        'Retrieval Time (seconds)': retrieval_time
    })

avg_retrieval_time = total_time / len(test_data)                        # Calculate average retrieval time

In [None]:
# Persist the retrieval records as a pickled DataFrame.
address3 = f"/content/drive/MyDrive/SE-PQA/retrival_results_mpnet-50.pkl"
(pd.DataFrame(retrival_results)).to_pickle(address3)

In [None]:
# Reload the saved retrieval results and print the first record as a sanity check.
retrival_results = pd.read_pickle(address3)
for index, result in retrival_results.iterrows():
    print("RQE Question:", result['RQE Question'])
    print("RQE Answer:", result['RQE Answer'])
    print('Cosine similarities:', result['Cosine similarities'])
    print("Top n Similar Questions Body:", result['Top n Similar Questions Body'])
    print("Top n Similar Questions (Cosine):", result['Top n Similar Questions (Cosine)'])
    print("Top n Similar Questions (id):", result['Top n Similar Questions (id)'])
    print("Top n Candidate Answers:", result['Top n Candidate Answers'])
    print("Retrieval Time (seconds):", result['Retrieval Time (seconds)'])
    print("\n")
    # Only the first row is shown.
    break

# User modeling

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re

In [None]:
# Read the question columns needed to build per-user histories (AccountId identifies the asker).
columns = ['Id', 'CreationDate', 'Body', 'AccountId']      # 'Community'
Questions = pd.read_csv('/content/drive/MyDrive/SE-PQA/questions_with_answer.csv', usecols=columns)
display(Questions.head(3))
print("Total number of questions with accepted answer = ", len(Questions))

In [None]:
# Keep questions whose body has fewer than 500 whitespace-separated words.
# `.str.len()` yields NaN for missing bodies (dropped by the comparison) instead of
# raising like `apply(len)`; `.copy()` makes the result an independent frame so the
# later in-place `.loc` assignments do not operate on a slice of the original.
Questions = Questions[Questions['Body'].str.split().str.len() < 500].copy()

In [None]:
def remove_html_tags(text):
    """Return the text content of ``text`` after parsing it as HTML."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def clean_text(text):
    """Strip HTML markup and URL-like tokens from ``text``, then trim outer whitespace.

    A URL-like token is any run of non-space characters starting with
    ``http`` or ``www``.
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    without_links = re.sub(r'http\S+|www\S+|https\S+', '', plain_text, flags=re.MULTILINE)
    return without_links.strip()

In [None]:
# Clean the question bodies in place.
# NOTE(review): clean_text already strips HTML, so bodies are parsed as HTML twice.
# Text that was entity-escaped in the source (e.g. "&lt;T&gt;") becomes "<T>" after the
# first pass and may then be removed as a tag by the second - confirm this is intended.
Questions.loc[:, 'Body'] = Questions['Body'].apply(remove_html_tags)
Questions.loc[:, 'Body'] = Questions['Body'].apply(clean_text)

In [None]:
# Load the question-pair dataset and count the distinct askers of Q1.
MyData3 = pd.read_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500.pkl")
len(MyData3['userid_Q1'].unique())

In [None]:
# Build one history record per distinct Q1 asker: the ids and bodies of every
# question in `Questions` posted by that account.
history_columns = ['userid', 'historyCount', 'historyIDs', 'history']
unique_users = pd.DataFrame(MyData3['userid_Q1'].unique(), columns=['userid_Q1'])

# Collect plain records and build the frame once at the end; concatenating onto a
# growing DataFrame inside the loop copies all previous rows on every iteration.
history_records = []
for user_id in unique_users['userid_Q1']:
    user_history = Questions.loc[Questions['AccountId'] == user_id, ['Id', 'Body']]
    user_history = user_history.drop_duplicates(subset=['Id'])
    history_records.append({
        'userid': user_id,
        'historyCount': len(user_history),
        # Ids and bodies are stored as ', '-joined strings, in matching order.
        'historyIDs': ', '.join(user_history['Id'].astype(str)),
        'history': ', '.join(user_history['Body'].astype(str)),
    })

# Passing `columns` keeps the expected schema even when there are no users.
user_history_df = pd.DataFrame(history_records, columns=history_columns)

user_history_df_filepath = f"/content/drive/MyDrive/SE-PQA/user_history_df.pkl"
user_history_df.to_pickle(user_history_df_filepath)
display(user_history_df)

In [None]:
!pip install -q -U transformers         # ==4.31.0
!pip install -q torchmetrics
!pip install -q pytorch_lightning
!pip install -q bitsandbytes
!pip install -q -U peft                    # ==0.4.0
!pip install -q accelerate              # ==0.21.0
!pip install -q trl
!pip install -q tensorboard
!pip install -q datasets
!pip install -q rouge
!pip install -q bert-score

In [None]:
import os
import gc
import re
import torch
import warnings
import nltk
import json
import time
import requests
nltk.download('punkt')

import numpy as np
import pandas as pd
import bitsandbytes as bnb
import pytorch_lightning as pl
import matplotlib.pyplot as plt

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Callback
from tensorboard import notebook

from torchmetrics import MetricCollection
from torchmetrics.text.bert import BERTScore
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.classification import (
    BinaryAccuracy,
    BinaryPrecision,
    BinaryRecall,
    BinaryF1Score
    )

from peft import (
    TaskType,
    PeftModel,
    PeftConfig,
    LoraConfig,
    get_peft_model,
    AutoPeftModelForCausalLM,
    prepare_model_for_kbit_training,
    )

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    HfArgumentParser,
    TrainingArguments,
    )

from dataclasses import dataclass, field
from nltk.tokenize import word_tokenize
from typing import Optional
from tqdm import tqdm
from bert_score import BERTScorer
from rouge import Rouge
from statistics import mean
from sklearn.model_selection import train_test_split

tqdm.pandas()
warnings.filterwarnings('ignore')
import transformers
print(transformers.__version__)

In [None]:
# Delimiter strings wrapped around the system prompt and the instruction when building prompts.
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
B_INST, E_INST = "[INST]", "[/INST]"

In [None]:
def get_tg_prompt(_question, _tags = None):
  """Build the tag-generation prompt for ``_question``.

  The system and user prompts are wrapped in the notebook-level B_INST/E_INST
  and B_SYS/E_SYS delimiters. When ``_tags`` is truthy it is appended, followed
  by ``</s>``, as the expected response; otherwise the prompt ends right after
  the instruction block.
  """
  instructions = 'You are a Tag Generator. Respond only with a list of tags; do not include any additional text or explanations.'
  request = f'''Please generate at least 5 tags for the provided question. Tags can include multi-word phrases if appropriate and should help hierarchically categorize the question's topics.
### Question:
{_question}
### Tags:
'''
  full_prompt = f"{B_INST} {B_SYS}{instructions}{E_SYS}{request} {E_INST}\n\n"
  if not _tags:
    return full_prompt
  return full_prompt + f'{_tags}</s>'

In [None]:
def get_response_index(_input_ids, _task):
  """Return the position in ``_input_ids`` at which the response part starts.

  The position is anchored on a task-specific occurrence of token id 29937
  (presumably the tokenizer's '#' token - verify against the tokenizer) and
  then moved forward by a task-specific number of tokens.

  Args:
    _input_ids: iterable of token ids (list or 1-D tensor).
    _task: one of 'RQE', 'SUM' or 'TG'.

  Returns:
    int: anchor position plus the task's skip count when enough marker tokens
    are present; otherwise 0 for 'RQE' and -1 for the other tasks.

  Raises:
    ValueError: if ``_task`` is not a supported task name.
  """
  # task -> (zero-based occurrence of the marker to anchor on, tokens to skip after it)
  _task_settings = {
      'RQE': (2, 10),
      'SUM': (1, 11),
      'TG': (1, 10),
  }
  if _task not in _task_settings:
    raise ValueError(f"Unsupported task {_task!r}; expected one of {sorted(_task_settings)}")
  _index, _skip_tokens = _task_settings[_task]

  # Scan the sequence once and reuse the marker positions.
  hashtags_indexes = [i for i, n in enumerate(_input_ids) if n == 29937]
  if len(hashtags_indexes) > _index:
    return hashtags_indexes[_index] + _skip_tokens
  if _task == 'RQE':
    return 0
  return -1

In [None]:
def generate_prompt(data, tokenizer, is_eval):
  """Build the tag-generation prompt for one sample.

  The question text is tokenized, limited to its first 380 token ids and
  decoded back to text before being placed in the prompt.

  Args:
    data: mapping with 'text' (the question) and, for training, 'tags'.
    tokenizer: tokenizer used to truncate the question.
    is_eval: when true the prompt ends after the instruction; otherwise the
      reference tags are appended as the expected response.

  Returns:
    str: the prompt produced by ``get_tg_prompt``.
  """
  q1 = tokenizer.decode(tokenizer(data['text'])['input_ids'][:380],
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=True)
  if is_eval:
    return get_tg_prompt(q1)
  return get_tg_prompt(q1, data['tags'])

In [None]:
@dataclass
class ScriptArguments:
    """Configuration for the tag-generation run: model, logging, hyper-parameters, LoRA and quantization.

    Each field's ``metadata["help"]`` describes it. The annotated type is what an
    argument parser built from this dataclass uses to convert command-line values,
    so it must match the kind of value the field holds.
    """
    # ##########################################################################
    #                             Configuration
    # ##########################################################################
    model_name: Optional[str] = field(
        default = f"/content/drive/MyDrive/llama-2-7b-chat-hf",
        metadata = {"help": "The model that you want to train from the Hugging Face hub."}
      )
    adapter_name: Optional[str] = field(
        default = "LLama-TG",
        metadata = {"help": "The adapter name saved in the HuggingFace hub."}
      )
    save_to: Optional[str] = field(
        default = "Drive",                                                      # Save to "Hub", or "Drive", or "Both"
        metadata = {"help": "Determine where to save Adapters"}
      )
    # ##########################################################################
    #                         Logs and Checkpoints
    # ##########################################################################
    logging_steps: Optional[int] = field(
        default = 1,
        metadata = {"help": "log every X update steps"}
      )
    output_dir: Optional[str] = field(
        default = "/content/SE-PQA",
        metadata = {"help": "the output directory for both logs and checkpoints"}
      )
    every_n_epochs : Optional[int] = field(
        default = 1,
        metadata = {"help": "Save checkpoints every X epochs"}
      )
    save_on_train_epoch_end: Optional[bool] = field(
        default = None,
        metadata = {"help": "Whether to run checkpointing at the end of training epochs or validation"}
      )
    total_num_samples: Optional[str] = field(
        default = 'All',                                                        # Use {your desired number of samples} or 'All'
        metadata = {"help": "Number of samples to be selected from the whole dataset"}
      )
    # ##########################################################################
    #                             Hyper-parameters
    # ##########################################################################
    max_epochs: Optional[int] = field(
        default = 10,
        metadata = {"help": "maximum number of training epochs."}
      )
    learning_rate: Optional[float] = field(
        default = 1e-4,
        metadata = {"help": "the learning rate"}
      )
    gradient_accumulation_steps: Optional[int] = field(
        default = 8,
        metadata = {"help": "the number of gradient accumulation steps"}
      )
    gradient_checkpointing: Optional[bool] = field(
        default = True,
        metadata = {"help": "Enables gradient checkpointing."}
      )
    per_device_train_batch_size: Optional[int] = field(
        default = 4,
        metadata = {"help": "batch_size of training (per device)"}
      )
    per_device_eval_batch_size: Optional[int] = field(
        default = 1,
        metadata = {"help": "batch_size of validation (per device)"}
      )
    max_seq_length: Optional[int] = field(
        default = 512,
        metadata = {"help": "maximum input sequence length"}
      )
    trust_remote_code: Optional[bool] = field(
        default = True,
        metadata = {"help": '''Enable `trust_remote_code` so that it
        will execute code present on the Hub on your local machine'''}
      )
    # NOTE(review): the default is a 3-tuple although the annotation says float;
    # left unchanged because overriding a tuple from the command line needs its own handling.
    split_ratio: Optional[float] = field(
        default = (0.8, 0.2, 0),
        metadata = {"help": "train/test/validation splits"}
      )
    # NOTE(review): the help text also mentions 'bf16', which is not an int.
    precision: Optional[int] = field(
        default = 16,
        metadata = {"help": "train with 16/32/bf16 precision."}
      )
    # A count of batches, hence an integer.
    num_sanity_val_steps: Optional[int] = field(
        default = 0,
        metadata = {"help": "number of validation batches before the first training epoch"}
      )
    max_new_tokens: Optional[int] = field(
        default = 30,
        metadata = {"help": "the maximum number of new tokens in the generated sequences (test step)"}
      )
    # ##########################################################################
    #                             Lora Configuration
    # ##########################################################################
    use_peft: Optional[bool] = field(
        default = True,
        metadata = {"help": "Wether to use PEFT or not to train adapters"}
      )
    lora_r: Optional[int] = field(
        default = 64,
        metadata = {"help": "the r parameter of the LoRA adapters"}
      )
    lora_alpha: Optional[int] = field(
        default = 64,
        metadata = {"help": "the alpha parameter of the LoRA adapters"}
      )
    # A dropout rate is fractional (default 0.1), hence a float.
    lora_dropout: Optional[float] = field(
        default = 0.1,
        metadata = {"help": "the dropout rate of the LoRA adapters"}
      )
    # ##########################################################################
    #                                 BitsAndBytes
    # ##########################################################################
    load_in_8bit: Optional[bool] = field(
        default = False,
        metadata = {"help": "load the model in 8 bits precision"}
      )
    load_in_4bit: Optional[bool] = field(
        default = False,
        metadata = {"help": "load the model in 4 bits precision"}
      )
    use_nested_quant: Optional[bool] = field(
        default = False,
        metadata = {"help": "Activate nested quantization for 4bit base models"}
      )
    bnb_4bit_compute_dtype: Optional[str] = field(
        default = "float16",
        metadata = {"help": "Compute dtype for 4bit base models"}
      )
    bnb_4bit_quant_type: Optional[str] = field(
        default = "nf4",
        metadata = {"help": "Quantization type fp4 or nf4"}
      )

# Build the arguments object from the dataclass defaults (plus any recognised CLI flags).
parser = HfArgumentParser(ScriptArguments)
# return_remaining_strings=True returns unparsed arguments alongside the dataclass;
# only the dataclass instance ([0]) is kept - presumably so kernel-supplied argv entries do not abort parsing.
script_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)[0]
# Seed the random number generators for repeatable runs.
pl.seed_everything(42)

In [None]:
class OverrideEpochStepCallback(Callback):
    """Callback that logs the 1-based epoch number under the key "step".

    The value is logged at the end of every train, validation and test epoch.
    """

    def _log_step_as_current_epoch(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        # current_epoch is zero-based; shift it so the first epoch is logged as 1.
        epoch_number = trainer.current_epoch + 1
        pl_module.log("step", epoch_number)

    def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

    def on_validation_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

    def on_test_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

# Checkpoint at the epoch interval configured in script_args.
checkpoint_callback = ModelCheckpoint(every_n_epochs=script_args.every_n_epochs)

In [None]:
class TGModel(pl.LightningModule):
    """LightningModule wrapping a causal language model for tag generation.

    The base model is loaded in 4-bit, 8-bit or bfloat16 depending on
    ``script_args`` and optionally wrapped with LoRA adapters. At the end of
    every training epoch the model is saved under a name made of the adapter
    name and a running epoch counter.
    """
    def __init__(self, script_args):
        super(TGModel, self).__init__()
        self.save_hyperparameters()
        self.Setup(script_args)
        self.rouge = ROUGEScore()
        self.adapter_name = script_args.adapter_name
        # Keep the learning rate of the arguments this instance was built with, so
        # the optimizer does not depend on a notebook-level variable.
        self.learning_rate = script_args.learning_rate
        self.epoch_n = 1

    def Setup(self, script_args):
        """Load the base model and tokenizer and apply the optional LoRA wrapper.

        Raises:
            ValueError: if both 4-bit and 8-bit loading are requested.
        """
        if script_args.load_in_4bit and script_args.load_in_8bit:
          raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")
        elif script_args.load_in_4bit:
          # Imported here because the notebook's import cell does not bring this name in.
          from transformers import BitsAndBytesConfig

          compute_dtype = getattr(torch, script_args.bnb_4bit_compute_dtype)

          bnb_config = BitsAndBytesConfig(
              load_in_4bit = script_args.load_in_4bit,
              bnb_4bit_quant_type = script_args.bnb_4bit_quant_type,
              bnb_4bit_compute_dtype = compute_dtype,
              bnb_4bit_use_double_quant = script_args.use_nested_quant,
          )
          # NOTE(review): unlike the 8-bit branch, this branch does not call
          # prepare_model_for_kbit_training - confirm whether that is intended.
          self.model = AutoModelForCausalLM.from_pretrained(
              script_args.model_name,
              quantization_config = bnb_config,
              device_map = {"": 0},
          )
        elif script_args.load_in_8bit:
          self.model = AutoModelForCausalLM.from_pretrained(
              script_args.model_name,
              load_in_8bit = True,
              torch_dtype = torch.float16,
              device_map = {"": 0},
          )
          self.model = prepare_model_for_kbit_training(self.model)
        else:
          self.model = AutoModelForCausalLM.from_pretrained(
              script_args.model_name,
              torch_dtype = torch.bfloat16,
              device_map = {"": 0},
          )

        if script_args.use_peft:
            lora_config = LoraConfig(
                task_type = TaskType.CAUSAL_LM,
                r = script_args.lora_r,
                lora_alpha = script_args.lora_alpha,
                lora_dropout = script_args.lora_dropout,
                bias = "none",
            )
            self.model = get_peft_model(self.model, lora_config)
            self.model.print_trainable_parameters()

        self.model.config.use_cache = False

        self.tokenizer = AutoTokenizer.from_pretrained(
            script_args.model_name,
            padding_side='left'
        )
        # Token id 0 is used for padding, on the tokenizer and in the model config.
        self.tokenizer.pad_token_id = 0
        self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``."""
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
            )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        loss, _ = self.forward(input_ids, attention_mask, labels)
        self.log('train_loss', loss.item(), on_epoch=True, on_step=True)
        return loss


    def on_train_epoch_end(self):
      """Save the model after each epoch as ``<adapter_name><epoch counter>``."""
      out_dir = f"/content/drive/MyDrive/SE-PQA/TG-Adapters/"
      self.model.save_pretrained(out_dir + self.adapter_name + str(self.epoch_n))
      self.epoch_n += 1

    def generate(self, *args, **kwargs):
      """Forward all arguments to the wrapped model's ``generate``."""
      return self.model.generate(*args, **kwargs)

    def configure_optimizers(self):
        """Return an AdamW optimizer over the model parameters."""
        return torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate)

In [None]:
class TGDataset(torch.utils.data.Dataset):
    """Dataset that turns rows with 'text' (and 'tags') into tokenized prompts.

    In training mode (``is_eval=False``) the labels are the input ids with
    everything before the response start replaced by -100. In evaluation mode
    the labels are the token ids of the reference tags followed by ``</s>``.
    """
    def __init__(self, data, tokenizer, max_len, is_eval):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_eval = is_eval

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
      """Return a dict with 'input_ids', 'attention_mask' and 'labels' for row ``index``."""
      row_data = self.data.iloc[index]
      prompt = generate_prompt(row_data, self.tokenizer, self.is_eval)
      prompt_encoding = self.tokenizer(
          prompt,
          max_length = self.max_len,
          padding = 'max_length',
          truncation = True,
          add_special_tokens = True,
          return_tensors = 'pt',
      )
      input_ids = prompt_encoding['input_ids'].squeeze()
      attention_mask = prompt_encoding['attention_mask'].squeeze()

      if self.is_eval == False:
        response_index = get_response_index(input_ids, 'TG')
        # get_response_index signals "not found" with -1, and the returned offset
        # can also land beyond the end of a truncated sequence; both cases would
        # otherwise produce an invalid or wrongly sized label tensor.
        if response_index is not None and 0 <= response_index < input_ids.numel():
          labels = torch.cat((torch.full((response_index,), -100), input_ids[response_index:])).squeeze()
        else:
          print('response_index not found')
          # Mask the whole sample so it contributes nothing to the loss.
          # NOTE(review): a batch consisting only of such samples has no supervised token.
          labels = torch.full_like(input_ids, -100)
      else:
        labels = self.tokenizer(
            row_data['tags'] + '</s>',
            add_special_tokens = False,
            return_tensors='pt',
        )
        labels = labels['input_ids'].squeeze()
      return {
          'input_ids': input_ids,
          'attention_mask': attention_mask,
          'labels': labels
      }

In [None]:
class TGDataModule(pl.LightningDataModule):
    """Data module that splits a DataFrame into train/test ``TGDataset`` objects.

    The split uses ``split_ratio[1]`` of the rows for testing with a fixed
    ``random_state`` of 42, so it is identical on every call.
    """
    def __init__(self, data, tokenizer, script_args):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.per_device_train_batch_size = script_args.per_device_train_batch_size
        self.per_device_eval_batch_size = script_args.per_device_eval_batch_size
        self.max_len = script_args.max_seq_length
        # Take the split from the arguments passed in, not from a notebook-level variable.
        self.split_ratio = script_args.split_ratio
        # Built eagerly so the dataloaders can be used without a Trainer.
        self.setup()

    def setup(self, stage=None):
        """Create the train and test datasets; ``stage`` is accepted but not used."""
        len_te = int(self.split_ratio[1] * self.data.shape[0])
        train_data, test_data = train_test_split(self.data,
                                                 test_size=len_te,
                                                 random_state=42)
        train_data.reset_index(drop=True, inplace=True)
        test_data.reset_index(drop=True, inplace=True)

        self.train_data = TGDataset(train_data, self.tokenizer, self.max_len, is_eval=False)
        self.test_data = TGDataset(test_data, self.tokenizer, self.max_len, is_eval=True)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return torch.utils.data.DataLoader(
            self.train_data,
            batch_size=self.per_device_train_batch_size,
            shuffle=True,
            num_workers=8,
        )

    def test_dataloader(self):
        """Sequential (unshuffled) loader over the test split."""
        return torch.utils.data.DataLoader(
            self.test_data,
            sampler = torch.utils.data.SequentialSampler(self.test_data,),
            batch_size= self.per_device_eval_batch_size,
            num_workers=8
        )

In [None]:
# Load the base causal LM from Drive and place all of it on device 0.
BaseModel= AutoModelForCausalLM.from_pretrained(
    f"/content/drive/MyDrive/llama-2-7b-chat-hf",
    device_map={"": 0},
    offload_folder="offload",
    offload_state_dict = True,
    # load_in_8bit = True,
    )

# Attach the adapter stored under TG-Adapters/LLama-TG10 and merge it into the
# base weights, so `fModel` ends up as a plain model without PEFT wrappers.
address = f"/content/drive/MyDrive/SE-PQA/TG-Adapters/LLama-TG10"
print("\n Loading model from ", address, "\n")
config = PeftConfig.from_pretrained(address)  # loaded but not referenced again in this cell
fModel= PeftModel.from_pretrained(BaseModel,address,device_map={"": 0})
fModel = fModel.merge_and_unload()
print(fModel)
print(fModel.config)
print("\n Model successfully loded from ", address, "\n")


# Left padding keeps the prompt adjacent to the generated continuation.
tokenizer = AutoTokenizer.from_pretrained(
    script_args.model_name,
    # add_special_tokens = False,
    padding_side='left'
    )

# Token id 0 is used for padding, on the tokenizer and in the model config.
tokenizer.pad_token_id = 0
fModel.config.pad_token_id = tokenizer.pad_token_id

In [None]:
def test_step(test_dl):
  """Generate tags for every batch of ``test_dl`` and return them as strings.

  Uses the notebook-level ``fModel``, ``tokenizer``, ``script_args`` and
  ``get_response_index``. The generated ids are squeezed and decoded as one
  sequence, so the loader is expected to yield one sample per batch.
  Generation samples (``do_sample=True``), so outputs vary between runs.

  Args:
    test_dl: iterable of batches holding 'input_ids' and 'attention_mask' tensors.

  Returns:
    list[str]: one decoded response per batch, without a trailing ``</s>``.
  """
  testOutputs = []
  eos_text = '</s>'

  for batch in test_dl:
    input_ids = batch['input_ids'].cuda()
    attention_mask = batch['attention_mask'].cuda()

    # Inputs are padded to a fixed length, so the mask is passed explicitly to
    # keep padding positions out of attention.
    generated_txts_ids = fModel.generate(
        input_ids = input_ids,
        attention_mask = attention_mask,
        max_new_tokens = script_args.max_new_tokens,
        do_sample=True,
        temperature=0.97
        ).squeeze()

    generated_txts = tokenizer.decode(
        generated_txts_ids[get_response_index(generated_txts_ids, 'TG'):],
        skip_special_tokens = False,
        clean_up_tokenization_spaces = True
        )

    # Remove the end-of-sequence marker only when it is actually there; a
    # generation stopped by max_new_tokens has none, and blindly cutting four
    # characters would delete real text.
    if generated_txts.endswith(eos_text):
      generated_txts = generated_txts[:-len(eos_text)]
    testOutputs.append(generated_txts)

  return testOutputs

In [None]:
# Reload the per-user question histories saved earlier.
user_history_df_filepath = f"/content/drive/MyDrive/SE-PQA/user_history_df.pkl"
user_history_df = pd.read_pickle(user_history_df_filepath)
display(user_history_df)

In [None]:
# Generate tags for up to 10 history items per user and store them joined by ' -- '.
start_time = time.time()
fModel.eval()
# user_history_df['generated_tags'] = None

for index, row in user_history_df.iterrows():
    # NOTE(review): rows with index <= 62 are skipped - presumably a resume point from
    # an interrupted run. On a fresh kernel those rows get no 'generated_tags'.
    if index>62:
      history = row['history']
      # NOTE(review): 'history' is a ', '-joined string of question bodies, so splitting
      # on ', ' also splits at commas inside a body; the 10 items are not necessarily 10 questions.
      history_questions = pd.DataFrame(history.split(', ')[:10], columns=['text'])
      # Empty reference tags: only the generated output is used here.
      history_questions['tags'] = ""
      historytags = []

      data = TGDataset(history_questions, tokenizer, 512, is_eval=True)
      DL = torch.utils.data.DataLoader(
              data, sampler = torch.utils.data.SequentialSampler(data),
              batch_size= 1, num_workers=8
          )
      historytags = test_step(DL)
      user_history_df.at[index, 'generated_tags']= ' -- '.join(historytags)

print("--- %s seconds ---" % (time.time() - start_time))
user_history_df_filepath = f"/content/drive/MyDrive/SE-PQA/user_history_gen_tags.pkl"
user_history_df.to_pickle(user_history_df_filepath)
display(user_history_df)

In [None]:
# Remove every '/' character from the generated tag strings.
user_history_df['generated_tags'] = user_history_df['generated_tags'].str.replace('/', '')

In [None]:
# Flatten each user's ' -- '-separated tag groups (last 10 at most) into one ', '-separated string.
user_history_df['generated_tags2'] = None
user_history_df

for index, row in user_history_df.iterrows():
  # NOTE(review): this raises if 'generated_tags' is missing (NaN) for a row, e.g. for
  # rows skipped by the generation loop.
  A = row['generated_tags'].split(' -- ')[-10:]
  B = ', '.join(A)
  user_history_df.at[index, 'generated_tags2'] = B

display(user_history_df)

In [None]:
def process_row(row):
    """Return the (at most) 20 most frequent tags of ``row['generated_tags2']``.

    The input is a ', '-separated tag string; the output uses the same
    separator, with tags ordered from most to least frequent.
    """
    tag_list = row['generated_tags2'].split(', ')
    ranked_tags = pd.Series(tag_list).explode().value_counts().index.tolist()
    return ', '.join(ranked_tags[:20])

# Replace each user's flattened tag string with its top-20 tags and save the result.
user_history_df['generated_tags2'] = user_history_df.apply(process_row, axis=1)
user_history_df_filepath = f"/content/drive/MyDrive/SE-PQA/user_history_T20_gen_tags.pkl"
user_history_df.to_pickle(user_history_df_filepath)
display(user_history_df)

In [None]:
# Attach each Q1 asker's top tags to the question-pair dataset as 'U_Background_kn'.
data_path_LLama = f"/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500.pkl"
MyData_LLama = pd.read_pickle(data_path_LLama)
# Left join keeps every pair, including those whose asker has no history row.
MyData_LLama = MyData_LLama.merge(
    user_history_df[['userid', 'generated_tags2']],
    left_on='userid_Q1',
    right_on='userid',
    how='left'
)
MyData_LLama.rename(columns={'generated_tags2': 'U_Background_kn'}, inplace=True)
MyData_LLama_filepath = f"/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500_T20_UK.pkl"
MyData_LLama.to_pickle(MyData_LLama_filepath)
MyData_LLama

# Post-retrieval

In [None]:
!pip install -q -U transformers         # ==4.31.0
!pip install -q torchmetrics
!pip install -q pytorch_lightning
!pip install -q bitsandbytes
!pip install -q -U peft                    # ==0.4.0
!pip install -q accelerate              # ==0.21.0
!pip install -q trl
!pip install -q tensorboard
!pip install -q datasets
!pip install -q rouge
!pip install -q bert-score

In [None]:
import os
import re
import torch
import warnings
import nltk
import json
import time
import requests
import sklearn
import gc
nltk.download('punkt')

import numpy as np
import pandas as pd
import bitsandbytes as bnb
import pytorch_lightning as pl
import matplotlib.pyplot as plt

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Callback
from pytorch_lightning.callbacks import ModelCheckpoint
from tensorboard import notebook

from torchmetrics import MetricCollection
from torchmetrics.text.bert import BERTScore
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.classification import (
    BinaryAccuracy,
    BinaryPrecision,
    BinaryRecall,
    BinaryF1Score
    )

from peft import (
    TaskType,
    PeftModel,
    PeftConfig,
    LoraConfig,
    get_peft_model,
    AutoPeftModelForCausalLM,
    prepare_model_for_kbit_training,
    )

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    )

from dataclasses import dataclass, field
from nltk.tokenize import word_tokenize
from typing import Optional
from tqdm import tqdm
from bert_score import BERTScorer
from rouge import Rouge
from statistics import mean
from sklearn.model_selection import train_test_split
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

tqdm.pandas()
warnings.filterwarnings('ignore')
import transformers
print(transformers.__version__)

In [None]:
# Prompt delimiters used by get_rqe_prompt: B_SYS/E_SYS wrap the system
# prompt, B_INST/E_INST wrap the whole instruction.
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
B_INST, E_INST = "[INST]", "[/INST]"

In [None]:
# def get_rqe_prompt(_q1, _q2, _BN, _entailment=None):
#     system_prompt = "Help recognize question entailment"
#     user_prompt = f'''Entailment means:
# 1. every answer to Q2 must be a partial or complete answer to Q1
# 2. Q2 must be related to the topics of interest of Q1's asker, denoted by Kn.
# Respond with "positive" for entailment and "negative" for not-entailment. No other words.
# Example1:
# Q1: How can I read a PDF?
# Kn: python, programming, pandas
# Q2: Help me how to open different files such as pdf, docx, etc in Linux?
# Answer: negative

# Example2:
# Q1: How can I read a PDF?
# Kn: linux, debian, filesystems
# Q2: Help me how to open different files such as pdf, docx, etc in Linux?
# Answer: positive

# Now, evaluate the following:
# Q1: {_q1}
# Kn: {_BN}
# Q2: {_q2}
# ### Answer:
# '''
#     prompt = f"{B_INST} {B_SYS}{system_prompt}{E_SYS}{user_prompt}{E_INST}\n\n "
#     if _entailment: prompt += f"{_entailment}"
#     return prompt


def get_rqe_prompt(_q1, _q2, _entailment=None):
    """Assemble the instruction prompt for one question-entailment sample.

    Args:
        _q1: text of the first question.
        _q2: text of the second (candidate) question.
        _entailment: optional gold label; when truthy it is appended after
            the instruction block.

    Returns:
        The full prompt string.
    """
    system_prompt = "Help recognize question entailment"
    user_prompt = (
        "Entailment means every answer to Q2 must be a partial or complete answer to Q1\n"
        'Respond with "positive" for entailment and "negative" for not-entailment. No other words.\n'
        "\n"
        "Now, evaluate the following:\n"
        f"Q1: {_q1}\n"
        f"Q2: {_q2}\n"
        "### Answer:\n"
    )
    pieces = [B_INST, " ", B_SYS, system_prompt, E_SYS, user_prompt, E_INST, "\n\n "]
    if _entailment:
        # Training prompts carry the expected answer right after the instruction.
        pieces.append(f"{_entailment}")
    return "".join(pieces)

In [None]:
def get_response_index(_input_ids, _task):
  """Locate where the response starts inside a tokenized prompt.

  The position is found from the occurrences of token id 29937 (presumably
  the '#' piece of the '### ...' cue -- confirm against the tokenizer).

  Args:
    _input_ids: iterable of token ids.
    _task: 'RQE', 'SUM' or 'TG'.

  Returns:
    Index of the chosen marker plus a task-specific offset. If the marker is
    not present often enough, 0 for 'RQE' and -1 for the other tasks.
  """
  # task -> (which marker occurrence to use, how many tokens to skip after it)
  task_settings = {
      'RQE': (0, 10),
      'SUM': (1, 11),
      'TG': (1, 10),
  }
  _index, _skip_tokens = task_settings.get(_task, (None, None))
  marker_positions = [pos for pos, token in enumerate(_input_ids) if token == 29937]
  if len(marker_positions) > _index:
    return marker_positions[_index] + _skip_tokens
  if _task == 'RQE':
    return 0
  return -1

In [None]:
# def generate_prompt_rqe(data, tokenizer, is_eval):
#   promp = None
#   q1 = tokenizer.decode(tokenizer(data['q1'])['input_ids'][:200],
#                         skip_special_tokens=True,
#                         clean_up_tokenization_spaces=True)
#   q2 = tokenizer.decode(tokenizer(data['q2'])['input_ids'][:200],
#                         skip_special_tokens=True,
#                         clean_up_tokenization_spaces=True)
#   ub = tokenizer.decode(tokenizer(data['U_Background_kn'])['input_ids'][:100],
#                         skip_special_tokens=True,
#                         clean_up_tokenization_spaces=True)
#   if is_eval: prompt = get_rqe_prompt(q1, q2, ub)
#   else: prompt = get_rqe_prompt(q1, q2, ub, data['entailment'])
#   return prompt


def generate_prompt_rqe(data, tokenizer, is_eval):
  """Build the RQE prompt for one sample, capping each question at 200 tokens.

  Args:
    data: mapping/row providing 'q1' and 'q2', plus 'entailment' when
      ``is_eval`` is false.
    tokenizer: tokenizer used to truncate the questions (encode, slice, decode).
    is_eval: when true the prompt ends after the answer cue; otherwise the
      gold label from ``data['entailment']`` is appended.

  Returns:
    The prompt string produced by ``get_rqe_prompt``.
  """
  def _truncate(text):
    # Keep at most the first 200 token ids (the count includes any special
    # tokens the tokenizer adds), then decode back to plain text.
    token_ids = tokenizer(text)['input_ids'][:200]
    return tokenizer.decode(token_ids,
                            skip_special_tokens=True,
                            clean_up_tokenization_spaces=True)

  q1 = _truncate(data['q1'])
  q2 = _truncate(data['q2'])
  if is_eval:
    return get_rqe_prompt(q1, q2)
  return get_rqe_prompt(q1, q2, data['entailment'])

In [None]:
@dataclass
class ScriptArguments:
    """Run configuration for RQE fine-tuning and evaluation.

    Each field's ``metadata["help"]`` doubles as the command-line help text
    when the class is handed to ``HfArgumentParser``; the annotations decide
    how command-line values are converted, so they must match the defaults.
    """
    # ##########################################################################
    #                             Configuration
    # ##########################################################################
    model_name: Optional[str] = field(
        # default = "mahdii1376/ReQuEST",
        default = "/content/drive/MyDrive/llama-2-7b-chat-hf",
        metadata = {"help": "The model that you want to train from the Hugging Face hub."}
      )
    adapter_name: Optional[str] = field(
        default = "LLama-RQE-Wo",
        metadata = {"help": "The adapter name saved in the HuggingFace hub."}
      )
    save_to: Optional[str] = field(
        default = "Drive",                                                       # Save to "Hub", or "Drive", or "Both"
        metadata = {"help": "Determine where to save Adapters"}
      )
    # ##########################################################################
    #                         Logs and Checkpoints
    # ##########################################################################
    logging_steps: Optional[int] = field(
        default = 1,
        metadata = {"help": "log every X update steps"}
      )
    output_dir: Optional[str] = field(
        default = "/content/SE-PQA",
        metadata = {"help": "the output directory"}
      )
    every_n_epochs : Optional[int] = field(
        default = 1,
        metadata = {"help": "Save checkpoints every X epochs"}
      )
    save_on_train_epoch_end: Optional[bool] = field(
        default = None,
        metadata = {"help": "Whether to run checkpointing at the end of training epochs or validation"}
      )
    total_num_samples: Optional[str] = field(
        default = 'All',
        metadata = {"help": "Number of samples to be selected from the whole dataset"}
      )
    # ##########################################################################
    #                             Hyper-parameters
    # ##########################################################################
    max_epochs: Optional[int] = field(
        default = 5,
        metadata = {"help": "maximum number of training epochs."}
      )
    learning_rate: Optional[float] = field(
        default = 3e-5, #2e-4,
        metadata = {"help": "the learning rate"}
      )
    gradient_accumulation_steps: Optional[int] = field(
        default = 8,
        metadata = {"help": "the number of gradient accumulation steps"}
      )
    gradient_checkpointing: Optional[bool] = field(
        default = True,
        metadata = {"help": "Enables gradient checkpointing."}
      )
    per_device_train_batch_size: Optional[int] = field(
        default = 4,
        metadata = {"help": "batch_size of training (per device)"}
      )
    per_device_eval_batch_size: Optional[int] = field(
        default = 4,
        metadata = {"help": "batch_size of validation (per device)"}
      )
    max_seq_length: Optional[int] = field(
        default = 650, #750, #650
        metadata = {"help": "maximum input sequence length"}
      )
    trust_remote_code: Optional[bool] = field(
        default = True,
        metadata = {"help": '''Enable `trust_remote_code` so that it
        will execute code present on the Hub on your local machine'''}
      )
    # NOTE(review): the default is a 3-tuple although the annotation says
    # float; the annotation is left as-is so command-line parsing of this
    # field does not change. Only the default is used in this notebook.
    split_ratio: Optional[float] = field(
        default = (0.8, 0.2, 0),
        metadata = {"help": "train/test/validation splits"}
      )
    precision: Optional[int] = field(
        default = 16,
        metadata = {"help": "train with 16/32/bf16 precision."}
      )
    # An integer count of batches (was annotated as float).
    num_sanity_val_steps: Optional[int] = field(
        default = 0,
        metadata = {"help": "number of validation batches before the first training epoch"}
      )
    max_new_tokens: Optional[int] = field(
        default = 1,
        metadata = {"help": "the maximum number of new tokens in the generated sequences (test step)"}
      )
    # ##########################################################################
    #                             Lora Configuration
    # ##########################################################################
    use_peft: Optional[bool] = field(
        default = True,
        metadata = {"help": "Wether to use PEFT or not to train adapters"}
      )
    lora_r: Optional[int] = field(
        default = 64, #16, #64, #32,
        metadata = {"help": "the r parameter of the LoRA adapters"}
      )
    lora_alpha: Optional[int] = field(
        default = 16, #16, #64, #64, #16
        metadata = {"help": "the alpha parameter of the LoRA adapters"}
      )
    # A dropout probability is fractional; the former `int` annotation would
    # make the argument parser reject values such as 0.3 on the command line.
    lora_dropout: Optional[float] = field(
        default = 0.3,
        metadata = {"help": "the dropout rate of the LoRA adapters"}
      )
    # ##########################################################################
    #                                 BitsAndBytes
    # ##########################################################################
    load_in_8bit: Optional[bool] = field(
        default = False,
        metadata = {"help": "load the model in 8 bits precision"}
      )
    load_in_4bit: Optional[bool] = field(
        default = False,
        metadata = {"help": "load the model in 4 bits precision"}
      )
    use_nested_quant: Optional[bool] = field(
        default = False,
        metadata = {"help": "Activate nested quantization for 4bit base models"}
      )
    bnb_4bit_compute_dtype: Optional[str] = field(
        default = "float16",
        metadata = {"help": "Compute dtype for 4bit base models"}
      )
    bnb_4bit_quant_type: Optional[str] = field(
        default = "nf4",
        metadata = {"help": "Quantization type fp4 or nf4"}
      )

# Build the run configuration. return_remaining_strings=True makes the parser
# return unrecognised arguments instead of failing on them (presumably needed
# because the notebook kernel is started with its own argv); [0] keeps only
# the parsed ScriptArguments instance.
parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)[0]
# Fixed global seed.
pl.seed_everything(42)

In [None]:
class OverrideEpochStepCallback(Callback):
    """Logs a ``step`` value equal to the 1-based epoch number.

    The value is written at the end of every training, validation and test
    epoch (presumably so epoch-level curves are plotted against the epoch
    number rather than the global step -- confirm with the logger in use).
    """

    def _log_step_as_current_epoch(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        epoch_number = trainer.current_epoch + 1
        pl_module.log("step", epoch_number)

    def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

    def on_validation_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

    def on_test_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

# Lightning checkpoint callback; the save period in epochs comes from script_args.
checkpoint_callback = ModelCheckpoint(every_n_epochs = script_args.every_n_epochs,)

In [None]:
class RQEModel(pl.LightningModule):
    """Lightning wrapper that fine-tunes a causal LM for question entailment (RQE).

    Loads the base model named by ``script_args.model_name`` (optionally 4- or
    8-bit quantized), wraps it with a LoRA adapter when ``script_args.use_peft``
    is set, and saves the model to Google Drive every 1000 training batches.
    """

    def __init__(self, script_args):
        """Load model and tokenizer and remember the training settings.

        Args:
            script_args: a ``ScriptArguments`` instance.
        """
        super(RQEModel, self).__init__()
        self.save_hyperparameters()
        self.Setup(script_args)
        self.rouge = ROUGEScore()
        self.adapter_name = script_args.adapter_name
        # Despite its name this counts training batches (see on_train_batch_end).
        self.epoch_n = 1
        self.validation_losses = []
        self.acc = script_args.gradient_accumulation_steps
        # Kept on the module so configure_optimizers() uses the arguments this
        # model was built with instead of a notebook-level global.
        self.learning_rate = script_args.learning_rate
        self.max_epochs = script_args.max_epochs

    def Setup(self, script_args):
        """Create ``self.model`` and ``self.tokenizer`` from ``script_args``.

        Raises:
            ValueError: if both 4-bit and 8-bit loading are requested.
        """
        if script_args.load_in_4bit and script_args.load_in_8bit:
          raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")
        elif script_args.load_in_4bit:
          compute_dtype = getattr(torch, script_args.bnb_4bit_compute_dtype)

          bnb_config = BitsAndBytesConfig(
              load_in_4bit = script_args.load_in_4bit,
              bnb_4bit_quant_type = script_args.bnb_4bit_quant_type,
              bnb_4bit_compute_dtype = compute_dtype,
              bnb_4bit_use_double_quant = script_args.use_nested_quant,
          )
          self.model = AutoModelForCausalLM.from_pretrained(
              script_args.model_name,
              quantization_config = bnb_config,
              device_map = {"": 0},
          )
        elif script_args.load_in_8bit:
          self.model = AutoModelForCausalLM.from_pretrained(
              script_args.model_name,
              load_in_8bit = True,
              torch_dtype = torch.float16,
              device_map = {"": 0},
          )
          self.model = prepare_model_for_kbit_training(self.model)

        else:
          self.model = AutoModelForCausalLM.from_pretrained(
              script_args.model_name,
              torch_dtype = torch.bfloat16,
              device_map = {"": 0},
          )

        if script_args.use_peft:
            lora_config = LoraConfig(
                task_type = TaskType.CAUSAL_LM,
                r = script_args.lora_r,
                lora_alpha = script_args.lora_alpha,
                lora_dropout = script_args.lora_dropout,
                bias = "none",
                init_lora_weights = "pissa",
            )
            # Actually attach the adapter: without this the LoRA config was
            # never used, print_trainable_parameters() (a PEFT method) did not
            # exist on the bare model, and save_pretrained() below would not
            # write an adapter that PeftModel.from_pretrained can load.
            self.model = get_peft_model(self.model, lora_config)

            self.model.print_trainable_parameters()

        # The KV cache is only useful for generation; disable it for training.
        self.model.config.use_cache = False

        self.tokenizer = AutoTokenizer.from_pretrained(
            script_args.model_name,
            padding_side='left'
        )
        # Pad with token id 0 and tell the model about it.
        self.tokenizer.pad_token_id = 0
        self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``."""
        output = self.model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels
                            )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and current learning rate."""
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].cuda()

        loss, _ = self.forward(input_ids, attention_mask, labels)
        self.log('train_loss', loss.item(), on_epoch=True, on_step=True)

        # Log learning rate
        optimizer = self.trainer.optimizers[0]
        current_lr = optimizer.param_groups[0]['lr']
        self.log('learning_rate', current_lr, on_step=True, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
      """Compute and log the validation loss for one batch."""
      with torch.no_grad():
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].cuda()

        val_loss, _ = self.forward(input_ids, attention_mask, labels)
        self.log('val_loss', val_loss.item(), on_epoch=True, on_step=True)

    def on_train_batch_end(self, outputs, batch, batch_idx):
      """Save the model to Drive every 1000 training batches."""
      if (self.epoch_n % 1000 == 0):
          out_dir = f"/content/drive/MyDrive/SE-PQA/LLAMA-RQE-WoUM/"
          # Directory name = adapter name followed by the batch counter.
          self.model.save_pretrained(out_dir + self.adapter_name + str(int(self.epoch_n)))
      self.epoch_n += 1

    def on_validation_epoch_end(self):
        """Log the mean of ``self.validation_losses`` and reset the list.

        NOTE(review): nothing in this class appends to ``validation_losses``,
        so as written this hook never logs anything.
        """
        if self.validation_losses:
            avg_val_loss = sum(self.validation_losses) / len(self.validation_losses)
            self.log("val_loss_step", avg_val_loss)
            self.validation_losses = []

    def generate(self, *args, **kwargs):
      """Forward to the wrapped model's ``generate``."""
      return self.model.generate(*args, **kwargs)

    def configure_optimizers(self):
        """Return AdamW plus a cosine-annealing schedule stepped every optimizer step."""
        # Define weight decay
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.learning_rate,
            weight_decay=0.2
            )

        # Define the scheduler
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            # Number of scheduler steps until the LR reaches its minimum.
            # 250 is presumably the number of optimizer steps per epoch
            # (8000 samples / (batch 4 x accumulation 8)) -- confirm.
            T_max=250 * self.max_epochs,
            eta_min=1e-5                                    # Minimum learning rate
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",   # Scheduler is stepped per step, not per epoch
                "frequency": 1        # Frequency of applying the scheduler
            }
        }

    def val_dataloader(self):
        """Delegate to the attached datamodule's validation dataloader."""
        return self.trainer.datamodule.val_dataloader()

In [None]:
# Reload the enriched pairs and keep just the columns the RQE dataset needs,
# with the question bodies renamed to the 'q1'/'q2' keys the prompt builder reads.
MyData_LLama_filepath = "/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500_T20_UK.pkl"
MyData_LLama = (
    pd.read_pickle(MyData_LLama_filepath)
    [['body_Q1', 'body_Q2', 'entailment', 'U_Background_kn']]
    .rename(columns={'body_Q1': 'q1', 'body_Q2': 'q2'})
)
display(MyData_LLama)


In [None]:
# Apply the same clean-up to both question columns, in this order:
#   1. drop a leading "Possible Duplicate(s): ..." banner (two-part form first,
#      then the one-part form),
#   2. rewrite "C#" as "C" and remove every remaining '#', so the only '#'
#      left in a prompt belongs to the "### Answer:" cue that
#      get_response_index searches for (presumably the reason -- confirm),
#   3. replace newlines with spaces.
for column in ("q1", "q2"):
    cleaned = MyData_LLama[column]
    cleaned = cleaned.str.replace(r"^\s*Possible Duplicates?:\s+.*?\s{2,}.*?\s{2,}", "", regex=True)
    cleaned = cleaned.str.replace(r"^\s*Possible Duplicates?:\s+.*?\s{2,}", "", regex=True)
    cleaned = cleaned.str.replace("C#", "C", regex=False)
    cleaned = cleaned.str.replace(r"#", "", regex=True)
    cleaned = cleaned.str.replace(r"\n", " ", regex=True)
    MyData_LLama[column] = cleaned

In [None]:
class RQEDataset(torch.utils.data.Dataset):
    """Torch dataset that turns RQE rows into tokenized prompts.

    In training mode (``is_eval=False``) the prompt contains the gold label and
    ``labels`` masks everything before the response with -100. In evaluation
    mode the prompt stops at the answer cue and ``labels`` holds the first
    token id of the tokenized gold label.
    """

    def __init__(self, data, tokenizer, max_len, is_eval):
        """
        Args:
            data: DataFrame with 'q1', 'q2' and 'entailment' columns.
            tokenizer: tokenizer used for prompts and labels.
            max_len: length every prompt is padded/truncated to.
            is_eval: selects evaluation mode (see class docstring).
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_eval = is_eval

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row.

        Raises:
            ValueError: in training mode, when the response start cannot be
                located in the tokenized prompt.
        """
        row_data = self.data.iloc[index]
        prompt = generate_prompt_rqe(row_data, self.tokenizer, self.is_eval)
        prompt_encoding = self.tokenizer(prompt,
                                         max_length = self.max_len,
                                         padding = 'max_length',
                                         truncation = True,
                                         add_special_tokens = True,
                                         return_tensors = 'pt',
                                         )
        input_ids = prompt_encoding['input_ids'].squeeze()
        attention_mask = prompt_encoding['attention_mask'].squeeze()

        if not self.is_eval:
            # Scan plain Python ints rather than 0-d tensors.
            response_index = get_response_index(input_ids.tolist(), 'RQE')
            if not response_index:
                # Previously this only printed a message and then failed on the
                # unassigned `labels`; fail with a message that names the sample.
                raise ValueError(
                    f"response_index not found for sample {index}: the answer "
                    "marker is missing from the tokenized prompt"
                )
            # Only the response tokens contribute to the loss.
            labels = torch.cat(
                (torch.full((response_index,), -100), input_ids[response_index:])
            ).squeeze()
        else:
            labels = self.tokenizer(row_data['entailment'],
                                    add_special_tokens = False,
                                    truncation = True,
                                    max_length = 1,
                                    padding = 'max_length',
                                    return_tensors='pt',
                                    )
            labels = labels['input_ids'].squeeze()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
            }

In [None]:
class RQEDataModule(pl.LightningDataModule):
    """Splits the RQE frame into train/test parts and serves their dataloaders.

    The split is positional (no shuffling): the first
    ``split_ratio[0]`` fraction of rows is the training set, the rest is the
    test set.
    """

    def __init__(self, data, tokenizer, script_args):
        """
        Args:
            data: DataFrame of RQE samples.
            tokenizer: tokenizer handed to ``RQEDataset``.
            script_args: provides batch sizes, ``max_seq_length`` and ``split_ratio``.
        """
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.per_device_train_batch_size = script_args.per_device_train_batch_size
        self.per_device_eval_batch_size = script_args.per_device_eval_batch_size
        self.max_len = script_args.max_seq_length
        # Taken from the argument so setup() does not read a notebook global.
        self.split_ratio = script_args.split_ratio
        # Datasets are built eagerly so the dataloaders work without a Trainer.
        self.setup()

    def setup(self, stage=None):
        """Build ``self.train_data`` and ``self.test_data``."""
        len_tr = int(self.split_ratio[0] * self.data.shape[0])
        train_data, test_data = train_test_split(self.data,
                                                 train_size=len_tr,
                                                 shuffle=False,
                                                 )

        train_data = train_data.reset_index(drop=True)
        test_data = test_data.reset_index(drop=True)

        self.train_data = RQEDataset(train_data, self.tokenizer, self.max_len, is_eval=False)
        self.test_data = RQEDataset(test_data, self.tokenizer, self.max_len, is_eval=True)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return torch.utils.data.DataLoader(
            self.train_data,
            batch_size=self.per_device_train_batch_size,
            shuffle=True,
            num_workers=4,
        )

    def test_dataloader(self):
        """Sequential (order-preserving) loader over the test split."""
        return torch.utils.data.DataLoader(
            self.test_data,
            sampler = torch.utils.data.SequentialSampler(self.test_data,),
            batch_size= self.per_device_eval_batch_size,
            num_workers=4
        )

In [None]:
# Instantiate the model (this loads the base weights and tokenizer).
MyModel = RQEModel(script_args)
# NOTE: output_dir and 'logs' are concatenated without a path separator, so
# with the default output_dir the logs go to /content/SE-PQAlogs.
logger = TensorBoardLogger(script_args.output_dir + 'logs', name="RQE")

print(MyModel)
print("#"*60, "\n\t\t\t Model Configuration\n", "#"*60)
print(MyModel.model.config)

In [None]:
# Truncate over-long prompts from the left so the end of the prompt (the
# answer cue and, in training, the label) is what survives.
MyModel.tokenizer.truncation_side = 'left'
DataModule = RQEDataModule(
    MyData_LLama,
    MyModel.tokenizer,
    script_args
)
print("num train batches", len(DataModule.train_dataloader()))
print("num test batches", len(DataModule.test_dataloader()))

In [None]:
# Sanity check: show the first sample of one training batch.
for sample in DataModule.train_dataloader():
  print(MyModel.tokenizer.batch_decode(sample['input_ids'])[0])
  # The attention mask is a 0/1 vector, not token ids, so print it raw
  # instead of running it through the tokenizer's decoder.
  print(sample['attention_mask'][0].tolist())
  print(sample['labels'][0].tolist())
  break

In [None]:
# Sanity check: dump one test batch and decode its label token ids.
for sample in DataModule.test_dataloader():
  print(sample)
  print(MyModel.tokenizer.batch_decode(sample['labels']))
  break

In [None]:
# Lightning trainer configured from script_args.
# NOTE: as with the logger path, output_dir + 'Checkpoints' has no path
# separator (default: /content/SE-PQACheckpoints).
trainer = pl.Trainer(
    logger = logger,
    log_every_n_steps = script_args.logging_steps,
    max_epochs = script_args.max_epochs,
    accumulate_grad_batches = script_args.gradient_accumulation_steps,
    num_sanity_val_steps = script_args.num_sanity_val_steps,
    callbacks = [OverrideEpochStepCallback(), checkpoint_callback],
    default_root_dir= script_args.output_dir + 'Checkpoints',
    )

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/SE-PQAlogs

# Fine-tune the model; uncomment ckpt_path to resume from a saved checkpoint.
trainer.fit(
    MyModel,
    datamodule=DataModule,
    # ckpt_path = "/content/ReQuESTlogs/TG/version_0/checkpoints/epoch=0-step=1.ckpt"
)

In [None]:
!cp -r /content/SE-PQAlogs /content/drive/MyDrive/SE-PQA/SE-PQAlogs_RQE_WoUM

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/SE-PQAlogs/RQE

In [None]:
# Drop the references to the trainer and the training model, then collect
# garbage and release cached CUDA memory before loading the inference model.
import torch
import gc
# tokenizer=None
trainer=None
MyModel = None
# fModel = None
# BaseModel = None
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Load the un-adapted base model on device 0; the fine-tuned adapter is
# applied on top of it in a later cell.
BaseModel= AutoModelForCausalLM.from_pretrained(
    f"/content/drive/MyDrive/llama-2-7b-chat-hf",
    device_map={"": 0},
    offload_folder="offload",
    offload_state_dict = True,
    # load_in_8bit = True
    )

In [None]:
# Load the adapter saved during training (directory name = adapter name +
# batch counter) onto the base model, then fold the adapter weights into the
# base weights so inference runs on a plain model.
address = f"/content/drive/MyDrive/SE-PQA/LLAMA-RQE-WoUM/LLama-RQE-Wo10000"
print("\n Loading model from ", address, "\n")
# NOTE: `config` is not used in the cells visible here.
config = PeftConfig.from_pretrained(address)
fModel= PeftModel.from_pretrained(BaseModel, address, device_map={"": 0})
fModel = fModel.merge_and_unload()

In [None]:
# Tokenizer for inference: left padding with pad id 0, matching the setup
# used by RQEModel during training.
tokenizer = AutoTokenizer.from_pretrained(
    f'/content/drive/MyDrive/llama-2-7b-chat-hf',
    padding_side='left'
    )
tokenizer.pad_token_id = 0

# Mirror the tokenizer's special-token ids on the model config.
fModel.config.pad_token_id = tokenizer.pad_token_id
fModel.config.mask_token_id = tokenizer.mask_token_id
print(fModel)
print(fModel.config)
print("\n Model successfully loded from ", address, "\n")

In [None]:
# Retrieval output: one row per query, with list-valued columns holding the
# retrieved candidates (see the columns read in the following cells).
address3 = f"/content/drive/MyDrive/SE-PQA/retrival_results_mpnet-50.pkl"
retrival_results = pd.read_pickle(address3)

def clean_text_list(text_list):
    """Collapse whitespace runs in every text of a list.

    Each item is cleaned on its own, so the result always has exactly one
    entry per input text (the former join/split on '###' changed the list
    length whenever a text contained '###' or the list was empty, which
    would misalign it with the parallel candidate lists).

    Args:
        text_list: iterable of strings.

    Returns:
        A list of the same length; in each string every run of whitespace is
        replaced by a single space and leading/trailing whitespace is removed.
    """
    return [re.sub(r'\s+', ' ', text).strip() for text in text_list]

# Normalise whitespace in every retrieved question body.
retrival_results['Top n Similar Questions Body'] = retrival_results['Top n Similar Questions Body'].apply(clean_text_list)

In [None]:
len(retrival_results)

In [None]:
# Hold out 20% of the pairs with a shuffled, seeded split, then draw 148 of
# them as the evaluation queries.
# NOTE(review): RQEDataModule.setup splits the same pickle with shuffle=False
# (first 80% of rows = training set), whereas this split is shuffled, so the
# queries sampled here may include rows the adapter was trained on -- confirm.
MyData3 = pd.read_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500_T20_UK.pkl")
len_tr = int(0.8 * MyData3.shape[0])  # not used below
len_te = int(0.2 * MyData3.shape[0])
train_data, test_data = train_test_split(MyData3,
                                          test_size=len_te,
                                          random_state=42)
test_data
test_data = test_data.sample(n=148, random_state=42)
test_data.head(3)

In [None]:
# Pair every query with its top-m retrieved candidates: each query row is
# repeated m times and the per-query candidate lists are flattened in row
# order, so row i*m + j is (query i, candidate j).
# Assumes retrival_results has one row per query, in the same order as
# test_data, each with at least m candidates -- TODO confirm.
m=50
test_data = test_data[['body_Q1', 'U_Background_kn']]
RQETestDataNew = test_data.loc[test_data.index.repeat(m)].reset_index(drop=True)
similar_questions = [item for sublist in retrival_results['Top n Similar Questions Body'].values for item in sublist[:m]]
similar_answers = [item for sublist in retrival_results['Top n Candidate Answers'].values for item in sublist[:m]]

RQETestDataNew['q2'] = similar_questions
RQETestDataNew['CandidateAnswerBody'] = similar_answers
# Gold labels are unknown for retrieved pairs; the column is left empty.
RQETestDataNew['entailment'] = ''

display(RQETestDataNew.head(m+1))
display(f'Number of samples in new RQE test data = ' + str(len(RQETestDataNew)) + '</b>')

In [None]:
# Add the retrieval similarity score and the candidate question id of every
# pair, flattened in the same row order as the candidates above.
CosineScores = [item for sublist in retrival_results['Cosine similarities'].values for item in sublist[:m]]
q2_ids = [item for sublist in retrival_results['Top n Similar Questions (id)'].values for item in sublist[:m]]
RQETestDataNew['CosineSimilarities'] = CosineScores
RQETestDataNew['CandidateQuestionID'] = q2_ids

display(RQETestDataNew.head(5))
display(RQETestDataNew.shape)

In [None]:
# Stack the training pairs and the retrieved test pairs into one frame with a
# common schema; the training rows come first so a positional split can
# separate the two parts again.
train_data = train_data[['body_Q1', 'body_Q2', 'entailment', 'U_Background_kn']].copy()
display(train_data.head(5))
test_data = RQETestDataNew[['body_Q1', 'q2', 'entailment', 'U_Background_kn']].copy()
display(test_data.head(5))

train_data = train_data.rename(columns={'body_Q1': 'q1', 'body_Q2':'q2'})
test_data = test_data.rename(columns={'body_Q1': 'q1'})
# reset_index() (without drop) keeps each part's old index as an 'index' column.
MyData = pd.concat([train_data, test_data]).reset_index()
display(MyData)

In [None]:
# Persist the combined frame and read it back, so later cells can start from
# the pickle.
address6 = f"/content/drive/MyDrive/SE-PQA/MyQAData_MPNet50.pkl"
MyData.to_pickle(address6)
MyData = pd.read_pickle(address6)

In [None]:
class RQEDataModule(pl.LightningDataModule):
    """Data module for the combined train + retrieved-test frame.

    Redefines the earlier ``RQEDataModule``: the split is positional and uses
    an absolute number of leading training rows instead of a ratio.
    """

    def __init__(self, data, tokenizer, script_args, train_size=8000):
        """
        Args:
            data: DataFrame whose first ``train_size`` rows are training pairs
                and whose remaining rows are the test pairs.
            tokenizer: tokenizer handed to ``RQEDataset``.
            script_args: provides batch sizes and ``max_seq_length``.
            train_size: number of leading rows used for training. The default
                of 8000 is the value previously hard-coded here (presumably
                the size of the training part of the combined frame -- confirm).
        """
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.per_device_train_batch_size = script_args.per_device_train_batch_size
        self.per_device_eval_batch_size = script_args.per_device_eval_batch_size
        self.max_len = script_args.max_seq_length
        self.train_size = train_size
        # Datasets are built eagerly so the dataloaders work without a Trainer.
        self.setup()

    def setup(self, stage=None):
        """Build ``self.train_data`` and ``self.test_data``."""
        train_data, test_data = train_test_split(self.data,
                                                 train_size=self.train_size,
                                                 shuffle=False,
                                                 )

        train_data = train_data.reset_index(drop=True)
        test_data = test_data.reset_index(drop=True)

        self.train_data = RQEDataset(train_data, self.tokenizer, self.max_len, is_eval=False)
        self.test_data = RQEDataset(test_data, self.tokenizer, self.max_len, is_eval=True)

    def train_dataloader(self):
        """Shuffled loader over the training rows."""
        return torch.utils.data.DataLoader(
            self.train_data,
            batch_size=self.per_device_train_batch_size,
            shuffle=True,
            num_workers=4,
        )

    def test_dataloader(self):
        """Sequential loader over the test rows (order is preserved)."""
        return torch.utils.data.DataLoader(
            self.test_data,
            sampler = torch.utils.data.SequentialSampler(self.test_data,),
            batch_size= self.per_device_eval_batch_size,
            num_workers=4
        )

In [None]:
# Data module over the combined frame, using the inference tokenizer.
Data_RQE = RQEDataModule(
    MyData,
    tokenizer,
    script_args
)
print("num train batches", len(Data_RQE.train_dataloader()))
print("num test batches", len(Data_RQE.test_dataloader()))

In [None]:
# Sanity check: decode the first prompt of one test batch and the batch's labels.
for i in Data_RQE.test_dataloader():
  print(tokenizer.decode(i['input_ids'][0]))
  # print(tokenizer.batch_decode(i['input_ids']))
  print(tokenizer.batch_decode(i['labels']))
  break

In [None]:
# Predict an entailment label for every (query, retrieved candidate) pair.
# `results` collects one decoded string per test row, in dataloader order.
fModel.eval()
results = []

start_time = time.time()

with torch.no_grad():
    for batch in Data_RQE.test_dataloader():
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()

        # Greedy decoding: `temperature` only affects sampling, so it is not
        # passed together with do_sample=False.
        generated_txts_ids = fModel.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=script_args.max_new_tokens,
            do_sample=False,
        )
        # One device-to-host copy per batch, so the marker search below walks
        # plain Python ints instead of comparing CUDA scalars one at a time.
        generated_rows = generated_txts_ids.tolist()

        for i in range(input_ids.size(0)):
            single_generated_ids = generated_rows[i]

            # Decode only what follows the answer cue (index 0, i.e. the whole
            # sequence, when the cue is not found).
            response_start_idx = get_response_index(single_generated_ids, 'RQE')
            single_generated_txt = tokenizer.decode(
                single_generated_ids[response_start_idx:],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            # Drop the first character of the decoded response (presumably
            # the leading space -- confirm).
            results.append(single_generated_txt[1:])
end_time = time.time()

display(f"--- {end_time - start_time} seconds ---")
display(f"Number of results: {len(results)}")

In [None]:
# Re-display the timing and result count of the inference run above.
display(f"--- {end_time - start_time} seconds ---")
display(f"Number of results: {len(results)}")

In [None]:
# Save the predicted labels (one row per test pair) and read them back to
# check the file.
import pandas as pd
address7 = f"/content/drive/MyDrive/SE-PQA/results_MPNet_50_WoUM_10000.pkl"
pd.DataFrame(results, columns = ['predicted_label']).to_pickle(address7)

test_results_df = pd.read_pickle(address7)
print(test_results_df)

# Generator

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [None]:
# Inputs for the generator stage: RQE predictions without / with user modeling,
# the QA data without predictions, and the retrieval results.
predictions_wo_path = f"/content/drive/MyDrive/SE-PQA/results_MPNet_50_WoUM_10000.pkl"
predictions_w_path = f"/content/drive/MyDrive/SE-PQA/results_MPNet_50_UM_10000.pkl"
qa_data_path = f"/content/drive/MyDrive/SE-PQA/MyQAData_MPNet50.pkl"
retrieval_path = f"/content/drive/MyDrive/SE-PQA/retrival_results_mpnet-50.pkl"

MyData_wo = pd.read_pickle(predictions_wo_path)
MyData_w = pd.read_pickle(predictions_w_path)
MyData_WithoutPreds = pd.read_pickle(qa_data_path)
retrival_results = pd.read_pickle(retrieval_path)

In [None]:
# Rebuild the evaluated test questions and expand each one into `m` rows, one
# per retrieved candidate (question body, answer, cosine score, question id).
MyData3 = pd.read_pickle("/content/drive/MyDrive/SE-PQA/SE_PQA_Data_10000_cleaned_Len500_T20_UK.pkl")
len_tr = int(0.8 * MyData3.shape[0])
len_te = int(0.2 * MyData3.shape[0])
train_data, test_data = train_test_split(MyData3,
                                          test_size=len_te,
                                          random_state=42)
# NOTE(review): 148 presumably equals the number of rows in `retrival_results`;
# the column assignments below raise if the flattened lengths do not match.
test_data = test_data.sample(n=148, random_state=42)
# Keep the full columns (including the accepted answers) for later lookup.
test_data_answers = test_data.copy()

m = 50


def _flatten_top_m(column):
    """Concatenate the first `m` items of every per-query list in `retrival_results[column]`."""
    return [item for sublist in retrival_results[column].values for item in sublist[:m]]


test_data = test_data[['body_Q1', 'U_Background_kn']]
# Repeat every test question m times so it lines up with its m candidates.
RQETestDataNew = test_data.loc[test_data.index.repeat(m)].reset_index(drop=True)
similar_questions = _flatten_top_m('Top n Similar Questions Body')
similar_answers = _flatten_top_m('Top n Candidate Answers')

RQETestDataNew['q2'] = similar_questions
RQETestDataNew['CandidateAnswerBody'] = similar_answers
RQETestDataNew['entailment'] = ''

display(RQETestDataNew.head(m+1))
# The message previously ended with a stray '</b>' HTML fragment.
display(f'Number of samples in new RQE test data = {len(RQETestDataNew)}')

CosineScores = _flatten_top_m('Cosine similarities')
q2_ids = _flatten_top_m('Top n Similar Questions (id)')
RQETestDataNew['CosineSimilarities'] = CosineScores
RQETestDataNew['CandidateQuestionID'] = q2_ids

display(RQETestDataNew.head(5))
display(RQETestDataNew.shape)

In [None]:
# MyData_WithoutPreds = MyData_WithoutPreds[8000:]

In [None]:
# Put the two prediction columns side by side and report whether they are identical.
prediction_frames = [MyData_wo, MyData_w]
RQE6_vs_RQE7 = pd.concat(prediction_frames, axis = 1)
RQE6_vs_RQE7.columns = ['Without_UM', 'With_UM']

models_agree = MyData_wo.equals(MyData_w)
print('Do models perform the same: ', models_agree, "\n")

display(RQE6_vs_RQE7)
display(RQE6_vs_RQE7.shape)

In [None]:
# Attach both prediction columns to the expanded test frame (column-wise, aligned on the index).
rqe_rows = RQETestDataNew.reset_index(drop=True)
MyData_WithPreds = pd.concat([rqe_rows, RQE6_vs_RQE7], axis=1)
MyData_WithPreds.shape

In [None]:
# Preview the first three merged rows.
MyData_WithPreds.head(n=3)

In [None]:
def _rank_by_similarity(question_rows):
    """Order one question's candidate rows by descending cosine similarity."""
    return question_rows.sort_values(by='CosineSimilarities', ascending=False)


# Within every query question, rank its candidates from most to least similar.
rows_per_question = MyData_WithPreds.groupby(['body_Q1'], group_keys=False)
MyData_WithPreds = rows_per_question.apply(_rank_by_similarity)

display(MyData_WithPreds.head(5))
print(MyData_WithPreds.shape)

In [None]:
# Map each query question body to one accepted answer (the first per body)
# and attach it to every candidate row of that question.
answers_per_question = test_data_answers.groupby('body_Q1')['answer_body_Q1']
answer_mapping = answers_per_question.first()
MyData_WithPreds['AcceptedAnswer'] = MyData_WithPreds['body_Q1'].map(answer_mapping)

In [None]:
# Show rows whose with-user-modeling prediction is neither 'positive' nor 'negative'.
has_expected_label = MyData_WithPreds["With_UM"].isin(['positive', 'negative'])
MyData_WithPreds[~has_expected_label].head(5)

In [None]:
# Keep a single row per (query question, candidate question) pair: the last occurrence.
question_pair_key = ['body_Q1', 'q2']
MyData_WithPreds = MyData_WithPreds.drop_duplicates(subset=question_pair_key, keep='last')
MyData_WithPreds

In [None]:
# Number of entailed candidates kept per query question.
f = 3


def _top_entailed(prediction_column):
    """Return (rows predicted 'positive' in `prediction_column`, first `f` of them per question)."""
    entailed_rows = MyData_WithPreds[MyData_WithPreds[prediction_column] == 'positive']
    return entailed_rows, entailed_rows.groupby(['body_Q1']).head(f)


entailed_df_Without_UM, top3_entailed_Without_UM = _top_entailed('Without_UM')
display(top3_entailed_Without_UM.shape)
display(top3_entailed_Without_UM.head(2*f))

entailed_df_With_UM, top3_entailed_With_UM = _top_entailed('With_UM')
display(top3_entailed_With_UM.shape)
display(top3_entailed_With_UM.head(2*f))

In [None]:
import numpy as np
import re
import pandas as pd

In [None]:
!pip install -q rouge-score bert-score nltk transformers torch
!pip install -q sacrebleu
!pip install -q sentence-transformers

from bs4 import BeautifulSoup
import sacrebleu
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

In [None]:
import nltk
import torch
import bert_score
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

# Fetch the NLTK corpora downloaded by this notebook before computing METEOR.
for nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
!pip install -q unbabel-comet

from comet import download_model, load_from_checkpoint

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
## LLAMA2
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain import LLMChain, HuggingFacePipeline, PromptTemplate


In [None]:
!pip install -q langchain-community langchain-core
!pip install -q --upgrade langchain

from langchain import PromptTemplate

In [None]:
# Prompt used to ask an LLM to synthesize one answer from up to three candidate
# answers. `{i}` is a running index printed in the banner line of each prompt.

template_WoUM = """********************{i}********************
You are an AI language model tasked with synthesizing an accurate, complete, and well-structured answer based solely on the provided expert-written answers. Follow these strict guidelines:
Analyze each provided answer carefully.
Extract relevant words, phrases, sentences, or subsequences that contribute to answering the given question.
Synthesize a comprehensive, continuous, and well-structured response without any headings, subheadings, or explicit references to the original answers (e.g., do not say "as stated in Answer 1").
Rephrase extracted content where necessary to align with the exact requirements of the question. If an answer provides a solution to a slightly different but related problem, adapt the phrasing while preserving factual accuracy.
Incorporate all relevant information from the answers, ensuring that multiple valid solutions, perspectives, or explanations are included where applicable. No relevant information should be omitted.
Remove non-relevant parts that do not contribute to answering the question.
Do not introduce any external information beyond what is contained in the provided answers. If an answer is not covered in the provided content, do not generate additional details from external knowledge.
If none of the provided answers sufficiently address the question, clearly state: "The question could not be answered based on the available context."
Prioritize precision over recall, ensuring that responses are accurate and directly relevant to the question. However, the answer should also be as complete as possible while maintaining clarity and conciseness.
Now, based on these instructions, analyze the following question and answers, then generate the best possible response.

### Question: {query}
### Answer1: {A1}
### Answer2: {A2}
### Answer3: {A3}
### Answer:

"""

# Built once and reused by every call. Every placeholder of the template is
# declared, including `i`, so LangChain versions that validate
# `input_variables` against the template accept it.
prompt_temp_WoUM = PromptTemplate(
    template=template_WoUM,
    input_variables=["i", "query", "A1", "A2", "A3"]
)


def generate_summary(i, query, a1, a2, a3):
    """Fill the synthesis prompt for one question.

    Despite its name, this returns the formatted prompt text, not a model-generated summary.

    Args:
        i: Running index shown in the banner line of the prompt.
        query: Text of the question to answer.
        a1, a2, a3: Candidate answer texts; callers pass "" for missing ones.

    Returns:
        The prompt string with all placeholders substituted.
    """
    summary = prompt_temp_WoUM.format(i = i, query = query, A1 = a1, A2 = a2, A3 = a3)

    return summary

In [None]:
# Pipeline variant to export: 'wo' = without user-modeling, 'w' = with user-modeling.
# The value is compared against 'wo' in the next cell and is embedded in the
# names of the prompt/answer files written further below.
state = 'w'
# state = 'wo'

In [None]:
# Select the top-entailed candidate frame matching `state`. An unrecognised
# value is rejected instead of silently falling back to the with-user-modeling frame.
if state == 'wo':
    display('State : without user-modeling')
    df = top3_entailed_Without_UM.copy()
elif state == 'w':
    display('State : with user-modeling')
    df = top3_entailed_With_UM.copy()
else:
    raise ValueError(f"Unknown state {state!r}; expected 'wo' or 'w'")

In [None]:
from langchain.memory import ConversationBufferMemory

In [None]:
# For every query question, build the synthesis prompt from its (up to three)
# candidate answers and append the prompt and the gold answer to two text files.
generated_answers = []
retrieval_time = 0

prompts_path = f"/content/drive/MyDrive/SE-PQA/NewPrompts_{state}_{f}.txt"
answers_path = f"/content/drive/MyDrive/SE-PQA/Answers_{state}_{f}.txt"

# Both files are opened once, in append mode (re-running this cell appends
# again), with an explicit UTF-8 encoding to match the readers defined below.
# NOTE(review): gold answers are written without any newline, separated only
# by a run of dashes, while read_lines1/read_lines2 split by line -- confirm
# how Answers_*.txt is turned into one answer per line before evaluation.
with open(prompts_path, "a", encoding="utf-8") as prompts_file, \
        open(answers_path, "a", encoding="utf-8") as answers_file:
    for i, (name, group) in enumerate(df.groupby(['body_Q1'])):
        candidate_answers = group['CandidateAnswerBody'].tolist()
        # Pad with empty strings so questions with fewer than three candidates still fill the template.
        a1, a2, a3 = (candidate_answers + ["", ""])[:3]

        # Flatten newlines on derived Series instead of writing back into the
        # groupby slice (which raised SettingWithCopyWarning).
        Gold = group['AcceptedAnswer'].str.replace("\n", " ", regex=False).iloc[0]
        query = group['body_Q1'].str.replace("\n", " ", regex=False).iloc[0]

        generated_answer = generate_summary(i, query, a1, a2, a3)
        prompts_file.write(generated_answer)
        answers_file.write(Gold + "------------------------")


# Quantitative Analysis

In [None]:
def clean_text1(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def clean_text2(text):
    """Normalize text for the overlap-based metrics.

    Steps, in order: replace markdown emphasis markers (`**`, `***`) and
    backticks with spaces, lowercase, strip `<pre>`/`<code>` tags, map curly
    quotes to `'`, replace every character outside the allowed set
    (a-z, 0-9, whitespace and ``/:.,!?'()*-``) with a space, then collapse
    whitespace and trim.

    Closing `</pre>` / `</code>` tags are now removed as well; previously only
    the opening tags were, so a stray `/pre` or `/code` token stayed in the
    text that was scored.

    Args:
        text: Raw answer text.

    Returns:
        The normalized, lowercased string.
    """
    text = re.sub(r"(\*{2,3}|`)", " ", text)
    text = text.lower()
    # Opening tags are deleted outright (unchanged behavior).
    text = re.sub(r"<(?:pre|code)>", "", text)
    # Closing tags become a space so neighbouring tokens stay separated,
    # exactly as before, but without the '/pre' or '/code' residue.
    text = re.sub(r"</(?:pre|code)>", " ", text)
    text = re.sub(r"[“”‘’]", "'", text)
    text = re.sub(r"[^a-z0-9\s/:.,!?'()*-]", " ", text)
    return re.sub(r"\s+", " ", text).strip()


def read_lines1(file_path):
    """Read a UTF-8 text file and return its lines, each stripped and passed through `clean_text1`."""
    cleaned_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned_lines.append(clean_text1(raw_line.strip()))
    return cleaned_lines


def read_lines2(file_path):
    """Read a UTF-8 text file and return its lines, each stripped and passed through `clean_text2`."""
    cleaned_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned_lines.append(clean_text2(raw_line.strip()))
    return cleaned_lines


def compute_rouge(references, candidates):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L (with stemming) over paired texts.

    Args:
        references: Reference strings.
        candidates: Candidate strings, paired positionally with `references`.

    Returns:
        Dict keyed by 'rouge1', 'rouge2', 'rougeL'; each value is a dict with
        the mean 'precision', 'recall' and 'f1' over all pairs.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    pair_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]

    avg_scores = {}
    for metric in metric_names:
        per_pair = [pair[metric] for pair in pair_scores]
        pair_count = len(per_pair)
        avg_scores[metric] = {
            'precision': sum(s.precision for s in per_pair) / pair_count,
            'recall': sum(s.recall for s in per_pair) / pair_count,
            'f1': sum(s.fmeasure for s in per_pair) / pair_count,
        }
    return avg_scores


def compute_bleu(references, candidates):
    """Average smoothed sentence-level BLEU-1 to BLEU-4 over paired texts.

    Texts are tokenized on whitespace. BLEU-n uses uniform weights over the
    first n n-gram orders and zero weight for the rest.

    Returns:
        Dict with keys 'bleu-1' .. 'bleu-4' mapping to the mean score.
    """
    smoother = SmoothingFunction().method1
    orders = range(1, 5)
    per_order = {f"bleu-{order}": [] for order in orders}

    for ref, cand in zip(references, candidates):
        reference_token_lists = [ref.split()]
        hypothesis_tokens = cand.split()
        for order in orders:
            order_weights = [1/order]*order + [0]*(4-order)
            value = sentence_bleu(
                reference_token_lists,
                hypothesis_tokens,
                weights=order_weights,
                smoothing_function=smoother,
            )
            per_order[f"bleu-{order}"].append(value)

    return {key: sum(values) / len(values) for key, values in per_order.items()}


def compute_bert_score(references, candidates):
    """Return the mean BERTScore precision, recall and F1 of `candidates` against `references` (English)."""
    precision, recall, f_measure = bert_score.score(candidates, references, lang="en")
    summary = {}
    for label, values in (("precision", precision), ("recall", recall), ("f1", f_measure)):
        summary[label] = values.mean().item()
    return summary


def compute_meteor(references, candidates):
    """Return the mean METEOR score over paired, whitespace-tokenized texts."""
    scores = []
    for ref, cand in zip(references, candidates):
        reference_token_lists = [ref.split()]
        scores.append(meteor_score(reference_token_lists, cand.split()))
    return sum(scores) / len(scores)


def compute_perplexity(sentences):
    """Return the mean per-sentence GPT-2 perplexity.

    Each sentence is tokenized (truncated to 1024 tokens), scored with the
    pretrained "gpt2" language model, and its perplexity is exp(loss).

    Sentences that tokenize to fewer than two tokens are skipped: the LM loss
    needs at least one next-token target, so a one-token input has no finite
    loss. The previous emptiness check, `len(input_ids.squeeze())`, also
    raised TypeError for one-token inputs because `squeeze()` yields a 0-d
    tensor there.

    Args:
        sentences: Iterable of strings.

    Returns:
        Mean perplexity over the scored sentences, or float("inf") when none
        could be scored.
    """
    model_name = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model.eval()
    perplexities = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=1024)
        token_count = inputs["input_ids"].size(1)
        if token_count < 2:
            continue
        inputs["position_ids"] = torch.arange(token_count).unsqueeze(0)
        with torch.no_grad():
            loss = model(**inputs, labels=inputs["input_ids"]).loss
        perplexities.append(torch.exp(loss).item())
    return sum(perplexities) / len(perplexities) if perplexities else float("inf")



def compute_ter(references, candidates):
    """Return the mean sentence-level TER (sacrebleu) of each candidate against its single reference."""
    scores = []
    for ref, cand in zip(references, candidates):
        scores.append(sacrebleu.sentence_ter(cand, [ref]).score)
    return sum(scores) / len(scores)


def compute_chrf(references, candidates):
    """Return the mean sentence-level chrF (sacrebleu) of each candidate against its single reference."""
    scores = []
    for ref, cand in zip(references, candidates):
        scores.append(sacrebleu.sentence_chrf(cand, [ref]).score)
    return sum(scores) / len(scores)


def compute_sentence_bert(references, candidates):
    """Return the mean cosine similarity between paired sentence embeddings.

    Both lists are encoded with the "all-mpnet-base-v2" SentenceTransformer;
    the similarity of a pair is one minus its cosine distance.
    """
    encoder = SentenceTransformer("all-mpnet-base-v2")
    reference_vectors = encoder.encode(references, convert_to_tensor=True)
    candidate_vectors = encoder.encode(candidates, convert_to_tensor=True)

    similarities = []
    for reference_vector, candidate_vector in zip(reference_vectors, candidate_vectors):
        distance = cosine(reference_vector.cpu(), candidate_vector.cpu())
        similarities.append(1 - distance)
    return sum(similarities) / len(similarities)


def compute_comet(references, candidates):
    """Return the mean COMET score ("Unbabel/wmt22-comet-da") over paired texts.

    NOTE(review): the reference text is supplied as both `src` and `ref` for
    every sample -- confirm this is the intended setup, since no separate
    source text is passed.
    """
    checkpoint_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(checkpoint_path)

    samples = []
    for ref, cand in zip(references, candidates):
        samples.append({"src": ref, "mt": cand, "ref": ref})

    prediction = comet_model.predict(samples, batch_size=8)
    return sum(prediction["scores"]) / len(prediction["scores"])

In [None]:
def main(file1, file2):
    """Score generated answers against gold answers and print every metric.

    Both files are read line by line and paired positionally. Perplexity is
    computed on the lightly cleaned candidates (`read_lines1`); all other
    metrics use the normalized text from `read_lines2`.

    Args:
        file1: Path to the reference (gold answer) file.
        file2: Path to the candidate (generated answer) file.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
            Most metrics pair texts with `zip`, which would silently drop the
            surplus lines, and `bert_score.score` rejects unequal lengths only
            after several expensive metrics have already run.
    """
    references = read_lines1(file1)
    candidates = read_lines1(file2)
    if len(references) != len(candidates):
        raise ValueError(
            f"Line count mismatch: {len(references)} references in {file1!r} "
            f"vs {len(candidates)} candidates in {file2!r}"
        )
    perplexity = compute_perplexity(candidates)

    # Re-read with the heavier normalization used by the overlap/embedding metrics.
    references = read_lines2(file1)
    candidates = read_lines2(file2)

    rouge_scores = compute_rouge(references, candidates)
    bleu_scores = compute_bleu(references, candidates)
    bert_scores = compute_bert_score(references, candidates)
    meteor = compute_meteor(references, candidates)
    ter_score = compute_ter(references, candidates)
    chrf_score = compute_chrf(references, candidates)
    comet_score = compute_comet(references, candidates)
    sbert_score = compute_sentence_bert(references, candidates)

    print(f"TER Score: {ter_score:.4f}")
    print(f"chrF Score: {chrf_score:.4f}")
    print(f"COMET Score: {comet_score:.4f}")
    print(f"SBERT Cosine Similarity: {sbert_score:.4f}")
    print("ROUGE Scores:", rouge_scores)
    print("BLEU Scores:", bleu_scores)
    print("BERTScore:", bert_scores)
    print("METEOR Score:", meteor)
    print("Perplexity:", perplexity)

In [None]:
import logging

# Only show errors from transformers' model-loading logger during evaluation.
modeling_logger = logging.getLogger("transformers.modeling_utils")
modeling_logger.setLevel(logging.ERROR)

In [None]:
# Evaluate both variants: generated answers with and without user modeling,
# each against its own gold-answer file.
if __name__ == "__main__":

    UM_path = "/content/drive/MyDrive/SE-PQA/SE-PQA-UM-3.txt"
    WoUM_path = "/content/drive/MyDrive/SE-PQA/SE-PQA-WoUM-3.txt"
    AccA_path_UM = "/content/drive/MyDrive/SE-PQA/Answers_w_3.txt"
    AccA_path_WoUM = "/content/drive/MyDrive/SE-PQA/Answers_wo_3.txt"

    evaluation_pairs = (
        (AccA_path_UM, UM_path),
        (AccA_path_WoUM, WoUM_path),
    )
    for gold_path, generated_path in evaluation_pairs:
        main(gold_path, generated_path)