In [1]:
import pandas as pd
import numpy as np
import openai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_rand_score
from transformers import DistilBertTokenizer, DistilBertModel
import time
import numpy as np
from scipy.optimize import linear_sum_assignment as hungarian
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score, adjusted_mutual_info_score

In [2]:
cluster_nmi = normalized_mutual_info_score
def cluster_acc(y_true, y_pred):
    y_true = y_true.astype(np.int64)
    assert y_pred.size == y_true.size
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
  
    row_ind, col_ind = hungarian(w.max() - w)
    return sum([w[i, j] for i, j in zip(row_ind, col_ind)]) * 1.0 / y_pred.size

In [46]:
SubSets=[]

# Load the dataset wich contains 250 articles
data50 = pd.read_csv('bbc_news_subset_50artcl.csv')
data100 = pd.read_csv('bbc_news_subset_100artcl.csv')
texts50 = data50['text'].tolist()
texts100 = data100['text'].tolist()
SubSets.append(texts50)
SubSets.append(texts100)

In [47]:
len(SubSets)
for subset in SubSets :
    print(len(subset))

50
100


In [5]:
# Set up OpenAI GPT and BERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [6]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [None]:
 from llama_cpp import Llama

# Initialize the LLaMA model
llm = Llama(model_path="./llama-2-7b-chat.Q2_K.gguf", verbose=False,n_ctx=2048)
def Llama_generate_keyphrases(text):
   
    # Encode the text to tokens and truncate if necessary
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    tokenized_text = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Join tokens back to string if needed or use the tokenized text directly
    truncated_text = tokenizer.convert_tokens_to_string(tokenized_text)

    # Define the interaction with the LLaMA model
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "Generate keyphrases that describe the intent of this text."},
            {"role": "user", "content": truncated_text}
        ],
        max_tokens=50  # Adjust the max_tokens if needed
    )

    # Extracting the generated keyphrases from the response
    keyphrases = response['choices'][0]['message']['content'].strip()
    return keyphrases

In [18]:
def encode_text(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy()  # CLS token representation

In [54]:
# Vectorization for simple clustering
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)# Try to test with others values of max_features

X_simple50 = vectorizer.fit_transform(texts50)
X_simple100= vectorizer.fit_transform(texts100)

In [52]:
X_simple50

<50x4365 sparse matrix of type '<class 'numpy.float64'>'
	with 8052 stored elements in Compressed Sparse Row format>

In [30]:
# Clustering
kmeans_simple = KMeans(n_clusters=5, random_state=42)
kmeans_enhanced = KMeans(n_clusters=5, random_state=42)

In [36]:
# Enhanced vectorization using LLM keyphrases
enhanced_vectors50 = []
keyphrase_cache=[]
cpt=1
total=50
for text in texts50:
    keyphrase = Llama_generate_keyphrases(text)
    keyphrase_cache.append(keyphrase)
    text_vector = encode_text(text)
    keyphrase_vector = encode_text(keyphrase)
    concatenated_vector = np.concatenate((text_vector, keyphrase_vector), axis=1)
    enhanced_vectors50.append(concatenated_vector.squeeze())
    print(f"{cpt}/{total}", end="\r")
    cpt=cpt+1

# Convert list to array
enhanced_vectors50 = np.array(enhanced_vectors50)

50/50

In [38]:
np.save('Llama_enhanced_vectors50.npy', enhanced_vectors50)

In [39]:
simple_labels50 = kmeans_simple.fit_predict(X_simple50)
enhanced_labels50 = kmeans_enhanced.fit_predict(enhanced_vectors50)



In [42]:
from sklearn.preprocessing import LabelEncoder
# Evaluation with random_state=42
nmi_simple50 = normalized_mutual_info_score(data50['category'].values, simple_labels50)
nmi_enhanced50 = normalized_mutual_info_score(data50['category'].values, enhanced_labels50)
print(f"Simple Clustering (50 articles) - NMI: {nmi_simple50}")
print(f"Enhanced Clustering (50 articles) - NMI: {nmi_enhanced50}")

print("---------------------------------------------------------------------------------")

rand_score_simple50 = adjusted_rand_score(data50['category'].values, simple_labels50)
rand_score_enhanced50 = adjusted_rand_score(data50['category'].values, enhanced_labels50)
print(f"Simple Clustering (50 articles) - rand_score: {rand_score_simple50}")
print(f"Enhanced Clustering (50 articles) - rand_score: {rand_score_enhanced50}")

print("---------------------------------------------------------------------------------")

# Encode category labels
label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(data50['category'].values)

acc_simple50 = cluster_acc(np.array(y_true), np.array(simple_labels50))
acc_enhanced50 = cluster_acc(np.array(y_true), np.array(enhanced_labels50))
print(f"Simple Clustering (50 articles) - acc: {acc_simple50}")
print(f"Enhanced Clustering (50 articles) - acc: {acc_enhanced50}")

Simple Clustering (50 articles) - NMI: 0.43672245047005154
Enhanced Clustering (50 articles) - NMI: 0.5883994567867266
---------------------------------------------------------------------------------
Simple Clustering (50 articles) - rand_score: 0.26850612581076905
Enhanced Clustering (50 articles) - rand_score: 0.39307190214318755
---------------------------------------------------------------------------------
Simple Clustering (50 articles) - acc: 0.6
Enhanced Clustering (50 articles) - acc: 0.7


In [49]:
# Enhanced vectorization using LLM keyphrases
enhanced_vectors100 = []
keyphrase_cache100=[]
cpt=1
total=100
for text in texts100:
    keyphrase = Llama_generate_keyphrases(text)
    keyphrase_cache100.append(keyphrase)
    text_vector = encode_text(text)
    keyphrase_vector = encode_text(keyphrase)
    concatenated_vector = np.concatenate((text_vector, keyphrase_vector), axis=1)
    enhanced_vectors100.append(concatenated_vector.squeeze())
    print(f"{cpt}/{total}", end="\r")
    cpt=cpt+1

# Convert list to array
enhanced_vectors100 = np.array(enhanced_vectors100)
np.save('Llama_enhanced_vectors100.npy', enhanced_vectors100)

100/100

In [55]:
simple_labels100 = kmeans_simple.fit_predict(X_simple100)
enhanced_labels100 = kmeans_enhanced.fit_predict(enhanced_vectors100)



In [56]:
# Evaluation with random_state=5
nmi_simple100 = normalized_mutual_info_score(data100['category'].values, simple_labels100)
nmi_enhanced100 = normalized_mutual_info_score(data100['category'].values, enhanced_labels100)
print(f"Simple Clustering (100 articles) - NMI: {nmi_simple100}")
print(f"Enhanced Clustering (100 articles) - NMI: {nmi_enhanced100}")
print("---------------------------------------------------------------------------------")

rand_score_simple100 = adjusted_rand_score(data100['category'].values, simple_labels100)
rand_score_enhanced100 = adjusted_rand_score(data100['category'].values, enhanced_labels100)
print(f"Simple Clustering (100 articles) - rand_score: {rand_score_simple100}")
print(f"Enhanced Clustering (100 articles) - rand_score: {rand_score_enhanced100}")

print("---------------------------------------------------------------------------------")

# Encode category labels
label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(data100['category'].values)

acc_simple100 = cluster_acc(np.array(y_true), np.array(simple_labels100))
acc_enhanced100 = cluster_acc(np.array(y_true), np.array(enhanced_labels100))
print(f"Simple Clustering (100 articles) - acc: {acc_simple100}")
print(f"Enhanced Clustering (100 articles) - acc: {acc_enhanced100}")

Simple Clustering (100 articles) - NMI: 0.6096940422930666
Enhanced Clustering (100 articles) - NMI: 0.8184489629920321
---------------------------------------------------------------------------------
Simple Clustering (100 articles) - rand_score: 0.518511042283018
Enhanced Clustering (100 articles) - rand_score: 0.7362427647366333
---------------------------------------------------------------------------------
Simple Clustering (100 articles) - acc: 0.77
Enhanced Clustering (100 articles) - acc: 0.86
