In [12]:
import pandas as pd
import numpy as np
import openai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_rand_score
from transformers import DistilBertTokenizer, DistilBertModel
import time
import numpy as np
from scipy.optimize import linear_sum_assignment as hungarian
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score, adjusted_mutual_info_score

In [13]:
def cluster_acc(y_true, y_pred):
    """Clustering accuracy under the best one-to-one cluster/label matching.

    Cluster ids are arbitrary, so a contingency table between predicted ids
    and true labels is built and the assignment that maximizes the number of
    agreeing samples is found with ``hungarian``
    (``scipy.optimize.linear_sum_assignment``).

    Parameters
    ----------
    y_true : array-like of int
        Ground-truth labels, expected to be non-negative integers.
    y_pred : array-like of int
        Predicted cluster ids, expected to be non-negative integers.
        Lists and float-typed arrays holding integral values are accepted;
        both inputs are cast to ``np.int64``.

    Returns
    -------
    float
        Fraction of samples whose cluster is matched to their true label.

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    ValueError
        If the inputs are empty (accuracy is undefined).
    """
    y_true = np.asarray(y_true).astype(np.int64).ravel()
    y_pred = np.asarray(y_pred).astype(np.int64).ravel()
    assert y_pred.size == y_true.size
    if y_pred.size == 0:
        raise ValueError("cluster_acc requires at least one sample")

    # Square contingency table: w[p, t] = number of samples with predicted
    # id p and true label t.
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    # np.add.at accumulates correctly when the same (p, t) pair repeats.
    np.add.at(w, (y_pred, y_true), 1)

    # The solver minimizes cost, so turn counts into costs via (max - count).
    row_ind, col_ind = hungarian(w.max() - w)
    return w[row_ind, col_ind].sum() * 1.0 / y_pred.size

In [None]:
# Collects the article-text lists that are loaded below.
SubSets=[]

# Load the BBC news subset (50 articles, per the file name) and keep the
# raw article texts as a plain Python list.
data50 = pd.read_csv('bbc_news_subset_50artcl.csv')
texts50 = data50['text'].tolist()
SubSets.append(texts50)

In [15]:
# NOTE: this bare expression is not the last statement of the cell, so its
# value is not displayed; only the per-subset sizes printed below appear.
len(SubSets)
# Print the number of articles in each loaded subset.
for subset in SubSets :
    print(len(subset))

50


In [16]:
# Set up the DistilBERT tokenizer and encoder (distilbert-base-uncased) that
# encode_text uses to embed texts. No OpenAI model is loaded in this cell.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [23]:
from llama_cpp import Llama

# Load the local Mistral 7B GGUF file through llama_cpp's Llama wrapper,
# with a 2048-token context window (n_ctx) and library logging silenced.
Mistral_llm = Llama(model_path="./mistral-7b-v0.1.Q2_K.gguf", verbose=False,n_ctx=2048)
# def Llama_generate_keyphrases(text):
   
#     # Encode the text to tokens and truncate if necessary
#     inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
#     tokenized_text = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

#     # Join tokens back to string if needed or use the tokenized text directly
#     truncated_text = tokenizer.convert_tokens_to_string(tokenized_text)

#     # Define the interaction with the LLaMA model
#     response = llm.create_chat_completion(
#         messages=[
#             {"role": "system", "content": "Generate keyphrases that describe the intent of this text."},
#             {"role": "user", "content": truncated_text}
#         ],
#         max_tokens=50  # Adjust the max_tokens if needed
#     )

#     # Extracting the generated keyphrases from the response
#     keyphrases = response['choices'][0]['message']['content'].strip()
#     return keyphrases
    
def Mistral_generate_keyphrases(text, max_tokens=None):
    """Ask the local Mistral model for keyphrases describing the intent of ``text``.

    Parameters
    ----------
    text : str
        Text sent verbatim as the user message. It is not truncated here.
        NOTE(review): inputs longer than the model's context window
        (``n_ctx=2048`` where ``Mistral_llm`` is created) may be rejected by
        llama_cpp -- confirm and truncate upstream if needed.
    max_tokens : int, optional
        Upper bound on the number of generated tokens. When ``None`` (the
        default) the argument is not forwarded, so the library default
        applies exactly as before this parameter existed.

    Returns
    -------
    str
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    # Only forward max_tokens when the caller asked for a limit, so the
    # default call is identical to the original one.
    completion_kwargs = {}
    if max_tokens is not None:
        completion_kwargs["max_tokens"] = max_tokens

    # Chat-style request to the Mistral model: fixed instruction + the text.
    response = Mistral_llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "Generate keyphrases that describe the intent of this text."},
            {"role": "user", "content": text}
        ],
        **completion_kwargs
    )
    # Extract the generated keyphrases from the first choice of the response.
    keyphrases = response['choices'][0]['message']['content'].strip()
    return keyphrases

In [18]:
def encode_text(text):
    """Embed ``text`` with the module-level DistilBERT ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str or list of str
        Input passed to the tokenizer; it is truncated to at most 512 tokens
        and padded.

    Returns
    -------
    numpy.ndarray
        The hidden state at sequence position 0 (the [CLS] token) for each
        input, i.e. ``last_hidden_state[:, 0, :]`` as a NumPy array.
    """
    # Local import keeps this cell self-contained; torch is already required
    # by the transformers model used here.
    import torch

    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    # Pure inference: disable autograd so no computation graph is built and
    # kept alive for every encoded text.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()  # CLS token representation

In [19]:
# Vectorization for simple clustering
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)# Try to test with others values of max_features

X_simple50 = vectorizer.fit_transform(texts50)

In [20]:
# Display the sparse TF-IDF matrix summary (shape and number of stored elements).
X_simple50

<50x4365 sparse matrix of type '<class 'numpy.float64'>'
	with 8052 stored elements in Compressed Sparse Row format>

In [21]:
# Clustering: one KMeans instance per representation (TF-IDF vs. enhanced
# embeddings), both with 5 clusters and the same seed.
# NOTE(review): n_init is left at the library default, which presumably
# differs across scikit-learn versions -- confirm, and pin it explicitly if
# the recorded scores must be reproducible.
kmeans_simple = KMeans(n_clusters=5, random_state=42)
kmeans_enhanced = KMeans(n_clusters=5, random_state=42)

In [None]:
# Enhanced vectorization using LLM keyphrases: every article is represented
# by the embedding of its full text concatenated with the embedding of the
# keyphrases the LLM generated for it.
# NOTE: the same loop is repeated further down for the 100-article subset;
# a shared helper taking the list of texts would remove the duplication.
enhanced_vectors50 = []
keyphrase_cache = []  # generated keyphrases, in article order
total = len(texts50)  # derived from the data so the progress display cannot drift
for cpt, text in enumerate(texts50, start=1):
    keyphrase = Mistral_generate_keyphrases(text)
    keyphrase_cache.append(keyphrase)
    text_vector = encode_text(text)
    keyphrase_vector = encode_text(keyphrase)
    # Join the two embeddings along the feature axis, then drop the
    # leading batch axis so each article contributes one flat vector.
    concatenated_vector = np.concatenate((text_vector, keyphrase_vector), axis=1)
    enhanced_vectors50.append(concatenated_vector.squeeze())
    # "\r" rewrites the same output line to show a running counter.
    print(f"{cpt}/{total}", end="\r")

# Convert list to array
enhanced_vectors50 = np.array(enhanced_vectors50)

In [25]:
# Persist the enhanced embeddings so the slow LLM pass need not be repeated.
np.save('Mistral_enhanced_vectors50.npy', enhanced_vectors50)

In [26]:
# Fit each KMeans model on its own representation and keep the cluster id
# assigned to every article.
simple_labels50 = kmeans_simple.fit_predict(X_simple50)
enhanced_labels50 = kmeans_enhanced.fit_predict(enhanced_vectors50)



In [27]:
from sklearn.preprocessing import LabelEncoder

# Evaluation of both clusterings (KMeans seeded with random_state=42)
# against the ground-truth article categories.
true_categories50 = data50['category'].values
separator = "---------------------------------------------------------------------------------"

# Normalized mutual information
nmi_simple50 = normalized_mutual_info_score(true_categories50, simple_labels50)
nmi_enhanced50 = normalized_mutual_info_score(true_categories50, enhanced_labels50)
print(f"Simple Clustering (50 articles) - NMI: {nmi_simple50}")
print(f"Enhanced Clustering (50 articles) - NMI: {nmi_enhanced50}")

print(separator)

# Adjusted Rand index
rand_score_simple50 = adjusted_rand_score(true_categories50, simple_labels50)
rand_score_enhanced50 = adjusted_rand_score(true_categories50, enhanced_labels50)
print(f"Simple Clustering (50 articles) - rand_score: {rand_score_simple50}")
print(f"Enhanced Clustering (50 articles) - rand_score: {rand_score_enhanced50}")

print(separator)

# Clustering accuracy needs integer labels, so encode the category names first.
label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(true_categories50)

acc_simple50 = cluster_acc(y_true, simple_labels50)
acc_enhanced50 = cluster_acc(y_true, enhanced_labels50)
print(f"Simple Clustering (50 articles) - acc: {acc_simple50}")
print(f"Enhanced Clustering (50 articles) - acc: {acc_enhanced50}")

Simple Clustering (50 articles) - NMI: 0.43672245047005154
Enhanced Clustering (50 articles) - NMI: 0.6055765313391491
---------------------------------------------------------------------------------
Simple Clustering (50 articles) - rand_score: 0.26850612581076905
Enhanced Clustering (50 articles) - rand_score: 0.40334490684141033
---------------------------------------------------------------------------------
Simple Clustering (50 articles) - acc: 0.6
Enhanced Clustering (50 articles) - acc: 0.64


In [29]:
# NOTE(review): this rebinds SubSets to a new empty list, so the 50-article
# texts appended earlier are no longer in it -- confirm that is intended.
SubSets=[]

# Load the BBC news subset (100 articles, per the file name) and keep the
# raw article texts as a plain Python list.
data100 = pd.read_csv('bbc_news_subset_100artcl.csv')
texts100 = data100['text'].tolist()
SubSets.append(texts100)

In [30]:
# Refit the shared TF-IDF vectorizer on the 100-article subset. This replaces
# the vocabulary learned from texts50, so the vectorizer can no longer
# transform new text into X_simple50's feature space.
X_simple100 = vectorizer.fit_transform(texts100)

In [None]:
# Enhanced vectorization using LLM keyphrases: every article is represented
# by the embedding of its full text concatenated with the embedding of the
# keyphrases the LLM generated for it.
enhanced_vectors100 = []
# NOTE: this rebinds keyphrase_cache, so keyphrases generated for an earlier
# subset are no longer reachable through this name.
keyphrase_cache = []  # generated keyphrases, in article order
total = len(texts100)  # derived from the data so the progress display cannot drift
for cpt, text in enumerate(texts100, start=1):
    keyphrase = Mistral_generate_keyphrases(text)
    keyphrase_cache.append(keyphrase)
    text_vector = encode_text(text)
    keyphrase_vector = encode_text(keyphrase)
    # Join the two embeddings along the feature axis, then drop the
    # leading batch axis so each article contributes one flat vector.
    concatenated_vector = np.concatenate((text_vector, keyphrase_vector), axis=1)
    enhanced_vectors100.append(concatenated_vector.squeeze())
    # "\r" rewrites the same output line to show a running counter.
    print(f"{cpt}/{total}", end="\r")

# Convert list to array
enhanced_vectors100 = np.array(enhanced_vectors100)

17/100