In [80]:
import pandas as pd
import numpy as np
import openai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
import os

In [47]:
import os
from dotenv import load_dotenv

load_dotenv()  # Load the .env file

openai.api_key = os.getenv('OPENAI_API_KEY')
if openai.api_key is None:
    raise ValueError("API key is not set.")
else:
    print("API Key loaded successfully.")


API Key loaded successfully.


In [48]:
def gpt3_query(text):
    """ Query GPT-3.5 Turbo with the given text to determine if it should be reassigned to a different cluster. """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant to help decide if a news article should stay in its current cluster or move."},
            {"role": "user", "content": text}
        ],
        max_tokens=50
    )
    answer = response['choices'][0]['message']['content'].strip().lower()
    return "yes" in answer

In [49]:
def load_data(file_path):
    """ Load the BBC News dataset from a CSV file. """
    return pd.read_csv(file_path)

In [50]:
def preprocess_text(data):
    """ Preprocess text data for clustering. """
    vectorizer = TfidfVectorizer(stop_words='english')
    return vectorizer.fit_transform(data['text'])

In [None]:
# Load the dataset file
file_path = 'bbc_news_subset_1000artcl.csv'
data = load_data(file_path)
X = preprocess_text(data)

# Initial clustering
kmeans = KMeans(n_clusters=5, random_state=42)
initial_labels = kmeans.fit_predict(X)
cluster_centers = kmeans.cluster_centers_

In [None]:
def refine_clusters(X, cluster_labels, cluster_centers,Nbr_Low_Conf_Pnt,nb_nearest_cluster):
     # Nbr_Low_Conf_Pnt : le nombre de Low Confidence Points à identifier 
     # X : 
     # cluster_labels : 
     # cluster_centers : 
     # nb_nearest_cluster
    new_cluster_labels = cluster_labels.copy()
    distances = euclidean_distances(X.toarray(), cluster_centers)
    confidence_scores = distances.min(axis=1) - np.partition(distances, 2, axis=1)[:, 1]
    low_confidence_indices = np.argsort(confidence_scores)[:Nbr_Low_Conf_Pnt]  # Top Nbr_Low_Conf_Pnt uncertain points
    cpt=1
    total_request=Nbr_Low_Conf_Pnt
    for idx in low_confidence_indices:
        current_cluster_idx = cluster_labels[idx]
        nearest_clusters = np.argsort(distances[idx])[1:(nb_nearest_cluster+1)]

        query_text = f"Should this news article stay in cluster {current_cluster_idx}? Text: {data['text'].iloc[idx]}"
        response = gpt3_query(query_text)
        print(f"{cpt}/{total_request}", end="\r")
        cpt=cpt+1
        if not response:
            for new_cluster_idx in nearest_clusters:
                query_text = f"Should this news article move to cluster {new_cluster_idx}? Text: {data['text'].iloc[idx]}"
                response = gpt3_query(query_text)
                if response:
                    new_cluster_labels[idx] = new_cluster_idx
                    break

    return new_cluster_labels

In [52]:
# Refine clustering with GPT-3
refined_labels = refine_clusters(X, initial_labels, cluster_centers,500,4)

500/1000

In [55]:
from sklearn.metrics import normalized_mutual_info_score

def compute_nmi(true_labels, predicted_labels):
    """Compute the Normalized Mutual Information between true and predicted labels."""
    return normalized_mutual_info_score(true_labels, predicted_labels)

# Assuming 'category' column has the true labels
true_labels = data['category'].values

# Compute NMI for both clustering results
nmi_initial = compute_nmi(true_labels, initial_labels)
nmi_refined = compute_nmi(true_labels, refined_labels)

print(f"Initial Clustering NMI: {nmi_initial}")
print(f"Refined Clustering NMI: {nmi_refined}")


Initial Clustering NMI: 0.6131461379882496
Refined Clustering NMI: 0.21241491686367456


In [58]:
# Load the dataset file
file_path = 'bbc_news_subset_1000artcl.csv'
data = load_data(file_path)
X = preprocess_text(data)

# Initial clustering
kmeans = KMeans(n_clusters=5, random_state=42)
initial_labels = kmeans.fit_predict(X)
cluster_centers = kmeans.cluster_centers_

In [None]:
# Refine clustering with GPT-3.5 turbo
refined_labels = refine_clusters(X, initial_labels, cluster_centers,50)

In [61]:
# Compute NMI for both clustering results
nmi_initial = compute_nmi(true_labels, initial_labels)
nmi_refined = compute_nmi(true_labels, refined_labels)

print(f"Initial Clustering NMI: {nmi_initial}")
print(f"Refined Clustering NMI: {nmi_refined}")

Initial Clustering NMI: 0.6131461379882496
Refined Clustering NMI: 0.5687042637889106


In [68]:
# Refine clustering with GPT-3
refined_labels = refine_clusters(X, initial_labels, cluster_centers,100,4)

100/1000

In [69]:
# Compute NMI for both clustering results
nmi_initial = compute_nmi(true_labels, initial_labels)
nmi_refined = compute_nmi(true_labels, refined_labels)

print(f"Initial Clustering NMI: {nmi_initial}")
print(f"Refined Clustering NMI: {nmi_refined}")

Initial Clustering NMI: 0.6131461379882496
Refined Clustering NMI: 0.5208955207676117


In [72]:
# Refine clustering with GPT-3
refined_labels = refine_clusters(X, initial_labels, cluster_centers,25,4)

25/25

In [73]:
# Compute NMI for both clustering results
nmi_initial = compute_nmi(true_labels, initial_labels)
nmi_refined = compute_nmi(true_labels, refined_labels)

print(f"Initial Clustering NMI: {nmi_initial}")
print(f"Refined Clustering NMI: {nmi_refined}")

Initial Clustering NMI: 0.6131461379882496
Refined Clustering NMI: 0.5978809403512185


In [74]:
# Refine clustering with GPT-3
refined_labels = refine_clusters(X, initial_labels, cluster_centers,20,4)

20/20

In [75]:
# Compute NMI for both clustering results
nmi_initial = compute_nmi(true_labels, initial_labels)
nmi_refined = compute_nmi(true_labels, refined_labels)

print(f"Initial Clustering NMI: {nmi_initial}")
print(f"Refined Clustering NMI: {nmi_refined}")

Initial Clustering NMI: 0.6131461379882496
Refined Clustering NMI: 0.5967547054379694


In [76]:
# Refine clustering with GPT-3
refined_labels = refine_clusters(X, initial_labels, cluster_centers,10,4)

10/10

In [77]:
# Compute NMI for both clustering results
nmi_initial = compute_nmi(true_labels, initial_labels)
nmi_refined = compute_nmi(true_labels, refined_labels)

print(f"Initial Clustering NMI: {nmi_initial}")
print(f"Refined Clustering NMI: {nmi_refined}")

Initial Clustering NMI: 0.6131461379882496
Refined Clustering NMI: 0.6043318982858454


In [78]:
# Refine clustering with GPT-3
refined_labels = refine_clusters(X, initial_labels, cluster_centers,5,4)

5/5

In [79]:
# Compute NMI for both clustering results
nmi_initial = compute_nmi(true_labels, initial_labels)
nmi_refined = compute_nmi(true_labels, refined_labels)

print(f"Initial Clustering NMI: {nmi_initial}")
print(f"Refined Clustering NMI: {nmi_refined}")

Initial Clustering NMI: 0.6131461379882496
Refined Clustering NMI: 0.6116910284489357
