# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Function to segment dialogue based on topic similarities

def segment_based_on_similarity(dataframe, num_clusters=5):
    """
    Segments a dialogue dataset based on topic similarities using KMeans clustering.

    Each row's 'text' value is converted to a TF-IDF vector (English stop words
    removed), the vectors are clustered with KMeans, and the cluster label of
    every row is stored in a 'segment' column.

    Parameters:
        dataframe (pd.DataFrame): DataFrame containing a 'text' column for clustering.
            It is modified in place: a 'segment' column is added (or overwritten).
        num_clusters (int): Requested number of clusters. It is capped at the
            number of rows, because KMeans cannot fit more clusters than samples.

    Returns:
        pd.DataFrame: The same DataFrame object, with a 'segment' column indicating
        segment IDs. An empty DataFrame gets an empty integer 'segment' column.

    Raises:
        ValueError: If the 'text' column is missing or num_clusters is less than 1.
            Errors raised by the vectorizer (for example when no usable terms
            remain after stop-word removal) are propagated unchanged.
    """
    # Ensure 'text' column exists
    if 'text' not in dataframe.columns:
        raise ValueError("The dataframe must contain a 'text' column for segmentation.")

    # Reject nonsensical cluster counts up front with a clear message
    if num_clusters < 1:
        raise ValueError("num_clusters must be a positive integer.")

    # Missing entries become empty strings; everything else is coerced to str so
    # that numeric or mixed-type columns do not break the TF-IDF tokenizer.
    text_data = dataframe['text'].fillna('').astype(str).tolist()

    # Nothing to cluster: still provide the promised column and return early
    if not text_data:
        dataframe['segment'] = pd.Series([], index=dataframe.index, dtype='int64')
        return dataframe

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(text_data)

    # KMeans requires n_samples >= n_clusters, so short dialogues get fewer clusters
    effective_clusters = min(num_clusters, len(text_data))

    # Fixed random_state keeps the segment assignment reproducible between runs
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42, n_init=10)
    dataframe['segment'] = kmeans.fit_predict(tfidf_matrix)

    return dataframe

# Batch helper: segment each dialogue file and save the result

def process_dialogues(dialogue_files, num_clusters=5):
    """
    Segment every dialogue CSV in a mapping and write each result to disk.

    For each entry the CSV is loaded, passed through
    segment_based_on_similarity, and written (without the index) to
    "segmented_<label>.csv" in the current working directory.

    Parameters:
        dialogue_files (dict): Maps a file label to the path of the CSV to load.
        num_clusters (int): Number of clusters forwarded to the segmentation step.

    Returns:
        dict: Maps each file label to the path of the segmented CSV that was written.
    """
    output_paths = {}

    for label in dialogue_files:
        source_path = dialogue_files[label]
        target_path = f"segmented_{label}.csv"

        # Load, segment, then persist; the index is not written to the output file
        segmented = segment_based_on_similarity(
            pd.read_csv(source_path),
            num_clusters=num_clusters,
        )
        segmented.to_csv(target_path, index=False)

        output_paths[label] = target_path

    return output_paths

# Example file paths: each key is a label (used in the output file name) and each
# value is the path to a CSV that contains a 'text' column.
# Replace the placeholder entry below with your own file(s).
dialogue_files = {
    "your_own_file": "path/to/your_own_file.csv",
}

# Run segmentation for the given files (only when executed directly, not on import)
if __name__ == "__main__":
    segmented_results = process_dialogues(dialogue_files, num_clusters=5)
    print("Segmented files saved:", segmented_results)
