In [1]:
import pandas as pd

# Function to handle file upload and data reading
def upload_and_process_files(uploaded_files):
    """
    Read every CSV file in ``uploaded_files`` and stack them into one DataFrame.

    Each file is read as UTF-8, surrounding whitespace is stripped from its
    column names, and the frames are concatenated in input order with a fresh
    index. Entries whose name does not end in ``.csv`` (compared
    case-insensitively) are reported and skipped without being opened.

    Args:
    uploaded_files (list): File paths, given as ``str`` or path-like objects.

    Returns:
    pd.DataFrame: Combined DataFrame from all CSV files that were read, or
    ``None`` when no CSV file was read.

    Raises:
    ValueError: If pandas cannot parse a file as CSV. The underlying
        ``pd.errors.ParserError`` is chained as ``__cause__``.
    """
    all_data = []  # One DataFrame per successfully read file
    for file_path in uploaded_files:
        print(f"Processing file: {file_path}")

        # str() lets pathlib.Path entries through; lower() accepts ".CSV" too.
        if not str(file_path).lower().endswith(".csv"):
            print(f"Skipping {file_path}: not a .csv file.")
            continue

        try:
            df = pd.read_csv(file_path, encoding='utf-8')
        except pd.errors.ParserError as exc:
            # Chain the original error so the parser's detail is not lost.
            raise ValueError(
                f"Error: The file {file_path} is not in the correct format of a .csv file."
            ) from exc

        # Headers often carry stray spaces; normalise them before combining
        # so identical columns from different files line up.
        df.columns = df.columns.str.strip()
        all_data.append(df)

    if not all_data:
        return None
    return pd.concat(all_data, ignore_index=True)

# Replace with actual file paths.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs on a machine where this exact file exists.
uploaded_files = [r'C:\Users\ungdu\Downloads\Test Chatgpt\Test data.csv']  # Provide full file path(s)

# Read and combine the listed CSV files (None when nothing was read)
df_combined = upload_and_process_files(uploaded_files)

# Preview only the first rows so a large file does not flood the output
if df_combined is not None:
    display(df_combined.head())
else:
    print("No data to display.")


Processing file: C:\Users\ungdu\Downloads\Test Chatgpt\Test data.csv


Unnamed: 0,Câu hỏi,Câu trả lời
0,Các quả có mùi vị như thế nào,Quả cam ngon. Quả táo dở. Quả chanh chua. Quả ...
1,Các quả có hình dáng như thế nào,"Quả cam có hình tròn. Quả táo có hình tròn, hơ..."


In [2]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure the Punkt sentence-tokenizer data is available.
# Newer NLTK releases look up the "punkt_tab" resource for sent_tokenize,
# while older ones use the original "punkt" package, so fetch both to keep
# this cell working on a fresh environment regardless of the NLTK version.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# Define the chunking class
class SemanticChunker:
    """
    Split a text into chunks of consecutive, lexically similar sentences.

    Sentences are vectorised with TF-IDF fitted on the sentences of the text
    being split. A sentence joins the current chunk when its cosine
    similarity to the sentence immediately before it is at least
    ``threshold``; otherwise it starts a new chunk.
    """

    def __init__(self, threshold=0.3, embedding_type="tfidf"):
        """
        Args:
        - threshold (float): Minimum cosine similarity between two adjacent
          sentences for them to stay in the same chunk.
        - embedding_type (str): Vectorisation method; only "tfidf" is
          supported.
        """
        self.threshold = threshold
        self.embedding_type = embedding_type

    def embed_function(self, sentences):
        """
        Convert a list of sentences into their vector representations using TF-IDF.

        A new vectorizer is fitted on every call, so the vocabulary comes
        only from ``sentences``.

        Args:
        - sentences (list): List of sentences to be embedded.

        Returns:
        - numpy.ndarray: Dense TF-IDF matrix with one row per sentence.

        Raises:
        - ValueError: If ``embedding_type`` is not "tfidf".
        """
        if self.embedding_type != "tfidf":
            raise ValueError("Unsupported embedding type")
        return TfidfVectorizer().fit_transform(sentences).toarray()

    def split_text(self, text):
        """
        Split the input text into chunks based on similarity of adjacent sentences.

        Args:
        - text (str): Input text to be chunked. ``None`` and NaN (how pandas
          represents a missing cell) are treated as empty text.

        Returns:
        - list: Chunk strings in original order; each chunk is its sentences
          joined with a single space. Empty when the text has no sentences.
        """
        # Missing values reach this method as None or float NaN when it is
        # applied to a pandas column; NaN is the only float unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return []
        # A blank string has no sentences, so skip tokenisation entirely.
        if isinstance(text, str) and not text.strip():
            return []

        # NOTE(review): sent_tokenize is called with its default language
        # setting; confirm that is adequate for the language of the input.
        sentences = nltk.sent_tokenize(text)
        sentences = [item for item in sentences if item and item.strip()]
        if not sentences:
            return []

        # Vectorise the sentences, then compute all pairwise similarities;
        # only the entries for adjacent sentences are read below.
        vectors = self.embed_function(sentences)
        similarities = cosine_similarity(vectors)

        # The first sentence always opens the first chunk.
        chunks = [[sentences[0]]]
        for i in range(1, len(sentences)):
            if similarities[i - 1, i] >= self.threshold:
                # Similar enough to the previous sentence: same chunk.
                chunks[-1].append(sentences[i])
            else:
                # Below the threshold: start a new chunk here.
                chunks.append([sentences[i]])

        return [' '.join(chunk) for chunk in chunks]

# Build the chunker; raise or lower the threshold to get fewer or more chunks
chunker = SemanticChunker(threshold=0.3)

# Chunk every answer in "Câu trả lời" and keep the result next to it
df_combined["Chunks"] = df_combined["Câu trả lời"].apply(chunker.split_text)

# Print each question followed by its numbered chunks
for question, chunk_list in zip(df_combined["Câu hỏi"], df_combined["Chunks"]):
    print(f"\nCâu hỏi: {question}")
    print("Chunks:")
    for number, chunk in enumerate(chunk_list, start=1):
        print(f"  Chunk {number}: {chunk}")


Câu hỏi: Các quả có mùi vị như thế nào
Chunks:
  Chunk 1: Quả cam ngon.
  Chunk 2: Quả táo dở.
  Chunk 3: Quả chanh chua.
  Chunk 4: Quả mít to. Quả mít rất thơm nữa

Câu hỏi: Các quả có hình dáng như thế nào 
Chunks:
  Chunk 1: Quả cam có hình tròn. Quả táo có hình tròn, hơi nhỏ.
  Chunk 2: Quả chanh hình bầu dục.
  Chunk 3: Quả mít to dài có vỏ xù xì.
  Chunk 4: Quả mít có thể lấy gỗ


In [4]:
# Write the chunked DataFrame to disk as UTF-8 CSV, without the index column
df_combined.to_csv(
    path_or_buf='chunking_data.csv',
    encoding='utf-8',
    index=False,
)

print("Data has been successfully exported to chunking_data.csv")

Data has been successfully exported to chunking_data.csv


In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize the Vietnamese SBERT model
embedding_model = SentenceTransformer('keepitreal/vietnamese-sbert')

# Function to embed text using the pre-trained model
def embed_text(text_list):
    """
    Encode a list of texts with the module-level ``embedding_model``.

    Args:
    text_list (list): Text strings to be embedded.

    Returns:
    The value returned by ``embedding_model.encode``; tensor output is
    requested via ``convert_to_tensor=True``.
    """
    return embedding_model.encode(text_list, convert_to_tensor=True)

# Example: Apply embedding model to chunked text
def apply_embeddings_to_chunks(df):
    """
    Embed the chunk list of every row and return a new DataFrame holding them.

    The input frame is left untouched, so the chunk-only data stays usable
    (for example for a separate export) whatever order the cells run in.

    Args:
    df (pd.DataFrame): DataFrame with a 'Chunks' column whose cells are
        lists of text chunks.

    Returns:
    pd.DataFrame: New DataFrame with all columns of ``df`` plus an
    'Embeddings' column; each cell is what ``embed_text`` returned for that
    row's chunks.
    """
    # One embed_text call per row, in row order
    embeddings_list = [embed_text(chunks) for chunks in df['Chunks']]

    # assign() builds a new frame instead of adding the column to ``df``
    return df.assign(Embeddings=embeddings_list)

# Embed the chunks of every row
df_with_embeddings = apply_embeddings_to_chunks(df_combined)

# Preview the first rows with the notebook's rich table rendering
# (same display() call the data-loading cell uses) instead of plain text
display(df_with_embeddings.head())

  from tqdm.autonotebook import tqdm, trange


                             Câu hỏi  \
0      Các quả có mùi vị như thế nào   
1  Các quả có hình dáng như thế nào    

                                         Câu trả lời  \
0  Quả cam ngon. Quả táo dở. Quả chanh chua. Quả ...   
1  Quả cam có hình tròn. Quả táo có hình tròn, hơ...   

                                              Chunks  \
0  [Quả cam ngon., Quả táo dở., Quả chanh chua., ...   
1  [Quả cam có hình tròn. Quả táo có hình tròn, h...   

                                          Embeddings  
0  [[tensor(0.3438), tensor(0.4892), tensor(0.466...  
1  [[tensor(0.0007), tensor(-0.0319), tensor(0.32...  


In [5]:
# Convert the embeddings into a string format to store in CSV.
# Values that are already strings are left alone so that re-running this
# cell does not fail on the second pass (a str has no .tolist()).
df_with_embeddings['Embeddings'] = df_with_embeddings['Embeddings'].apply(
    lambda x: x if isinstance(x, str) else str(x.tolist())
)

# Export the DataFrame with embeddings to a CSV file
df_with_embeddings.to_csv('embedding_data.csv', index=False, encoding='utf-8')

print("Data with embeddings has been successfully exported to embedding_data.csv")

Data with embeddings has been successfully exported to embedding_data.csv
