# EXPLAIN & TEST FLOWCHART 

In [3]:
import pandas as pd

In [9]:
# Function to handle file upload and data reading
def upload_and_process_files(uploaded_files):
    """
    Function to upload and process the CSV files.

    Args:
    uploaded_files (list): List of file paths.

    Returns:
    pd.DataFrame: Combined DataFrame from all uploaded CSV files.
    """
    for file_path in uploaded_files:
        print(f"Processing file: {file_path}")
        if file_path.endswith(".csv"):
            try:
                # Try to read the CSV file
                df = pd.read_csv(file_path)
                all_data.append(df)
            except pd.errors.ParserError:
                # Handle CSV parsing error
                raise ValueError(f"Error: The file {file_path} is not in the correct format of a .csv file.")
    
    if all_data:
        # Combine all DataFrames into one
        df_combined = pd.concat(all_data, ignore_index=True)
        return df_combined
    else:
        return None

In [10]:
# Replace with actual file paths
uploaded_files = [r'C:\Users\ungdu\Downloads\Test Chatgpt\Test data.csv']  # Provide full file path(s)

df_combined = upload_and_process_files(uploaded_files)

if df_combined is not None:
    # Display the DataFrame in Jupyter notebook
    display(df_combined)
else:
    print("No data to display.")

Processing file: C:\Users\ungdu\Downloads\Test Chatgpt\Test data.csv


Unnamed: 0,Câu hỏi,Câu trả lời
0,Các quả có mùi vị như thế nào,Quả cam ngon. Quả táo dở. Quả chanh chua. Quả ...
1,Các quả có hình dáng như thế nào,"Quả cam có hình tròn. Quả táo có hình tròn, hơ..."


# CHUNKING

In [15]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure that punkt is downloaded for sentence tokenization
nltk.download("punkt", quiet=True)

class SemanticChunker:
    def __init__(self, threshold=0.3, embedding_type="tfidf"):
        """
        Initialize the chunker with a threshold and embedding type.
        
        Args:
        - threshold (float): Minimum cosine similarity to group sentences into the same chunk.
        - embedding_type (str): Type of embedding to use for vectorization, currently supports 'tfidf'.
        """
        self.threshold = threshold
        self.embedding_type = embedding_type
    
    def embed_function(self, sentences):
        """
        Convert a list of sentences into their vector representations using TF-IDF.
        
        Args:
        - sentences (list): List of sentences to be embedded.
        
        Returns:
        - numpy.ndarray: The TF-IDF vectors.
        """
        if self.embedding_type == "tfidf":
            vectorizer = TfidfVectorizer().fit_transform(sentences)
            return vectorizer.toarray()
        else:
            raise ValueError("Unsupported embedding type")
    
    def split_text(self, text):
        """
        Split the input text into chunks based on semantic similarity of sentences.
        
        Args:
        - text (str): Input text to be chunked.
        
        Returns:
        - list: List of chunked text.
        """
        sentences = nltk.sent_tokenize(text)  # Extract sentences
        sentences = [item for item in sentences if item and item.strip()]
        if not len(sentences):
            return []

        # Vectorize the sentences for similarity checking
        vectors = self.embed_function(sentences)

        # Calculate pairwise cosine similarity between sentences
        similarities = cosine_similarity(vectors)

        # Initialize chunks with the first sentence
        chunks = [[sentences[0]]]

        # Group sentences into chunks based on similarity threshold
        for i in range(1, len(sentences)):
            sim_score = similarities[i-1, i]

            if sim_score >= self.threshold:
                # If the similarity is above the threshold, add to the current chunk
                chunks[-1].append(sentences[i])
            else:
                # Start a new chunk
                chunks.append([sentences[i]])

        # Join the sentences in each chunk to form coherent paragraphs
        return [' '.join(chunk) for chunk in chunks]


# Initialize the SemanticChunker (adjust threshold if needed)
chunker = SemanticChunker(threshold=0.3)

# Apply the chunking process to the "Câu trả lời" column
df_combined["Chunks"] = df_combined["Câu trả lời"].apply(lambda x: chunker.split_text(x))

# Display the chunked results
for index, row in df_combined.iterrows():
    print(f"\nCâu hỏi: {row['Câu hỏi']}")
    print("Chunks:")
    for i, chunk in enumerate(row["Chunks"]):
        print(f"  Chunk {i+1}: {chunk}")


KeyError: 'Câu hỏi'