In [34]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import numpy as np
from tqdm import tqdm
import csv
import re
from sentence_transformers import SentenceTransformer
import os
from typing import Optional, List, Dict, Any, Union
import glob

In [None]:
class ThaiQASystem:
    """Retrieval-augmented multiple-choice Q&A pipeline.

    The instance holds a causal language model with its tokenizer, a
    sentence-transformer embedder, a normalized document table (``doc_df``)
    and a matrix of document embeddings (``doc_embeddings``) whose rows are
    expected to line up with the rows of ``doc_df``.
    """

    def __init__(self, model_id: str = "scb10x/typhoon2.1-gemma3-4b"):
        """Initialize the Thai Q&A System

        Nothing is loaded here; call ``load_model`` and
        ``load_multiple_documents`` before asking questions.

        Args:
            model_id: Hugging Face identifier passed to ``from_pretrained``.
        """
        self.model_id = model_id
        self.tokenizer = None
        self.model = None
        self.embedder = None
        self.doc_embeddings = None
        self.doc_df = None

    def load_model(self):
        """Load the tokenizer and the language model named by ``self.model_id``.

        Raises:
            Exception: Any loading error is printed and then re-raised.
        """
        print("üîÑ Loading model...")
        try:
            torch._dynamo.config.cache_size_limit = 1024
            torch.set_float32_matmul_precision('high')

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_id,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )
            print("‚úÖ Model loaded successfully!")
        except Exception as e:
            print(f"‚ùå Error loading model: {e}")
            raise

    def load_embedder(self, embedder_model: str = "BAAI/bge-m3"):
        """Load the sentence transformer used for retrieval.

        Args:
            embedder_model: Model name handed to ``SentenceTransformer``.

        Raises:
            Exception: Any loading error is printed and then re-raised.
        """
        print("üîÑ Loading embedder...")
        try:
            self.embedder = SentenceTransformer(embedder_model)
            print("‚úÖ Embedder loaded successfully!")
        except Exception as e:
            print(f"‚ùå Error loading embedder: {e}")
            raise

    def _resolve_doc_files(self, doc_paths) -> List[str]:
        """Expand ``doc_paths`` into a concrete list of file paths.

        A single path containing ``*`` or ``?`` is expanded with ``glob``; a
        directory yields its ``*.csv`` files; any other single path is returned
        as a one-element list. An iterable of paths is returned in the given
        order. Glob results are sorted so the document order (and therefore the
        row order of cached embeddings) is the same on every run.
        """
        if isinstance(doc_paths, (str, os.PathLike)):
            single = os.fspath(doc_paths)
            if '*' in single or '?' in single:
                return sorted(glob.glob(single))
            if os.path.isdir(single):
                return sorted(glob.glob(os.path.join(single, "*.csv")))
            return [single]
        return [os.fspath(p) for p in doc_paths]

    def load_multiple_documents(self,
                              doc_paths: Union[str, List[str]],
                              embedding_path: Optional[str] = None,
                              force_recreate_embeddings: bool = False):
        """
        Load multiple documents from various sources

        Files ending in ``.json``/``.jsonl`` are read as JSON Lines; everything
        else is read as UTF-8 CSV. Each file is normalized with
        ``_process_document_structure`` and the results are concatenated into
        ``self.doc_df``. Embeddings are then loaded from ``embedding_path`` when
        that file exists, or created otherwise.

        This method is best-effort: any exception is printed, and ``doc_df`` and
        ``doc_embeddings`` are reset to ``None`` instead of raising.

        Args:
            doc_paths: Single path, list of paths, or directory path with wildcards
            embedding_path: Path to save/load embeddings
            force_recreate_embeddings: Force recreation of embeddings even if they exist
        """
        try:
            doc_files = self._resolve_doc_files(doc_paths)
            print(f"üìÇ Found {len(doc_files)} document files to load")

            # Load and normalize every file
            all_docs = []
            for doc_path in doc_files:
                print(f"üìñ Loading: {doc_path}")

                if doc_path.lower().endswith(('.json', '.jsonl')):
                    df = pd.read_json(doc_path, lines=True)
                else:
                    df = pd.read_csv(doc_path, encoding='utf-8')

                processed_df = self._process_document_structure(df, doc_path)
                all_docs.append(processed_df)
                print(f"  ‚úÖ Loaded {len(processed_df)} entries from {os.path.basename(doc_path)}")

            if not all_docs:
                print("‚ùå No documents loaded!")
                return

            self.doc_df = pd.concat(all_docs, ignore_index=True)
            print(f"üìö Total documents loaded: {len(self.doc_df)}")

            # Load or create embeddings
            if embedding_path and os.path.exists(embedding_path) and not force_recreate_embeddings:
                print("üì• Loading existing embeddings...")
                self.doc_embeddings = np.load(embedding_path)
                # NOTE(review): the cache is validated by row count only; edited
                # documents with an unchanged count are not detected. Pass
                # force_recreate_embeddings=True after changing the inputs.
                if len(self.doc_embeddings) != len(self.doc_df):
                    print("‚ö†Ô∏è Embedding count mismatch, recreating...")
                    self.create_embeddings(save_path=embedding_path)
                else:
                    print("‚úÖ Embeddings loaded successfully!")
            else:
                print("üîÑ Creating new embeddings...")
                self.create_embeddings(save_path=embedding_path)

        except Exception as e:
            print(f"‚ùå Error loading documents: {e}")
            self.doc_df = None
            self.doc_embeddings = None

    def _process_document_structure(self, df: pd.DataFrame, file_path: str) -> pd.DataFrame:
        """
        Process different document structures and normalize them

        The layout is detected from the column names, in this order:

        1. ``source_document, page_number, content_type, content``
        2. ``Question No, Question, Answer, Explanation`` (the three text
           columns are joined into one labelled string; missing cells become
           empty text rather than the literal "nan")
        3. any frame with a ``content`` column (``source``/``page``/``type``
           are taken from the frame when present, else defaulted)
        4. otherwise the first object-dtype column is used as content

        Rows whose stripped content is empty or equal to ``'nan'`` are dropped.
        The input frame is not modified.

        Args:
            df: Raw frame as read from the file.
            file_path: Path of the file; its lower-cased base name is stored in
                the ``file_origin`` column.

        Returns:
            A new frame with columns ``content, source, page, type, file_origin``.

        Raises:
            ValueError: If no layout matches and there is no object-dtype column.
        """
        file_name = os.path.basename(file_path).lower()

        def as_text(column: pd.Series) -> pd.Series:
            # Missing cells become "" instead of the string "nan".
            return column.astype(object).where(column.notna(), "").astype(str)

        # Check for md_csv format: [source_document, page_number, content_type, content]
        if all(col in df.columns for col in ['source_document', 'page_number', 'content_type', 'content']):
            print(f"  üìã Detected MD format in {file_name}")
            processed_df = pd.DataFrame({
                'content': df['content'].astype(str),
                'source': df['source_document'].astype(str),
                'page': df['page_number'].astype(str),
                'type': df['content_type'].astype(str),
                'file_origin': file_name
            })

        # Check for cheatsheet_csv format: [Question No, Question, Answer, Explanation]
        elif all(col in df.columns for col in ['Question No', 'Question', 'Answer', 'Explanation']):
            print(f"  üìã Detected Cheatsheet format in {file_name}")
            # Combine question, answer, and explanation into content. Built as a
            # local Series so the caller's frame does not gain an extra column.
            combined_content = (
                "‡∏Ñ‡∏≥‡∏ñ‡∏≤‡∏°: " + as_text(df['Question']) +
                " ‡∏Ñ‡∏≥‡∏ï‡∏≠‡∏ö: " + as_text(df['Answer']) +
                " ‡∏Ñ‡∏≥‡∏≠‡∏ò‡∏¥‡∏ö‡∏≤‡∏¢: " + as_text(df['Explanation'])
            )
            processed_df = pd.DataFrame({
                'content': combined_content,
                'source': 'cheatsheet',
                'page': df['Question No'].astype(str),
                'type': 'qa_pair',
                'file_origin': file_name
            })

        # Check for simple content format
        elif 'content' in df.columns:
            print(f"  üìã Detected simple content format in {file_name}")
            processed_df = pd.DataFrame({
                'content': df['content'].astype(str),
                'source': df['source'].astype(str) if 'source' in df.columns else 'unknown',
                'page': df['page'].astype(str) if 'page' in df.columns else '1',
                'type': df['type'].astype(str) if 'type' in df.columns else 'document',
                'file_origin': file_name
            })

        # Auto-detect based on column dtypes
        else:
            print(f"  üìã Auto-detecting format in {file_name}")
            # Use first text column as content
            text_columns = df.select_dtypes(include=['object']).columns.tolist()
            if not text_columns:
                raise ValueError(f"No suitable text columns found in {file_name}")
            processed_df = pd.DataFrame({
                'content': df[text_columns[0]].astype(str),
                'source': 'auto_detected',
                'page': '1',
                'type': 'document',
                'file_origin': file_name
            })

        # Strip first, then filter, so padded blanks/'nan' are caught too. The
        # explicit copy avoids assigning into a view of the unfiltered frame.
        stripped = processed_df['content'].str.strip()
        keep = (stripped != '') & (stripped != 'nan')
        processed_df = processed_df.loc[keep].copy()
        processed_df['content'] = stripped[keep]

        return processed_df

    def create_embeddings(self, save_path: Optional[str] = "embeddings_multi_docs.npy"):
        """Create embeddings for every row of ``self.doc_df``.

        The embedder is loaded on demand. Each content string is prefixed with
        ``"passage: "`` and encoded in batches of 32. The result is stored in
        ``self.doc_embeddings`` and, when ``save_path`` is truthy, saved with
        ``np.save``. An ``embedding`` column is also added to ``self.doc_df`` and
        the table is written as JSON Lines to
        ``combined_docs_with_embeddings.json`` in the working directory.

        Args:
            save_path: Destination ``.npy`` file, or ``None``/empty to skip saving.
        """
        if self.embedder is None:
            self.load_embedder()

        if self.doc_df is None or len(self.doc_df) == 0:
            print("‚ùå No documents loaded!")
            return

        print(f"üîÑ Creating embeddings for {len(self.doc_df)} documents...")
        texts = ["passage: " + str(x) for x in self.doc_df["content"]]

        # Create embeddings in batches to handle large datasets
        batch_size = 32
        all_embeddings = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Creating embeddings"):
            batch_texts = texts[i:i+batch_size]
            batch_embeddings = self.embedder.encode(batch_texts, batch_size=len(batch_texts))
            all_embeddings.append(batch_embeddings)

        self.doc_embeddings = np.vstack(all_embeddings)

        # Save embeddings
        if save_path:
            np.save(save_path, self.doc_embeddings)
            print(f"üíæ Embeddings saved to {save_path}")

        # Save dataframe with embeddings
        self.doc_df["embedding"] = self.doc_embeddings.tolist()
        combined_df_path = "combined_docs_with_embeddings.json"
        self.doc_df.to_json(combined_df_path, orient="records", lines=True)
        print(f"üíæ Combined documents saved to {combined_df_path}")

        print(f"‚úÖ Embeddings created for {len(self.doc_embeddings)} documents")

    def retrieve_relevant_docs(self, question: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Retrieve the ``top_k`` highest-scoring documents with metadata.

        The question is prefixed with ``"query: "``, encoded, and scored against
        every row of ``self.doc_embeddings`` with a dot product. If embeddings
        are available but the embedder has not been loaded yet (for example
        after loading embeddings from a cache file), it is loaded on demand.

        Args:
            question: Query text.
            top_k: Maximum number of documents to return; values <= 0 return
                an empty list.

        Returns:
            A list, best match first, of dicts with keys ``content``, ``source``,
            ``page``, ``type``, ``file_origin`` and ``similarity``. Returns an
            empty list when no documents/embeddings are loaded or on any error
            (which is printed).
        """
        if self.doc_embeddings is None or self.doc_df is None:
            return []
        # Guard explicitly: a slice of [-0:] would select every document.
        if top_k <= 0:
            return []

        try:
            if self.embedder is None:
                self.load_embedder()

            # Encode question
            question_embedding = self.embedder.encode(["query: " + question])

            # Score all documents and take the best top_k, highest first
            similarities = np.dot(self.doc_embeddings, question_embedding.T).flatten()
            top_indices = np.argsort(similarities)[::-1][:top_k]

            relevant_docs = []
            for idx in top_indices:
                # Skip indices without a matching row in the document table.
                if idx >= len(self.doc_df):
                    continue
                row = self.doc_df.iloc[idx]
                relevant_docs.append({
                    'content': str(row["content"]),
                    'source': str(row["source"]),
                    'page': str(row["page"]),
                    'type': str(row["type"]),
                    'file_origin': str(row["file_origin"]),
                    'similarity': float(similarities[idx])
                })

            return relevant_docs
        except Exception as e:
            print(f"‚ö†Ô∏è Error in document retrieval: {e}")
            return []

    def _parse_answer(self, decoded: str) -> str:
        """Extract the ``answer`` value from the model's decoded output.

        Scans for the first ``{`` at which a JSON object can be decoded and
        ignores anything after that object, so trailing text (even text that
        contains braces) does not break parsing. The value is converted to a
        string, stripped of whitespace and of surrounding double quotes.

        Returns:
            The answer text, or ``""`` (after printing a warning) when no JSON
            object can be decoded.
        """
        start_idx = decoded.find("{")
        if start_idx == -1:
            print(f"‚ö†Ô∏è No JSON found in: {decoded}")
            return ""

        decoder = json.JSONDecoder()
        last_error = None
        while start_idx != -1:
            try:
                result, _ = decoder.raw_decode(decoded, start_idx)
            except json.JSONDecodeError as e:
                last_error = e
                start_idx = decoded.find("{", start_idx + 1)
                continue
            answer = result.get("answer", "")
            if answer is None:
                answer = ""
            # Remove quotes if present
            return str(answer).strip().strip('"')

        print(f"‚ö†Ô∏è JSON decode error: {last_error}")
        print(f"Raw output: {decoded}")
        return ""

    def get_answer_only(self, question: str, use_rag: bool = True, top_k: int = 3) -> str:
        """Get answer from model with optional RAG

        When ``use_rag`` is true and embeddings are loaded, the retrieved
        documents are appended to the user message as context. The model is
        asked to reply with a JSON object holding an ``answer`` field, which is
        then extracted.

        Args:
            question: Question text (non-string values are converted with ``str``).
            use_rag: Whether to add retrieved context.
            top_k: Number of documents to retrieve for the context.

        Returns:
            The extracted answer, or ``""`` if generation or parsing fails
            (the failure is printed).

        Raises:
            Exception: If the model or tokenizer has not been loaded.
        """
        if self.model is None or self.tokenizer is None:
            raise Exception("Model not loaded! Call load_model() first.")

        # A missing cell read from CSV arrives as a float; coerce so the string
        # operations below cannot raise outside the guarded section.
        if not isinstance(question, str):
            question = str(question)

        # Prepare context if RAG is enabled
        context = ""
        if use_rag and self.doc_embeddings is not None:
            relevant_docs = self.retrieve_relevant_docs(question, top_k=top_k)
            if relevant_docs:
                context_parts = [
                    f"[{doc['file_origin']}:{doc['source']}:{doc['page']}] {doc['content']}"
                    for doc in relevant_docs[:top_k]
                ]
                context = "\n\n‡∏ö‡∏£‡∏¥‡∏ö‡∏ó‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á:\n" + "\n".join(context_parts)

        # Prepare messages
        system_prompt = (
            "You are an AI that answers multiple choice questions in Thai. "
            "Reply only with a valid JSON object in this exact format: "
            '{"answer": "‡∏Å"}. Do not include any explanation. '
            'Choices must be enclosed in double quotes. Do not add anything else outside the JSON.'
        )

        if context:
            system_prompt += "\n\nUse the provided context to help answer the question."

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question.strip() + context}
        ]

        try:
            input_ids = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(self.model.device)

            # NOTE(review): sampling is enabled, so repeated runs on the same
            # question can produce different answers.
            outputs = self.model.generate(
                input_ids,
                max_new_tokens=128,
                do_sample=True,
                temperature=0.6,
                top_p=0.95,
                pad_token_id=self.tokenizer.eos_token_id
            )

            # Keep only the newly generated tokens (drop the prompt)
            response = outputs[0][input_ids.shape[-1]:]
            decoded = self.tokenizer.decode(response, skip_special_tokens=True)

            return self._parse_answer(decoded)

        except Exception as e:
            print(f"‚ö†Ô∏è Generation error: {e}")
            return ""

    def get_answer_and_reason(self, question: str):
        """Return a hard-coded ``(answer, reason)`` pair.

        Stub: Replace with actual retrieval + reasoning logic. It only checks
        whether the question contains "Clopidogrel" and does not use the model
        or the loaded documents.
        """
        if "Clopidogrel" in question:
            return "‡∏Ç", "‡πÄ‡∏û‡∏£‡∏≤‡∏∞‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÉ‡∏ô PDF ‡∏£‡∏∞‡∏ö‡∏∏‡∏ß‡πà‡∏≤ Clopidogrel mg tablet OP: ‡πÄ‡∏ö‡∏¥‡∏Å‡πÑ‡∏î‡πâ 3 ‡∏ö‡∏≤‡∏ó/‡πÄ‡∏°‡πá‡∏î"
        else:
            return "‡∏á", "‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡πÄ‡∏à‡∏≤‡∏∞‡∏à‡∏á‡πÉ‡∏ô PDF ‡∏à‡∏∂‡∏á‡∏ï‡∏≠‡∏ö‡∏Ç‡πâ‡∏≠ ‡∏á ‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡πà‡∏≤‡∏õ‡∏£‡∏¥‡∏¢‡∏≤‡∏¢"

    def test_single_question(self, question: str, use_rag: bool = True, top_k: int = 3) -> Dict[str, Any]:
        """Answer one question and print diagnostic details.

        Prints the question, up to three retrieved documents with their
        similarity scores (when RAG is active), and the answer.

        Returns:
            A dict with keys ``question``, ``answer``, ``used_rag`` and
            ``relevant_docs``.
        """
        print(f"\nü§î ‡∏Ñ‡∏≥‡∏ñ‡∏≤‡∏°: {question}")

        relevant_docs = []
        if use_rag and self.doc_embeddings is not None:
            relevant_docs = self.retrieve_relevant_docs(question, top_k=top_k)
            print(f"üìö ‡πÄ‡∏≠‡∏Å‡∏™‡∏≤‡∏£‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á: {len(relevant_docs)} ‡πÄ‡∏≠‡∏Å‡∏™‡∏≤‡∏£")

            # Show top relevant documents
            for i, doc in enumerate(relevant_docs[:3]):
                print(f"  {i+1}. [{doc['file_origin']}] similarity: {doc['similarity']:.3f}")
                print(f"     {doc['content'][:100]}...")

        answer = self.get_answer_only(question, use_rag=use_rag, top_k=top_k)
        print(f"üí° ‡∏Ñ‡∏≥‡∏ï‡∏≠‡∏ö: {answer}")

        return {
            "question": question,
            "answer": answer,
            "used_rag": use_rag,
            "relevant_docs": relevant_docs
        }

    def show_document_stats(self):
        """Print counts by type, origin file and source, plus three sample rows."""
        if self.doc_df is None:
            print("‚ùå No documents loaded!")
            return

        print("\nüìä Document Statistics:")
        print(f"Total documents: {len(self.doc_df)}")
        print(f"Document types: {self.doc_df['type'].value_counts().to_dict()}")
        print(f"Source files: {self.doc_df['file_origin'].value_counts().to_dict()}")
        print(f"Sources: {self.doc_df['source'].nunique()} unique sources")

        # Show sample content
        print("\nüìñ Sample content:")
        for i in range(min(3, len(self.doc_df))):
            doc = self.doc_df.iloc[i]
            print(f"{i+1}. [{doc['file_origin']}:{doc['type']}] {doc['content'][:100]}...")

    def generate_submission_file(self, test_file_path: str, submission_file_path: str,
                               use_rag: bool = True, top_k: int = 3):
        """Generate submission file from test CSV

        Reads a CSV with ``id`` and ``question`` columns, answers every
        question, and writes a CSV with ``id`` and ``answer`` columns. Non-empty
        answers are wrapped in literal double quotes.

        The file is written with ``csv.QUOTE_NONE`` so those literal quotes are
        emitted as-is (``1,"x"``); the default quoting would escape them into
        triple quotes. A backslash escape character is set so an answer that
        contains a comma or newline does not abort the write.

        Errors are printed rather than raised.

        Args:
            test_file_path: Input CSV path.
            submission_file_path: Output CSV path.
            use_rag: Whether to add retrieved context to each question.
            top_k: Number of documents to retrieve per question.
        """
        try:
            print(f"üìñ Reading test file: {test_file_path}")
            test_df = pd.read_csv(test_file_path, encoding='utf-8')

            if 'id' not in test_df.columns or 'question' not in test_df.columns:
                raise ValueError("‚ùå Test file must have 'id' and 'question' columns")

            print(f"üìä Processing {len(test_df)} questions...")

            submission_rows = []
            for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="üß† Generating answers"):
                answer = self.get_answer_only(row['question'], use_rag=use_rag, top_k=top_k)
                submission_rows.append({
                    "id": row['id'],
                    "answer": f'"{answer}"' if answer and not answer.startswith('"') else answer
                })

            submission_df = pd.DataFrame(submission_rows)
            submission_df.to_csv(
                submission_file_path,
                index=False,
                encoding='utf-8',
                quoting=csv.QUOTE_NONE,
                escapechar='\\',
            )
            print(f"‚úÖ Submission file created: {submission_file_path}")

            # Show sample results
            print("\nüìã Sample results:")
            print(submission_df.head(10))

        except FileNotFoundError:
            print(f"‚ùå Test file not found: {test_file_path}")
        except Exception as e:
            print(f"‚ùå Error generating submission: {e}")

In [None]:
def main():
    """Main function with enhanced document loading options

    Loads the language model, loads the two document CSVs (creating their
    embeddings), and writes answers for ./data/test.csv to
    ./data/submission.csv. Any exception is printed rather than raised.
    """
    qa_system = ThaiQASystem()

    try:
        # Load model
        qa_system.load_model()
        qa_system.load_multiple_documents([
            "./data/doc_csv.csv",
            "./data/labeled_data.csv",
        ])
        qa_system.generate_submission_file("./data/test.csv", "./data/submission.csv")
    except Exception as e:
        print(f"‚ùå Error: {e}")

In [23]:
# Run the pipeline defined in main(): load the model and documents, then write the submission file.
main()

üîÑ Loading model...


The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ Model loaded successfully!
üìÇ Found 2 document files to load
üìñ Loading: ./data/doc_csv.csv
  üìã Detected MD format in doc_csv.csv
  ‚úÖ Loaded 376 entries from doc_csv.csv
üìñ Loading: ./data/labeled_data.csv
  üìã Detected Cheatsheet format in labeled_data.csv
  ‚úÖ Loaded 432 entries from labeled_data.csv
üìö Total documents loaded: 808
üîÑ Creating new embeddings...
üîÑ Loading embedder...
‚úÖ Embedder loaded successfully!
üîÑ Creating embeddings for 808 documents...


  return forward_call(*args, **kwargs)
Creating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:00<00:00, 46.49it/s]


üíæ Combined documents saved to combined_docs_with_embeddings.json
‚úÖ Embeddings created for 808 documents
üìñ Reading test file: ./data/test.csv
üìä Processing 500 questions...


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
üß† Generating answers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [03:59<00:00,  2.09it/s]

‚úÖ Submission file created: ./data/submission.csv

üìã Sample results:
   id answer
0   1    "‡∏Ñ"
1   2    "‡∏Ç"
2   3    "‡∏Å"
3   4    "‡∏á"
4   5    "‡∏Ñ"
5   6    "‡∏á"
6   7    "‡∏á"
7   8    "‡∏Ç"
8   9    "‡∏Å"
9  10    "‡∏Ç"





In [37]:
# Re-quote an existing submission so every answer is wrapped in exactly one
# pair of literal double quotes.
df = pd.read_csv('./data/submission_1.csv')

# Strip any surrounding quotes (e.g. leftovers from triple-quoted values).
# Missing answers are read back as NaN; turn them into empty text first so they
# are not written out as the string "nan".
df['answer'] = (
    df['answer']
    .astype(object)
    .where(df['answer'].notna(), '')
    .astype(str)
    .str.strip('"')
)

# Add the double quotes around each answer ourselves.
df['answer'] = '"' + df['answer'] + '"'

# Write the new file without pandas' own quoting, so the quotes above are
# emitted verbatim instead of being escaped.
df.to_csv('./data/submission.csv', index=False, quoting=csv.QUOTE_NONE)


In [33]:
def sim(csv1, csv2):
    """Print and return the percentage of matching answers between two CSVs.

    Both files need ``id`` and ``answer`` columns. Only ids present in both
    files are compared. Answers are compared as strings after removing
    surrounding whitespace and double quotes.

    Returns:
        The agreement percentage, or 0 when the files share no ids.
    """
    def _normalized(column):
        # Remove whitespace and quotes that may still surround the value
        return column.astype(str).str.strip().str.strip('"')

    # Load both files and keep only the rows whose id appears in each
    left = pd.read_csv(csv1)
    right = pd.read_csv(csv2)
    merged = left.merge(right, on='id', suffixes=('_1', '_2'))

    for answer_col in ('answer_1', 'answer_2'):
        merged[answer_col] = _normalized(merged[answer_col])

    # Count identical answers
    total = len(merged)
    correct = merged['answer_1'].eq(merged['answer_2']).sum()

    # Convert to a percentage, guarding against an empty overlap
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0

    print(f"Matched: {correct}/{total} ({accuracy:.2f}%)")
    return accuracy

# Report how often two submission files give the same answer for the same id.
sim('./data/s1.csv', './data/s2.csv')

Matched: 229/500 (45.80%)


np.float64(45.800000000000004)