<a href="https://colab.research.google.com/github/Nitobest/AICCRA-BI/blob/main/Four_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

🔄 METHOD 1: Top-K Matching (method1_topk_matching)

Purpose: Simple, fast approach using one embedding model
How it works:

Uses a single pre-trained sentence-embedding model (default: all-MiniLM-L6-v2)
Finds the top K most similar PRMS innovations for each AICCRA innovation (default: K = 3)
Returns the ranked matches with their cosine-similarity scores


Best for: Quick results, when you want exactly K matches per innovation
Output: Fixed number of matches per AICCRA with clear ranking

🔄 METHOD 2: Multi-Model Comparison (method2_multi_model)

Purpose: Compare different embedding models to find the best performer
How it works:

Runs several pre-trained models, one after another, on the same data
Compares the average top-match score across models
Records each model's matches so that agreement between models can be inspected


Best for: Model selection, understanding which approach works best for your data
Output: Matches from different models + performance comparison

🔄 METHOD 3: Hybrid TF-IDF + Embedding (method3_hybrid_approach)

Purpose: Combines keyword-based and semantic approaches
How it works:

TF-IDF captures exact keyword matches
Embeddings capture semantic similarity
Weighted combination (default: 60% embedding, 40% TF-IDF)
Calculates confidence based on agreement between methods


Best for: When you want both exact matches AND semantic similarity
Output: Multiple similarity scores + confidence levels

🔄 METHOD 4: Threshold-Based Quality Matching (method4_threshold_based)

Purpose: Quality-focused approach with guaranteed minimum matches
How it works:

Defines quality levels (Excellent ≥80%, Good ≥60%, Moderate ≥40%, Weak ≥20%)
Finds matches at each quality level
Ensures at least 2 matches per AICCRA innovation, adding below-threshold ("Forced") matches if needed
Removes duplicates while maintaining the quality hierarchy


Best for: When you need quality assurance and flexible match quantities
Output: Quality-categorized matches with guaranteed minimums

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class MultiMethodMatcher:
    """
    Combined matching system that implements 4 different approaches:
    1. Top-K Matching with Single Model
    2. Multiple Model Comparison
    3. Hybrid TF-IDF + Embedding
    4. Threshold-based Quality Matching

    Every method stores its result table in ``self.results`` (keys
    ``'method1'`` .. ``'method4'``, plus ``'method2_performance'``) and also
    returns it.

    Embedding similarity matrices are cached per model name, so a model that
    is used by several methods is loaded and run only once. The cache is
    cleared whenever ``load_data`` is called; if ``df_a``/``df_p`` are edited
    by hand afterwards, clear ``_similarity_cache`` as well.
    """

    # Minimum cosine similarity required for each quality label, best first.
    QUALITY_THRESHOLDS = {
        'Excellent': 0.8,    # 80%+ similarity
        'Good': 0.6,         # 60-79% similarity
        'Moderate': 0.4,     # 40-59% similarity
        'Weak': 0.2,         # 20-39% similarity
    }

    def __init__(self, aiccra_file='aiccra_now.csv', prms_file='prms_innovations_complete.csv'):
        """Load both CSV files and start with an empty result store."""
        print("🚀 Initializing Multi-Method Matcher...")
        self.load_data(aiccra_file, prms_file)
        self.results = {}

    def load_data(self, aiccra_file, prms_file):
        """
        Load and prepare the data.

        Reads both CSV files and adds a ``text`` column to each frame:
        Title + Narrative for AICCRA, Title + Description for PRMS
        (missing values are treated as empty strings).
        """
        print("📊 Loading datasets...")

        # Load datasets
        self.df_a = pd.read_csv(aiccra_file)
        self.df_p = pd.read_csv(prms_file)

        # Prepare combined text fields
        self.df_a['text'] = (self.df_a['Title'].fillna('') + ' ' + self.df_a['Narrative'].fillna('')).str.strip()
        self.df_p['text'] = (self.df_p['Title'].fillna('') + ' ' + self.df_p['Description'].fillna('')).str.strip()

        # Cached similarities belong to the previous data set.
        self._similarity_cache = {}

        print(f"✅ Loaded {len(self.df_a)} AICCRA and {len(self.df_p)} PRMS innovations")

    def preprocess_text(self, text):
        """Basic text preprocessing for TF-IDF: lowercase, drop punctuation, collapse whitespace."""
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    # ==================== SHARED HELPERS ====================
    def _embedding_similarity(self, model_name, show_progress_bar=None):
        """
        Return the AICCRA x PRMS cosine-similarity matrix for ``model_name``.

        The matrix is computed on first use and then served from a per-model
        cache. ``show_progress_bar`` is passed straight to ``model.encode``
        (``None`` keeps the library default) and is ignored on a cache hit.
        """
        # setdefault keeps this usable on instances built without load_data().
        cache = self.__dict__.setdefault('_similarity_cache', {})
        if model_name not in cache:
            model = SentenceTransformer(model_name)
            emb_a = model.encode(
                self.df_a['text'].tolist(),
                convert_to_numpy=True,
                show_progress_bar=show_progress_bar,
            )
            emb_p = model.encode(
                self.df_p['text'].tolist(),
                convert_to_numpy=True,
                show_progress_bar=show_progress_bar,
            )
            cache[model_name] = cosine_similarity(emb_a, emb_p)
        return cache[model_name]

    @staticmethod
    def _top_k_indices(scores, k):
        """Return the positions of the ``k`` highest values in ``scores``, best first."""
        return np.argsort(scores)[::-1][:k]

    def _banded_candidates(self, similarities):
        """
        Yield ``(position, quality, rank_in_quality, threshold)`` for one AICCRA row.

        Each quality level covers the half-open band
        ``[threshold, next higher threshold)``, so a match is reported under
        exactly one label and ranks are counted within that label only.
        Up to 3 matches are taken for Excellent/Good and up to 2 for the
        remaining levels. Candidates come out in descending score order.
        """
        upper = np.inf
        bands = sorted(self.QUALITY_THRESHOLDS.items(), key=lambda item: item[1], reverse=True)
        for quality, threshold in bands:
            in_band = np.where((similarities >= threshold) & (similarities < upper))[0]
            ranked = in_band[np.argsort(similarities[in_band])[::-1]]

            max_per_quality = 3 if quality in ('Excellent', 'Good') else 2
            for rank, j in enumerate(ranked[:max_per_quality], start=1):
                yield int(j), quality, rank, threshold

            upper = threshold

    # ==================== METHOD 1: TOP-K MATCHING ====================
    def method1_topk_matching(self, k=3, model_name='all-MiniLM-L6-v2'):
        """
        METHOD 1: Top-K Matching with Single Model
        - Uses one embedding model
        - Returns top K matches for each AICCRA innovation
        - Fast and straightforward approach

        Args:
            k: number of PRMS matches to keep per AICCRA innovation.
            model_name: sentence-transformers model used for the embeddings.

        Returns:
            DataFrame with one row per (AICCRA innovation, rank); also stored
            in ``self.results['method1']``.
        """
        print(f"\n🔄 METHOD 1: Top-K Matching (K={k}) with {model_name}")

        sim_matrix = self._embedding_similarity(model_name, show_progress_bar=True)

        # Positional lists: i/j below are row positions, not index labels.
        a_titles = self.df_a['Title'].tolist()
        p_ids = self.df_p['Result id'].tolist()
        p_titles = self.df_p['Title'].tolist()

        results = []
        for i, a_id in enumerate(self.df_a['Innovation ID']):
            for rank, j in enumerate(self._top_k_indices(sim_matrix[i], k), start=1):
                score = sim_matrix[i, j]
                results.append({
                    'Method': 'TopK_Matching',
                    'AICCRA_ID': a_id,
                    'AICCRA_Title': a_titles[i],
                    'Match_Rank': rank,
                    'PRMS_ID': p_ids[j],
                    'PRMS_Title': p_titles[j],
                    'Score_%': round(float(score) * 100, 2),
                    'Model_Used': model_name,
                    'Match_Quality': self._get_quality_label(score)
                })

        self.results['method1'] = pd.DataFrame(results)
        print(f"✅ Method 1 complete: {len(results)} matches found")
        return self.results['method1']

    # ==================== METHOD 2: MULTI-MODEL COMPARISON ====================
    def method2_multi_model(self, models=None, top_k=2):
        """
        METHOD 2: Multiple Model Comparison
        - Tests different embedding models
        - Compares performance across models
        - Shows which models agree on matches

        Args:
            models: list of sentence-transformers model names; three general
                purpose models are used when omitted.
            top_k: number of matches kept per AICCRA innovation and model.

        Returns:
            DataFrame with the matches of every model that could be run; also
            stored in ``self.results['method2']``. The average match score of
            each model is stored in ``self.results['method2_performance']``.
        """
        if models is None:
            models = [
                'all-MiniLM-L6-v2',           # Fast, general purpose
                'all-mpnet-base-v2',          # Better quality
                'multi-qa-MiniLM-L6-cos-v1',  # Search optimized
            ]

        print(f"\n🔄 METHOD 2: Multi-Model Comparison with {len(models)} models")

        p_ids = self.df_p['Result id'].tolist()
        all_results = []
        model_performances = {}

        for model_name in models:
            print(f"  Testing model: {model_name}")

            # Best effort: a model that cannot be loaded or run is reported
            # and skipped so the remaining models are still compared.
            try:
                sim_matrix = self._embedding_similarity(model_name)
            except Exception as e:
                print(f"  ❌ Error with {model_name}: {e}")
                continue

            model_results = []
            scores = []
            for i, a_id in enumerate(self.df_a['Innovation ID']):
                for rank, j in enumerate(self._top_k_indices(sim_matrix[i], top_k), start=1):
                    score = sim_matrix[i, j]
                    scores.append(float(score))
                    model_results.append({
                        'Method': 'Multi_Model',
                        'AICCRA_ID': a_id,
                        'Match_Rank': rank,
                        'PRMS_ID': p_ids[j],
                        'Score_%': round(float(score) * 100, 2),
                        'Model_Used': model_name,
                        'Match_Quality': self._get_quality_label(score)
                    })

            all_results.extend(model_results)
            # No matches means there is no average to report.
            if scores:
                model_performances[model_name] = sum(scores) / len(scores)

        self.results['method2'] = pd.DataFrame(all_results)
        self.results['method2_performance'] = model_performances

        print(f"✅ Method 2 complete: {len(all_results)} matches across all models")
        print("📊 Model Performance (Average Score):")
        for model, avg_score in model_performances.items():
            print(f"  {model}: {avg_score:.3f}")

        return self.results['method2']

    # ==================== METHOD 3: HYBRID TF-IDF + EMBEDDING ====================
    def method3_hybrid_approach(self, embedding_weight=0.6, top_k=4, model_name='all-mpnet-base-v2'):
        """
        METHOD 3: Hybrid TF-IDF + Embedding Approach
        - Combines keyword-based (TF-IDF) with semantic similarity
        - Weighted combination of both approaches
        - Better handling of both exact matches and semantic similarity

        Args:
            embedding_weight: weight of the embedding similarity; TF-IDF gets
                ``1 - embedding_weight``.
            top_k: number of matches kept per AICCRA innovation.
            model_name: sentence-transformers model used for the embeddings.

        Returns:
            DataFrame with TF-IDF, embedding and combined scores plus a
            confidence label; also stored in ``self.results['method3']``.

        Side effects:
            Adds a ``text_processed`` column to ``df_a`` and ``df_p``.
        """
        print(f"\n🔄 METHOD 3: Hybrid TF-IDF + Embedding (weight={embedding_weight})")

        # Preprocess text for TF-IDF
        self.df_a['text_processed'] = self.df_a['text'].apply(self.preprocess_text)
        self.df_p['text_processed'] = self.df_p['text'].apply(self.preprocess_text)

        # TF-IDF Approach
        print("  Computing TF-IDF similarities...")
        vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.8
        )

        # Fit on both corpora so they share one vocabulary and IDF weighting.
        all_texts = self.df_a['text_processed'].tolist() + self.df_p['text_processed'].tolist()
        vectorizer.fit(all_texts)

        tfidf_a = vectorizer.transform(self.df_a['text_processed'])
        tfidf_p = vectorizer.transform(self.df_p['text_processed'])
        tfidf_sim = cosine_similarity(tfidf_a, tfidf_p)

        # Semantic Embedding Approach
        print("  Computing embedding similarities...")
        emb_sim = self._embedding_similarity(model_name)

        # Combine approaches
        print("  Combining both approaches...")
        combined_sim = embedding_weight * emb_sim + (1 - embedding_weight) * tfidf_sim

        # Positional lists: i/j below are row positions, not index labels.
        a_titles = self.df_a['Title'].tolist()
        p_ids = self.df_p['Result id'].tolist()
        p_titles = self.df_p['Title'].tolist()

        results = []
        for i, a_id in enumerate(self.df_a['Innovation ID']):
            for rank, j in enumerate(self._top_k_indices(combined_sim[i], top_k), start=1):
                tfidf_score = tfidf_sim[i, j]
                emb_score = emb_sim[i, j]
                combined_score = combined_sim[i, j]

                # The closer the two scores, the more the methods agree.
                disagreement = abs(tfidf_score - emb_score)
                if disagreement < 0.2:
                    confidence = "High"
                elif disagreement < 0.4:
                    confidence = "Medium"
                else:
                    confidence = "Low"

                results.append({
                    'Method': 'Hybrid_TF-IDF_Embedding',
                    'AICCRA_ID': a_id,
                    'AICCRA_Title': a_titles[i],
                    'Match_Rank': rank,
                    'PRMS_ID': p_ids[j],
                    'PRMS_Title': p_titles[j],
                    'TF-IDF_Score_%': round(float(tfidf_score) * 100, 2),
                    'Embedding_Score_%': round(float(emb_score) * 100, 2),
                    'Combined_Score_%': round(float(combined_score) * 100, 2),
                    'Confidence': confidence,
                    'Match_Quality': self._get_quality_label(combined_score)
                })

        self.results['method3'] = pd.DataFrame(results)
        print(f"✅ Method 3 complete: {len(results)} matches found")
        return self.results['method3']

    # ==================== METHOD 4: THRESHOLD-BASED MATCHING ====================
    def method4_threshold_based(self, model_name='all-mpnet-base-v2',
                                min_matches_per_aiccra=2, max_matches_per_aiccra=5):
        """
        METHOD 4: Threshold-based Quality Matching
        - Defines multiple quality levels with thresholds
        - Ensures minimum number of matches per innovation
        - Categorizes matches by quality (Excellent/Good/Moderate/Weak)

        Quality levels are disjoint score bands, so each PRMS innovation is
        reported under the single label its score falls into and
        ``Rank_in_Quality`` counts within that label. When fewer than
        ``min_matches_per_aiccra`` matches qualify, the next best PRMS
        innovations are added with the label ``'Forced'``.

        Args:
            model_name: sentence-transformers model used for the embeddings.
            min_matches_per_aiccra: matches guaranteed per AICCRA innovation
                (as far as PRMS innovations are available).
            max_matches_per_aiccra: cap on threshold-qualified matches per
                AICCRA innovation.

        Returns:
            DataFrame of matches with an ``Overall_Rank`` per AICCRA
            innovation; also stored in ``self.results['method4']``.
        """
        print("\n🔄 METHOD 4: Threshold-based Quality Matching")

        sim_matrix = self._embedding_similarity(model_name)

        # Positional lists: i/j below are row positions, not index labels.
        a_titles = self.df_a['Title'].tolist()
        p_ids = self.df_p['Result id'].tolist()
        p_titles = self.df_p['Title'].tolist()

        results = []
        for i, a_id in enumerate(self.df_a['Innovation ID']):
            similarities = sim_matrix[i]
            seen_prms = set()
            unique_matches = []

            # Candidates arrive best first; keep each PRMS ID once, up to the cap.
            for j, quality, rank, threshold in self._banded_candidates(similarities):
                if len(unique_matches) >= max_matches_per_aiccra:
                    break
                if p_ids[j] in seen_prms:
                    continue
                seen_prms.add(p_ids[j])
                unique_matches.append({
                    'Method': 'Threshold_Based',
                    'AICCRA_ID': a_id,
                    'AICCRA_Title': a_titles[i],
                    'PRMS_ID': p_ids[j],
                    'PRMS_Title': p_titles[j],
                    'Score_%': round(float(similarities[j]) * 100, 2),
                    'Match_Quality': quality,
                    'Rank_in_Quality': rank,
                    'Threshold_Used': threshold
                })

            # Ensure minimum matches: top up from the overall ranking,
            # regardless of threshold.
            if len(unique_matches) < min_matches_per_aiccra:
                forced_rank = 0
                for j in np.argsort(similarities)[::-1]:
                    if len(unique_matches) >= min_matches_per_aiccra:
                        break
                    if p_ids[j] in seen_prms:
                        continue
                    forced_rank += 1
                    seen_prms.add(p_ids[j])
                    unique_matches.append({
                        'Method': 'Threshold_Based',
                        'AICCRA_ID': a_id,
                        'AICCRA_Title': a_titles[i],
                        'PRMS_ID': p_ids[j],
                        'PRMS_Title': p_titles[j],
                        'Score_%': round(float(similarities[j]) * 100, 2),
                        'Match_Quality': 'Forced',  # Added only to reach the minimum
                        'Rank_in_Quality': forced_rank,
                        'Threshold_Used': 0.0
                    })

            results.extend(unique_matches)

        # Add overall ranking (skipped when there is nothing to rank).
        df_results = pd.DataFrame(results)
        if not df_results.empty:
            df_results['Overall_Rank'] = (
                df_results.groupby('AICCRA_ID')['Score_%']
                .rank(method='dense', ascending=False)
                .astype(int)
            )

        self.results['method4'] = df_results
        print(f"✅ Method 4 complete: {len(results)} matches found")
        return self.results['method4']

    # ==================== UTILITY METHODS ====================
    def _get_quality_label(self, score):
        """
        Convert similarity score to quality label.

        Returns the first label in ``QUALITY_THRESHOLDS`` whose threshold the
        score reaches; anything below 0.4 (including scores under the 'Weak'
        threshold) is labelled 'Weak'.
        """
        for label, threshold in self.QUALITY_THRESHOLDS.items():
            if score >= threshold:
                return label
        return 'Weak'

    def run_all_methods(self):
        """
        Run all four methods with their defaults, write the CSV reports and
        return the ``self.results`` dictionary.
        """
        print("🚀 Starting Multi-Method Matching Analysis")
        print("=" * 60)

        # Run all methods
        self.method1_topk_matching()
        self.method2_multi_model()
        self.method3_hybrid_approach()
        self.method4_threshold_based()

        # Create summary
        self.create_summary()

        print("\n" + "=" * 60)
        print("🎉 ALL METHODS COMPLETED!")
        print("📁 Results saved to individual CSV files")
        return self.results

    def create_summary(self):
        """
        Create summary of all methods.

        Writes every result table in ``self.results`` to
        ``<method>_results_<timestamp>.csv`` and the summary statistics to
        ``methods_summary_<timestamp>.csv`` in the working directory, then
        prints the summary. Non-DataFrame entries are ignored and empty
        tables are saved but left out of the statistics.
        """
        print("\n📊 CREATING SUMMARY REPORT")

        frames = {
            name: df for name, df in self.results.items()
            if isinstance(df, pd.DataFrame)
        }

        summary_stats = []
        for method_name, df in frames.items():
            if df.empty:
                continue

            # Method 3 reports 'Combined_Score_%' instead of 'Score_%'.
            scores = df.get('Score_%', df.get('Combined_Score_%', pd.Series([0])))
            unique_aiccra = df['AICCRA_ID'].nunique() if 'AICCRA_ID' in df.columns else 0

            summary_stats.append({
                'Method': method_name.replace('method', 'Method '),
                'Total_Matches': len(df),
                'Avg_Score': scores.mean(),
                'Max_Score': scores.max(),
                'Min_Score': scores.min(),
                'Unique_AICCRA': unique_aiccra,
                'Avg_Matches_per_AICCRA': len(df) / unique_aiccra if unique_aiccra else 0
            })

        summary_df = pd.DataFrame(summary_stats)

        # Save all results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        for method_name, df in frames.items():
            filename = f"{method_name}_results_{timestamp}.csv"
            df.to_csv(filename, index=False)
            print(f"  💾 Saved: {filename}")

        # Save summary
        summary_filename = f"methods_summary_{timestamp}.csv"
        summary_df.to_csv(summary_filename, index=False)
        print(f"  💾 Saved: {summary_filename}")

        print("\n📈 SUMMARY STATISTICS:")
        print(summary_df.round(2).to_string(index=False))



In [None]:

# ==================== MAIN EXECUTION ====================
if __name__ == "__main__":
    # Script entry point: run the full comparison, then two tuned reruns.
    banner = "=" * 60

    # Build the matcher from the two input CSV files
    matcher = MultiMethodMatcher(
        aiccra_file='aiccra_real.csv',
        prms_file='prms_innovations_real.csv',
    )

    # Execute every matching strategy with its default settings
    all_results = matcher.run_all_methods()

    # Optional: individual methods can be re-run with custom parameters
    print("\n" + banner)
    print("🔧 CUSTOM METHOD EXAMPLES:")

    # Method 1 again, with a larger K and a stronger embedding model
    custom_topk = matcher.method1_topk_matching(
        k=5,
        model_name='all-mpnet-base-v2',
    )
    print(f"Custom Top-K: {len(custom_topk)} matches")

    # Method 3 again, leaning more heavily on the embedding score
    custom_hybrid = matcher.method3_hybrid_approach(
        embedding_weight=0.8,
        top_k=3,
    )
    print(f"Custom Hybrid: {len(custom_hybrid)} matches")