In [1]:
import os

# Runtime knobs are exported BEFORE the numeric / deep-learning libraries are
# imported: native thread pools and the CUDA runtime may read these variables
# while their libraries are being loaded, so setting them afterwards can be
# silently ignored.
os.environ['OMP_NUM_THREADS'] = '1'
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from scipy import stats
import matplotlib
matplotlib.style.use('ggplot')

from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
import faiss
from sklearn.metrics.pairwise import cosine_similarity
import gc

# Single place where the compute device is chosen; falls back to CPU when no
# CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from tqdm.autonotebook import tqdm, trange





In [2]:
# Hugging Face Hub access settings.
#
# NOTE(review): huggingface_hub is understood to read HF_HOME / HF_ENDPOINT
# when it is first imported. If sentence_transformers has already been
# imported in this kernel, confirm these values actually take effect (or set
# them before the import cell runs).
os.environ.update({
    # Hub cache directory. The author notes this can also be done by
    #   os.environ['TRANSFORMERS_CACHE'] = '/data/shared_models/'
    'HF_HOME': '/data/shared_models/',
    # Download mirror.
    'HF_ENDPOINT': 'https://hf-mirror.com',
})

In [6]:
class ensemble_nlp_model(TransformerMixin, BaseEstimator):
    """Nearest-neighbour text matcher built on three sentence-embedding models.

    ``fit`` embeds a reference table (one text per key) with every model and
    stores a weighted combination of the embeddings. ``predict`` embeds
    question rows the same way and retrieves the ``K`` closest reference rows
    with a FAISS L2 index.

    The three checkpoints are taken positionally from ``path_list`` as
    ``[bge, gte, mpnetv2]``.
    """

    batch_size = 16  # batch size handed to the encoders' ``encode`` call

    K = 10     # neighbours retrieved per query row
    VER = 5    # not read anywhere inside this class
    D = 1024   # placeholder width; predict() overwrites it with the real one
    # Ensemble weights: weight1 -> gte, weight2 -> bge, weight3 -> mpnetv2.
    # NOTE(review): fit() and predict() previously disagreed on which of
    # weight2 / weight3 belongs to bge vs mpnetv2; the predict() pairing is
    # now used on both sides so map and query vectors live in the same
    # space. Confirm this is the intended pairing.
    weight1, weight2, weight3 = 0.5, 0.35, 0.29

    def __init__(self, path_list):
        """Load one encoder per entry of ``path_list``.

        Parameters:
            path_list (list[str]): checkpoint names/paths, ordered as
                [bge, gte, mpnetv2].
        """
        # Stored under the parameter's own name so BaseEstimator.get_params()
        # (and therefore repr()) can find it.
        self.path_list = path_list
        # Per-instance containers: as class attributes they were shared, so
        # every new instance kept appending to the same list/dict.
        self.models = []
        self.map_Dict = {}
        for path in path_list:
            self.models.append(self.init_model(path))
        print("nlp init complete")

    def init_model(self, model_path):
        """Instantiate a SentenceTransformer, move it to ``device`` and put it
        in eval mode.

        Returns the model wrapped in ``torch.nn.DataParallel``.
        """
        model = SentenceTransformer(model_path, local_files_only=False, trust_remote_code=True)
        model.to(device)
        # NOTE: the encode helpers below call ``encode`` on the wrapped
        # ``.module`` directly, so this wrapper does not by itself spread
        # encoding over several GPUs. It is kept so ``self.models`` keeps
        # holding the same kind of object as before.
        model = torch.nn.DataParallel(model)
        model.eval()
        return model

    def _named_models(self):
        """Return ``(bge, gte, mpnetv2)`` from ``self.models``.

        Raises:
            ValueError: if the instance was not built from exactly three paths.
        """
        if len(self.models) != 3:
            raise ValueError(
                "ensemble_nlp_model needs exactly three models ordered as "
                f"[bge, gte, mpnetv2]; got {len(self.models)}"
            )
        bge_model, gte_model, mpnetv2_model = self.models
        return bge_model, gte_model, mpnetv2_model

    def _combine(self, bge_vec, gte_vec, mpnetv2_vec):
        """Blend the three per-model embedding matrices into one float32 matrix.

        When all matrices have the same width the result is the weighted sum.
        When widths differ a sum is undefined, so the weighted blocks are
        concatenated instead; the squared L2 distance between two such rows
        is then the weight-squared sum of the per-model squared distances.
        """
        parts = [
            self.weight1 * np.asarray(gte_vec, dtype=np.float32),
            self.weight2 * np.asarray(bge_vec, dtype=np.float32),
            self.weight3 * np.asarray(mpnetv2_vec, dtype=np.float32),
        ]
        if len({part.shape[1] for part in parts}) == 1:
            combined = parts[0] + parts[1] + parts[2]
        else:
            combined = np.concatenate(parts, axis=1)
        # FAISS expects C-contiguous float32 input.
        return np.ascontiguousarray(combined, dtype=np.float32)

    def fit(self, map_data, key_name, value_name, y=None):
        """Embed the reference table and store the ensemble vectors.

        Parameters:
            map_data (pandas.DataFrame): reference table.
            key_name (str): column holding the identifier of each row.
            value_name (str): column holding the text to embed.
            y: ignored.

        Returns:
            self
        """
        print('model fit')
        bge_model, gte_model, mpnetv2_model = self._named_models()

        self.map = map_data
        # Row position -> key; FAISS answers with row positions.
        self.map_ids = map_data[key_name].tolist()
        # Rebuilt on every fit so stale entries from an earlier table vanish.
        self.map_Dict = dict(zip(self.map_ids, map_data[value_name].tolist()))

        self.map_bge_vect = self.encode_map(map_data, bge_model, text_column=value_name)
        self.map_gte_vect = self.encode_map(map_data, gte_model, text_column=value_name)
        self.map_mpnetv2_vect = self.encode_map(map_data, mpnetv2_model, text_column=value_name)

        self.map_ensemble_vect = self._combine(
            self.map_bge_vect, self.map_gte_vect, self.map_mpnetv2_vect
        )
        return self

    def transform(self, X):
        """Identity transform: logs and returns ``X`` unchanged."""
        print('nlp transform')

        return X

    def search_faiss(self, k, d, vectors_to_add, query_vectors):
        """
        Perform a FAISS search with L2 distance.

        Parameters:
            k (int): Number of nearest neighbors to search for.
            d (int): Dimension of the vectors.
            vectors_to_add (numpy.ndarray): The vectors to add to the FAISS index.
            query_vectors (numpy.ndarray): The vectors to search for the nearest neighbors.

        Returns:
            D (numpy.ndarray): The distances to the k nearest neighbors.
            I (numpy.ndarray): The indices of the k nearest neighbors.
        """
        # Create the index
        index = faiss.IndexFlatL2(d)

        # Add vectors to the index (coerced to the layout FAISS requires)
        index.add(np.ascontiguousarray(vectors_to_add, dtype=np.float32))

        # Search for k nearest neighbors
        D, I = index.search(np.ascontiguousarray(query_vectors, dtype=np.float32), k)

        return D, I

    def encode_map(self, X, model, progress_bar=True, text_column="MisconceptionName"):
        """Embed the ``text_column`` of ``X`` with ``model``.

        ``model`` may be the DataParallel wrapper produced by ``init_model``
        or a bare encoder exposing ``encode``.
        """
        encoder = getattr(model, "module", model)
        misconception_mapping_vec = encoder.encode(
            X[text_column].to_list(),
            batch_size=self.batch_size,
            normalize_embeddings=True,
            show_progress_bar=progress_bar,
        )

        torch.cuda.empty_cache()
        gc.collect()

        return misconception_mapping_vec

    def encode_texts(self, X, model, progress_bar=True):
        """Embed one prompt per row of ``X`` with ``model``.

        The prompt joins ConstructName, SubjectName, QuestionText and
        AnswerText. ``X`` itself is left untouched (no helper column is
        written back into the caller's frame).
        """
        prompts = [
            f"<Construct>{construct} <Subject>{subject} <Question>{question} <Answer>{answer}"
            for construct, subject, question, answer in zip(
                X['ConstructName'], X['SubjectName'], X['QuestionText'], X['AnswerText']
            )
        ]

        encoder = getattr(model, "module", model)
        all_text_vec = encoder.encode(
            prompts,
            batch_size=self.batch_size,
            normalize_embeddings=True,
            show_progress_bar=progress_bar,
        )

        torch.cuda.empty_cache()
        gc.collect()

        return all_text_vec

    def encode_text_all_models(self, X):
        """Embed ``X`` with all three models.

        Stores and returns ``(bge_text_vec, gte_text_vec, mpnetv2_text_vec)``.
        """
        bge_model, gte_model, mpnetv2_model = self._named_models()
        self.bge_text_vec = self.encode_texts(X, bge_model)
        self.gte_text_vec = self.encode_texts(X, gte_model)
        self.mpnetv2_text_vec = self.encode_texts(X, mpnetv2_model)
        return self.bge_text_vec, self.gte_text_vec, self.mpnetv2_text_vec

    def predict(self, X):
        """Print and return the closest reference keys for every row of ``X``.

        Returns:
            numpy.ndarray: shape (len(X), k) array of reference keys, nearest
            first, where k is ``K`` capped at the number of reference rows.

        Raises:
            AttributeError: if ``fit`` has not been called yet.
        """
        if not hasattr(self, "map_ensemble_vect"):
            raise AttributeError("ensemble_nlp_model.predict() called before fit()")

        # Never ask FAISS for more neighbours than there are reference rows;
        # it would pad the answer with -1.
        k = min(self.K, self.map_ensemble_vect.shape[0])
        if len(X) == 0:
            return np.empty((0, k), dtype=np.int64)

        self.encode_text_all_models(X)

        ensemble_text_vec = self._combine(
            self.bge_text_vec, self.gte_text_vec, self.mpnetv2_text_vec
        )
        self.D = ensemble_text_vec.shape[1]

        _, ensemble_indices = self.search_faiss(k, self.D, self.map_ensemble_vect, ensemble_text_vec)
        self.print_predict(ensemble_indices)
        return np.asarray(self.map_ids)[ensemble_indices]

    def print_predict(self, indicies):
        """Print the ranked candidates for every query row.

        ``indicies`` holds row positions in the fitted reference table; each
        one is translated to its key before the text lookup. When no table
        has been fitted the value is used as the key directly.
        """
        row_keys = getattr(self, "map_ids", None)
        for question_no, row in enumerate(indicies, start=1):
            print(f'Question {question_no}:')
            # The rank restarts at 1 for every question.
            for rank, position in enumerate(row, start=1):
                position = int(position)
                if position < 0:
                    # FAISS placeholder for "no neighbour found".
                    continue
                key = row_keys[position] if row_keys is not None else position
                print(f"Possible Misconception {rank}: #{key}: {self.map_Dict[key]}")

In [3]:
# Checkpoint names later handed to SentenceTransformer, one per encoder.
BGE_path, GTE_path, MPNetV2_path = (
    'BAAI/bge-small-en',
    'thenlper/gte-small',
    'all-mpnet-base-v2',
)

In [4]:
# Checkpoints for the ensemble, listed in BGE, GTE, MPNetV2 order.
path_list = [
    BGE_path,
    GTE_path,
    MPNetV2_path,
]

In [7]:
# Build the ensemble; this loads every checkpoint named in path_list.
model = ensemble_nlp_model(path_list)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

nlp init complete
