In [11]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from SimEngine.models.embedding import CAMeL, AraBERTv2, MARBERT, FastTextArabicEmbedder, TFIDFEmbedder, EmbeddingInterface
from SimEngine.models.ner import Hatmimoha, NERInterface
from SimEngine.preprocessing import TFIDFPreprocessor, ArabertPreprocessor, HardPreprocessor
from SimEngine.similarity_engine import SimilarityEngine

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Test sentences

In [12]:
df = pd.read_csv('data/joined_split_by_agency/joined_جامعة الملك سعود.csv')
# agency_name = 'معهد الادارة العامة'
x1 = df['fc_description'].values[:100]
x2 = x1.copy()

## Run `SimEngine`

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from SimEngine.models.embedding import CAMeL, AraBERTv2, MARBERT, FastTextArabicEmbedder, TFIDFEmbedder, EmbeddingInterface, TransformerEmbedder
from SimEngine.models.ner import Hatmimoha, NERInterface
from SimEngine.preprocessing import TFIDFPreprocessor, ArabertPreprocessor, HardPreprocessor
from SimEngine.similarity_engine import SimilarityEngine


finetuned_model = TransformerEmbedder(
    model='model_finetuned',
    tokenizer='model_finetuned',
    pooling_strategy='mean'
)

# Initialize the embedding models
embedding = EmbeddingInterface(embedding_model=finetuned_model) 

# Initialize the similarity engine
engine = SimilarityEngine(
                          embedding_interface = embedding, # Embedding models to use
                          threshold = 0.8, # Min similarity score to consider
                          top_k = None, # Return top k similar entires 
                          )

sim_dict = engine.fit(x1 = x1, x2 = x2)

## `sim_dict` has those attributes

```python
    class SimilarityDict:
        x1: List[str] # corpus1
        x2: List[str] # corpus2
        similarity_matrix: csr_matrix # len(corpus1) x len(corpus2) matrix of simialrity scores
        similarity_dict: Dict[str, Dict[str, float]] # {corpus1_sentence : { corpus2_sentence : score}}
        threshold: Optional[float] # Min similarity score to report
        top_k: Optional[int] # Top k corpus2_sentence to report

In [4]:
from pprint import pprint
numbers = [1,2,3,4,5,6,7,8,9,10]
for key,value in sim_dict.similarity_dict.items():
    #clean key of numbers
    for number in numbers:
        key = key.replace(str(number), '')
    if key:
        pprint({key:value})
    

{'سلفه مؤقته خاصة لصيانة السيارات المسترجعة': {'سلفه مؤقته خاصة لصيانة السيارات المسترجعة': 0.99999994,
                                               'صيانة السيارات المسترجعة': 0.8302753}}
{'صيانة السيارات المسترجعة': {'سلفه مؤقته خاصة لصيانة السيارات المسترجعة': 0.8302753,
                              'صيانة السيارات المسترجعة': 1.0000001,
                              'قطع غيار السيارات': 0.8117895}}
{'تأمين إحتياجات المركز الجامعي للسكر': {'تأمين إحتياجات المركز الجامعي للسكر': 0.99999994}}
{'رحلات علميه': {'رحلات علميه': 0.99999946}}
{'أحتياجات الكلية الضرورية والمستعجلة': {'أحتياجات الكلية الضرورية والمستعجلة': 1.0000001}}
{'تفرغ علمي': {'تفرغ علمي': 1.0}}
{'مكافأة مناقشة الرسائل ': {'دعم أبحاث عمادة البحث العلمي': 0.81537986,
                            'دعم مركز البحوث': 0.8173649,
                            'مستحقات مناقشة رسائل': 0.8034914,
                            'مكافأة أعضاء مناقشة رسائل': 0.94366467,
                            'مكافأة مناقشة الرسائل ': 1.0000005,


## Convert to excel

In [5]:
HardPreprocessor().transform(x1)

TypeError: string operation on non-string array

In [None]:
%load_ext autoreload
%autoreload 2

from SimEngine.utils import convert_sim_dict_to_excel
convert_sim_dict_to_excel(sim_dict, f'{agency_name}_similarity.xlsx')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Other stuff

In [None]:
from diffusers import DiffusionPipeline
import torch

pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", custom_pipeline="latent_consistency_txt2img", custom_revision="main")

# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
pipe.to(torch_device="mps", torch_dtype=torch.float32)

prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"

# Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps.
num_inference_steps = 2

images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images


Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
Loading pipeline components...:  33%|███▎      | 2/6 [00:07<00:15,  3.77s/it]


ValueError: Non-consecutive added token '<|startoftext|>' found. Should have index 49408 but has index 49406 in saved vocabulary.