In [1]:
from ollama import chat
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch

In [2]:
# Load the aggregated projects table and drop the index column that was written into the CSV.
aggregatedProjectsDF = pd.read_csv('projectsAggWResults.csv',sep=",")
aggregatedProjectsDF = aggregatedProjectsDF.drop('Unnamed: 0',axis=1)
# Separate the free-text description from the technical opinion that follows the marker.
# n=1 splits on the first marker only, so a description that repeats the marker cannot
# produce more than two columns; reindex guarantees exactly two columns even when no
# row contains the marker (the second column is then all-NaN).
description_parts = (
    aggregatedProjectsDF['description']
    .str.split('Modalités de réalisation - Avis technique :', n=1, expand=True)
    .reindex(columns=[0, 1])
)
aggregatedProjectsDF[['description','avis_technique']] = description_parts
# Election details use ';' as the field separator.
electionDetails = pd.read_csv('projectDetails.csv',sep=";")

In [3]:
# Keep only the projects of district 3.3, then separate approved from rejected ones.
district_projects = aggregatedProjectsDF.loc[aggregatedProjectsDF['src_district_code'].eq(3.3)].copy()
approval_flag = district_projects['approved_binary']
district_winning_projects = district_projects.loc[approval_flag.eq(1)]
district_losing_projects = district_projects.loc[approval_flag.eq(0)]

# Print both groups, highest vote count first.
summary_columns = ['project_id','project_name', 'votes']
for heading, projects in (
    ("Winning projects\n", district_winning_projects),
    ("Loosing projects\n", district_losing_projects),
):
    print(heading, projects[summary_columns].sort_values(by='votes', ascending=False))

Winning projects
      project_id                                       project_name  votes
158          91  Nichoirs à martinets et chauve souris + pièges...    111
192          94  Installation de pièges à moustiques à Croix Da...     49
91          201  Voilage d'ombres sur les jeux public de plein ...     34
68           93                        Réduire la pollution sonore     33
14           99                              L'ornemental fruitier     29
194          98                               Equipements sportifs     26
83           92  Végétaliser le parking du cimetière de Croix D...     21
Loosing projects
     project_id                                       project_name  votes
12          97  Création d'un terrain de cécifoot (foot pour n...     16
9           95  Végétalisation des murs extérieurs de l'école ...     10


In [4]:
# Reference ordering of project ids that a model's ranking is scored against.
ground_truth = [94, 91, 92, 99, 201, 98, 93]
# Embedding models to benchmark, keyed by the name printed in the results.
embedding_benchmark_models = dict()

In [5]:
def check_out_perf(transformer_output: list[int], ground_truth:list[int]) -> float:
    """Score a predicted ordering against the reference ordering.

    Positions are compared pairwise: position ``i`` counts as correct when
    ``transformer_output[i] == ground_truth[i]``.

    Args:
        transformer_output: Predicted ids, in ranked order.
        ground_truth: Reference ids, in the expected order.

    Returns:
        Number of matching positions divided by ``len(ground_truth)``.
        Returns 0.0 when ``ground_truth`` is empty.
    """
    # Nothing to compare against: avoid dividing by zero.
    if len(ground_truth) == 0:
        return 0.0
    # zip stops at the shorter sequence, so predictions beyond the reference length
    # are ignored instead of raising IndexError, and missing ones count as wrong.
    correct_score = sum(
        1
        for predicted, expected in zip(transformer_output, ground_truth)
        if predicted == expected
    )
    return correct_score/len(ground_truth)

In [6]:
# Select project 95 among the losing projects; it is the project the others are compared to.
test_project_description = district_losing_projects.loc[district_losing_projects['project_id'].eq(95)]
target_row = test_project_description.iloc[0]
target_project = {
    "title": target_row['project_name'],
    "id": target_row['project_id'].item(),  # .item() converts the value to a plain Python scalar
    "description": target_row['description'],
}

**The ordering below is the reference (ground truth) ranking by similarity; each model's ranking is compared against it.**  
answ = [94, 91, 92, 99, 201, 98, 93]

### Model BAAI/bge-m3 

In [7]:
from FlagEmbedding import BGEM3FlagModel

In [8]:
# Load BAAI/bge-m3 through FlagEmbedding with fp16 enabled.
BGEM3 = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
)
# The model is loaded but not registered for the benchmark (registration left disabled).
# NOTE(review): presumably because BGEM3FlagModel does not expose the
# SentenceTransformer-style interface the benchmark loop relies on — confirm.
# embedding_benchmark_models['BGEM3'] = BGEM3

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

### Model intfloat/e5-mistral-7b-instruct (NOGO)

In [9]:
from sentence_transformers import SentenceTransformer

In [10]:
# Intfloat_mistral = SentenceTransformer("intfloat/e5-mistral-7b-instruct")

### Model OrdalieTech/Solon-embeddings-large-0.1 

In [11]:
# Load the Solon embedding model on CPU and register it for the benchmark.
OrdalieTech = SentenceTransformer(
    "OrdalieTech/Solon-embeddings-large-0.1",
    device='cpu',
)
embedding_benchmark_models.update({'OrdalieTech': OrdalieTech})

### Model manu/sentence_croissant_alpha_v0.4 (NOGO)
*My hardware was not able to run it*

In [12]:
# CroissantAlpha = SentenceTransformer("manu/sentence_croissant_alpha_v0.4")

### Model sentence-transformers/sentence-t5-xxl  (NOGO)
*My hardware was not able to run it*

### Model dangvantuan/sentence-camembert-large
*Used in place of sentence-transformers/sentence-t5-xxl, which my hardware was not able to run.*

In [13]:
# Load sentence-camembert-large on CPU and register it for the benchmark.
camembert_large = SentenceTransformer(
    "dangvantuan/sentence-camembert-large",
    device='cpu',
)
embedding_benchmark_models.update({'camembert-large': camembert_large})

No sentence-transformers model found with name dangvantuan/sentence-camembert-large. Creating a new one with mean pooling.


In [18]:
class Project:
    """One project together with its embeddings and similarity scores.

    Every constructor argument is stored unchanged on the instance under an
    attribute of the same name.
    """

    def __init__(
        self,
        project_id,
        project_title,
        description,
        embedded_description,
        embedded_title,
        similarity_score,
        title_similarity_score,
    ):
        # Identity and raw text.
        self.project_id = project_id
        self.project_title = project_title
        self.description = description
        # Embeddings of the description and of the title.
        self.embedded_description = embedded_description
        self.embedded_title = embedded_title
        # Similarity scores for the description and for the title.
        self.similarity_score = similarity_score
        self.title_similarity_score = title_similarity_score

In [19]:
# Benchmark each registered model: embed the target project and every winning project,
# rank the winning projects by title similarity to the target, print the ranking and
# score it against ground_truth.
for model_name,model in embedding_benchmark_models.items():
    embedded_winning_projects = []
    embedded_target_project = model.encode(target_project['description'])
    embedded_target_project_title = model.encode(target_project['title'])
    print(f"{model_name} --------------------")

    for _, row in district_winning_projects.iterrows():
        encoded_description = model.encode(row['description'])
        encoded_title = model.encode(row['project_name'])
        obj = Project(
            project_id=row['project_id'],
            project_title = row['project_name'],
            description= row['description'],
            embedded_description= encoded_description,
            embedded_title=encoded_title,
            similarity_score= model.similarity(embedded_target_project,encoded_description),
            title_similarity_score= model.similarity(embedded_target_project_title,encoded_title)

        )
        embedded_winning_projects.append(obj)

    # Most similar title first.
    embedded_winning_projects.sort(key=lambda x: x.title_similarity_score, reverse=True)
    for proj in embedded_winning_projects:
        print(f"""
            id -> {proj.project_id}
            proj_title -> {proj.project_title}
            description score -> {proj.similarity_score}
            title_score -> {proj.title_similarity_score}
        """)
    # check_out_perf compares entries against the integer ids in ground_truth. Passing
    # the Project objects themselves never equals an int, which pinned the score at 0,
    # so score the ranked project ids instead.
    ranked_project_ids = [int(proj.project_id) for proj in embedded_winning_projects]
    perf_score = check_out_perf(ranked_project_ids,ground_truth)
    print(f"Model perf:{perf_score}")

OrdalieTech --------------------

            id -> 92
            proj_title -> Végétaliser le parking du cimetière de Croix Daurade
            description score -> tensor([[0.4835]])
            title_score -> tensor([[0.4450]])
        

            id -> 201
            proj_title -> Voilage d'ombres sur les jeux public de plein air pour enfants
            description score -> tensor([[0.4381]])
            title_score -> tensor([[0.4428]])
        

            id -> 99
            proj_title -> L'ornemental fruitier
            description score -> tensor([[0.3202]])
            title_score -> tensor([[0.2659]])
        

            id -> 91
            proj_title -> Nichoirs à martinets et chauve souris + pièges à larves ou à adulte de moustique tigre
            description score -> tensor([[0.3940]])
            title_score -> tensor([[0.2647]])
        

            id -> 94
            proj_title -> Installation de pièges à moustiques à Croix Daurade
            descripti