In [1]:
from ollama import chat
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch


**Importing the model and the dataset**

In [2]:
# LangChain wrapper around the Ollama-served deepseek-r1 1.5b model; used by `chain` below.
# NOTE(review): the embeddings cell further down rebinds the name `model` to a
# SentenceTransformer, so this cell has to be re-run before `chain` is rebuilt.
model = OllamaLLM(model="deepseek-r1:1.5b")

In [3]:
# Aggregated per-project results (comma-separated); the 'Unnamed: 0' column is discarded.
aggregatedProjectsDF = (
    pd.read_csv('projectsAggWResults.csv', sep=",")
    .drop(columns='Unnamed: 0')
)

# Election details (semicolon-separated), used later for the voter-overlap computation.
electionDetails = pd.read_csv('projectDetails.csv', sep=";")

Testing the similarity analysis

In [4]:
# Prompt template for the three-candidate similarity smoke test.
# Placeholders filled at invoke time: {target_project}, {desc_1}, {desc_2}, {desc_3}.
# Wording fixed: "as rank them" -> "and rank them", and the single target is
# referred to as "description" (singular) so the instruction is unambiguous.
test_template_en = """
    Given this short project description: {target_project}

    Compare the description given with the following texts and rank them in order of similarity
    to the given description and organize them from the most similar to the least similar:
    1-{desc_1},
    2-{desc_2},
    3-{desc_3}

"""

In [5]:
# Turn the raw template string into a chat prompt, then pipe the prompt into `model`
# so that `chain.invoke({...})` fills the placeholders and queries the LLM.
prompt_en = ChatPromptTemplate.from_template(test_template_en)
chain = prompt_en | model


In [6]:
# French smoke-test payload: one reference sentence plus three candidate sentences.
hydration_example = {
    "target_project": "Il est important de boire suffisamment d’eau chaque jour pour rester en bonne santé.",
    "desc_1": "Une alimentation équilibrée contribue aussi à améliorer l’énergie et la concentration.",
    "desc_2": "Le volcan est entré en éruption soudainement, couvrant le ciel de cendres.",
    "desc_3": "S’hydrater régulièrement aide le corps à bien fonctionner et prévient la fatigue.",
}

# Despite its name, `test_input` holds the text returned by the chain.
test_input = chain.invoke(hydration_example)

print(test_input)

<think>
Okay, so I need to figure out how to rank these three texts based on their similarity to the given project description about sufficient daily water intake. Let me start by understanding each part of the problem.

First, the user provided a short project description: "Il est important de boire suffisamment d’eau chaque jour pour rester en bonne santé." This translates to "It is important to drink enough water every day to stay in good health."

Then, there are three texts that need comparison. Let me read each one carefully.

1. Une alimentation équilibrée contribue aussi à améliorer l’énergie et la concentration.,
2. Le volcan est entré en éruption soudainement, couvrant le ciel de cendres.,
3. S’hydrater régulièrement aide le corps à bien fonctionner et prévient la fatigue.

I need to compare each of these texts with the project description and rank them from most similar to least similar.

Starting with the first text: "Une alimentation équilibrée contribue aussi à améliorer 

As we can see, the model can compare short texts. It takes some time, but it works. Next, we need to see how it performs in a setting closer to our actual use case.

In [7]:
# List the columns available in the aggregated projects table.
print(aggregatedProjectsDF.columns)

Index(['project_id', 'project_name', 'description', 'category', 'cost',
       'district', 'votes', 'title_length', 'description_length',
       'agg_district_code', 'agg_quartiers', 'src_district_code', 'approved',
       'approved_binary'],
      dtype='object')


We are going to apply this analysis district by district and check the results.  
Info on how to format the output: https://ollama.com/blog/structured-outputs  
**Ollama documentation**  
https://github.com/ollama/ollama/blob/main/docs/api.md  
https://github.com/ollama/ollama-python  
https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion  
https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values

In [8]:
# Structured-output schema for a single pair: the reference project and the one
# project judged most similar to it. Both fields are integer project IDs.
# NOTE(review): not referenced by any other cell visible in this notebook.
class PairResponseFormatter(BaseModel):
    target_project: int = Field(description="ID of the project of reference that is being used to compare to other projects")
    similar_project: int = Field(description="The id of the most similar project related to the target project")

# Structured-output schema for a full ranking: the reference project ID plus the
# IDs of the compared projects ordered from most to least similar.
# Its JSON schema is passed as `format=` to the ollama `chat` call further down,
# and the reply is validated back into this model.
class ResponseFormatter(BaseModel):
    target_project: int = Field(description="ID of the project of reference that is being used to compare to other projects")
    ranked_similar_projects: list[int] = Field(description="The id's of similar projects ranked by their similarity from the most similar to the least similar")
    # ranked_similar_projects_score: list[float] = Field(description="the score of the similarity between projects ranked by their similarity from the most similar to the least similar")


# Structured-output schema variant that carries a similarity score per project:
# a mapping from project ID (as a string key) to a float score.
# NOTE(review): not referenced by any other cell visible in this notebook.
class DetailedResponseFormatter(BaseModel):
    target_project: int = Field(description="ID of the project of reference that is being used to compare to other projects")
    ranked_similar_projects: dict[str,float] = Field(description="A dictionary where the key is the project ID and the value is the score of similarity between the projects")

In [9]:
# response = chat(
#     messages = [
#         {
#             'role': 'user',
#             'content': """
#     Given this target project description in french: 
#     target_project : {"id": 1, "description": "Développement d'une application mobile pour la gestion des tâches personnelles avec rappels et synchronisation cloud."}

#     Compare the given description to this other descriptions, formatted in a json format and give me the ID of the project that is
#     the most similar one to the target_project. Consider only similarity between descriptions

#     Other descriptions:
#     [
#         {"id": 4, "description": "Création d'une application mobile de gestion des tâches personnelles, intégrant rappels automatiques et synchronisation entre appareils."},
#         {"id": 3, "description": "Mise en place d'un système web pour la gestion de projets collaboratifs avec calendrier partagé."},
#         {"id": 2, "description": "Développement d'un jeu vidéo éducatif pour enfants en bas âge, avec des animations interactives."}
#     ]

#     Respond with the id of the target project, a list of id's of the similar projects ranked by their similarity ordered by the most similar to the least similar
#     and include in the project list even the projects that are not related at all
#     """
#         }
#     ],
#     model='deepseek-r1:1.5b',
#     format = ResponseFormatter.model_json_schema(),
# )

# test_pair = ResponseFormatter.model_validate_json(response.message.content)
# print(test_pair)

**Now that we can use the LLM to assess the similarity between projects and output that relation in a structured format using a custom prompt, we need to find a way to compute the overlap between the projects' voters.**

In [10]:
def compute_voter_overlap(target_project_id:int, compared_project_id:int, election_details_dataset) -> dict:
    """Measure how many voters two projects have in common.

    Args:
        target_project_id: ``project_id`` of the reference project.
        compared_project_id: ``project_id`` of the project compared against it.
        election_details_dataset: DataFrame exposing at least the columns
            ``project_id`` and ``ID`` (the voter identifier); presumably one row
            per vote cast -- verify against the source file.

    Returns:
        A dict with three keys:

        * ``overlapping_voters``: list of voter IDs that voted for both projects.
        * ``overlap_percentage_target_project``: shared voters divided by the
          number of distinct voters of the target project (a 0..1 fraction).
        * ``overlap_percentage_compared_project``: shared voters divided by the
          number of distinct voters of the compared project (a 0..1 fraction).

        A fraction is ``0.0`` when the corresponding project has no voters in
        the dataset (instead of raising ``ZeroDivisionError``).
    """
    def _distinct_voters(project_id: int) -> set:
        # Distinct voter IDs recorded for one project.
        project_votes = election_details_dataset[election_details_dataset['project_id'] == project_id]
        return set(project_votes['ID'].unique().tolist())

    target_project_voters = _distinct_voters(target_project_id)
    compared_project_voters = _distinct_voters(compared_project_id)

    # Voters present in both projects.
    overlap_set = list(target_project_voters & compared_project_voters)

    def _share_of(voters: set) -> float:
        # Guard against unknown project IDs / projects without any vote.
        return len(overlap_set) / len(voters) if voters else 0.0

    return {
        "overlapping_voters": overlap_set,
        "overlap_percentage_target_project": _share_of(target_project_voters),
        "overlap_percentage_compared_project": _share_of(compared_project_voters),
    }
    

In [11]:
# Sanity check of the helper: project 52 as target, project 51 as compared project.
compute_voter_overlap(52, 51, electionDetails)

{'overlapping_voters': [2465,
  3425,
  2533,
  4519,
  2408,
  3401,
  2139,
  555,
  3373,
  3118,
  558,
  1491,
  1749,
  1401,
  4539],
 'overlap_percentage_target_project': 0.15625,
 'overlap_percentage_compared_project': 0.38461538461538464}

In [12]:
# Projects of the district whose source code is 2.3, split by approval outcome.
district_projects = aggregatedProjectsDF.loc[aggregatedProjectsDF['src_district_code'].eq(2.3)].copy()
district_winning_projects = district_projects.loc[district_projects['approved_binary'].eq(1)]
district_losing_projects = district_projects.loc[district_projects['approved_binary'].eq(0)]

# Both groups are shown with the same columns, most-voted project first.
summary_columns = ['project_id', 'project_name', 'votes']
print(
    "Winning projects\n",
    district_winning_projects[summary_columns].sort_values(by='votes', ascending=False),
)
print(
    "Loosing projects\n",
    district_losing_projects[summary_columns].sort_values(by='votes', ascending=False),
)

Winning projects
      project_id                                       project_name  votes
147          52  Végétaliser pour limiter la surchauffe estival...     96
170          58  Une piste cyclable pour une meilleure continui...     66
69           56  Un verger pour les enfants du bitume (plantati...     64
101          60  Du miel dans mon quartier: produisons de la do...     43
Loosing projects
      project_id                                       project_name  votes
181          51  Végétaliser nos quartiers (projet étudié autou...     39
103          57  Des panneaux solaires sur la Maison de Quartie...     31
140          54  Aménagement du parc Fontaine Lestang (Les Bisc...     31
97           53           Végétalisation d'ombrage rue Jean Mermoz     12
106          59  Des jeux pour les grands (aménagement d’une ai...      7
32           55  La Gironde conviviale et sportive (aménagement...      3


#### Given the objective of this study, the proof of concept of this pipeline is going to go over three cases

1. **Cloned projects** - Two projects that are very similar (similar description) - They share (ideally) no voter overlap -> Possible projects: (28, 27), (51, 52), (95, 92), (151 [98 votes], 156 [178 votes])\*\*
2. **A project with a very low vote count** - Compare that project with a project that was approved -> Possible projects: (176 [2 votes], 169 [189 votes]), (178 [2 votes], 182 [33 votes]), (50 [2 votes], 44 [33 votes])
3. **Two distinct projects**, one approved and the other not, that overlap in voting intention (the same people who voted for project A also voted for project B, but only project B was accepted) -> Possible projects: (95, 91), (103, 102), (109, 110), (122, 125), (151, 156)\*\*

*The pairs are project IDs.*  
\*\* This pairing fits more than one case.

In [13]:
# Target of the comparison: losing project 51 from the selected district.
test_project_description = district_losing_projects[district_losing_projects['project_id']==51]
target_row = test_project_description.iloc[0]
target_project = {
    "id": target_row['project_id'].item(),
    "description": target_row['description'],
}

# Candidates: every winning project of the district, reduced to id + description.
winning_projects = [
    {"id": row['project_id'], "description": row['description']}
    for _, row in district_winning_projects.iterrows()
]

In [14]:
# Full prompt for the structured similarity ranking.
# This is an f-string, so `target_project` and `winning_projects` are interpolated
# right here: the cell must be re-run whenever either of them changes.
# NOTE(review): the dict/list are inserted via their Python string form (single
# quotes), not as strict JSON, although the prompt text says "JSON format".
# Doubled braces ({{ }}) in the example are literal braces in the final prompt.
similarity_template_en = f"""
    You are given a target project description written in French:

target_project:
{target_project}

You are also given a list of other project descriptions in JSON format, each with an associated ID:

{winning_projects}

Your task is to compare the semantic content of the target project description to each of the other descriptions. Determine which projects are most similar based **only on the semantic similarity between the descriptions** (ignore any metadata, IDs, or structure beyond the text content).

Return a ranked list of all the project IDs, from most similar to least similar. Include all projects, even those that are not semantically related.

(Optional Example to guide behavior):

Example:
Target: {{"id": "p4", "description":"La construction d'une école primaire écologique en région rurale."}}
Other projects:
[
  {{"id": "p1", "description": "Création d'un centre médical."}},
  {{"id": "p2", "description": "Construction d'un bâtiment scolaire pour 500 élèves."}},
  {{"id": "p3", "description": "Aménagement d’un parc de loisirs."}}
]
Most similar to least similar:
p2, p1, p3
"""

In [15]:
# Show the fully rendered prompt to check the interpolated project data.
print(similarity_template_en)


    You are given a target project description written in French:

target_project:
{'id': 51, 'description': "Bonjour, Je pense que beaucoup de choses peuvent être améliorées. La première qui me vient à l'esprit est la question de l'environnement urbain. Nous allons vivre des années de plus en plus chaudes et je pense que ces pics de chaleur vont particulièrement se faire ressentir dans les villes où le béton absorbe de façon conséquente la chaleur. Personnellement, quand l'été arrive j'ai souvent l'impression d'étouffer. C'est pourquoi je pense que la végétalisation des espaces serait une solution efficace pour contrer la chaleur. Cela permettrait de créer des zones d'ombres, de faire baisser les températures, d'apporter un peu d'air et de casser l'aspect béton. Il me semble que beaucoup de gens aspirent à voir la nature un peu plus présente en ville ! J'habite dans le quartier des arènes où certes quelques arbres ont été plantés, mais cela ne me semble pas suffisant. Il serait telle

In [16]:
# Send the rendered prompt as a single user message; the reply is constrained
# to the JSON schema derived from ResponseFormatter.
raw_similarity_response = chat(
    model='deepseek-r1:1.5b',
    messages=[{'role': 'user', 'content': similarity_template_en}],
    format=ResponseFormatter.model_json_schema(),
)

# Parse and validate the JSON reply into the typed response object.
similarity_response = ResponseFormatter.model_validate_json(
    raw_similarity_response.message.content
)
print(similarity_response)

target_project=51 ranked_similar_projects=[56, 50, 58]


**CASE 3-Overlapping voters case**

In [17]:
# Case 3 uses the district whose source code is 5.3, split by approval outcome.
case_3_district_projects = aggregatedProjectsDF.loc[aggregatedProjectsDF['src_district_code'].eq(5.3)].copy()
case_3_district_winning_projects = case_3_district_projects.loc[case_3_district_projects['approved_binary'].eq(1)]
case_3_district_losing_projects = case_3_district_projects.loc[case_3_district_projects['approved_binary'].eq(0)]

# Both groups are shown with the same columns, most-voted project first.
case_3_summary_columns = ['project_id', 'project_name', 'votes']
print(
    "CASE 3 Winning projects\n",
    case_3_district_winning_projects[case_3_summary_columns].sort_values(by='votes', ascending=False),
)
print(
    "CASE 3 Loosing projects\n",
    case_3_district_losing_projects[case_3_summary_columns].sort_values(by='votes', ascending=False),
)


CASE 3 Winning projects
      project_id                                       project_name  votes
34          156  Aménager le Rond-Point des Français Libres et ...    178
128         157  Plantation d'arbres et diversification de la p...    128
149         153                Réduction de la pollution lumineuse    113
58          155              Lutter contre la pollution des mégots     63
3           149  Un jardin en mouvement le long de la voie ferr...     50
CASE 3 Loosing projects
      project_id                                       project_name  votes
165         151                     Aménager la place Henri Russel     98
144         150           Coup de pinceau sur le béton du quartier     56
99          154                         Nichoirs pour des mésanges     44
159         148             Plan du voisinage aux sorties de métro     24
187         152  Aire de jeux accessible à tous au jardin du Pa...     14


In [18]:
# Losing project whose voters are compared against every winning project of the district.
case_3_target_project_id = 151

for _, row in case_3_district_winning_projects.iterrows():
    compared_project_id = row['project_id']
    result = compute_voter_overlap(case_3_target_project_id, compared_project_id, electionDetails)
    overlap_to_target = result['overlap_percentage_target_project']
    print(f"target project->{case_3_target_project_id} | project_compared_to->{compared_project_id} | overlapping_score_to_target->{overlap_to_target}")
    print("Overlapping voters")
    print(result['overlapping_voters'], "\n")

target project->151 | project_compared_to->149 | overlapping_score_to_target->0.061224489795918366
Overlapping voters
[4896, 5101, 4913, 1782, 4759, 5017] 

target project->151 | project_compared_to->156 | overlapping_score_to_target->0.6632653061224489
Overlapping voters
[4491, 1037, 4750, 1805, 2576, 1681, 18, 5008, 1301, 2711, 1434, 2716, 2720, 2721, 3874, 4900, 1063, 4906, 813, 1327, 1842, 4147, 3636, 2742, 1591, 4406, 2617, 1335, 2871, 5050, 1597, 2749, 4028, 2499, 2756, 4422, 3912, 715, 77, 2893, 3534, 4049, 5075, 1493, 2902, 2776, 2777, 4186, 5083, 3934, 3170, 4195, 4837, 615, 1129, 3309, 5101, 2416, 3697, 4594, 4979, 2804, 2807, 2043, 893] 

target project->151 | project_compared_to->155 | overlapping_score_to_target->0.061224489795918366
Overlapping voters
[165, 4841, 3309, 2871, 4889, 4186] 

target project->151 | project_compared_to->157 | overlapping_score_to_target->0.19387755102040816
Overlapping voters
[1681, 4759, 3352, 2716, 1951, 2721, 3874, 4900, 813, 4913, 2756, 442

**We can also try a different approach for the pipeline**  
Instead of using the LLM for the semantic comparison between descriptions, we can use embeddings, which are faster and deterministic.  

In [19]:
# Text to embed: the description of the target project.
sentence = target_project['description']

# Models noted by the author: sentence-transformers/LaBSE, Alibaba-NLP/gte-Qwen2-1.5B-instruct
# NOTE(review): this rebinds `model`, which earlier referred to the OllamaLLM instance.
model = SentenceTransformer('Alibaba-NLP/gte-Qwen2-1.5B-instruct', device='cpu')
embeddings = model.encode(sentence)

# Keep the id, the raw text and its embedding vector together.
embedded_target_project = dict(
    project_id=target_project['id'],
    description=target_project['description'],
    embedded_description=embeddings,
)
print(embedded_target_project)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/284 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/146k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/901 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

{'project_id': 51, 'description': "Bonjour, Je pense que beaucoup de choses peuvent être améliorées. La première qui me vient à l'esprit est la question de l'environnement urbain. Nous allons vivre des années de plus en plus chaudes et je pense que ces pics de chaleur vont particulièrement se faire ressentir dans les villes où le béton absorbe de façon conséquente la chaleur. Personnellement, quand l'été arrive j'ai souvent l'impression d'étouffer. C'est pourquoi je pense que la végétalisation des espaces serait une solution efficace pour contrer la chaleur. Cela permettrait de créer des zones d'ombres, de faire baisser les températures, d'apporter un peu d'air et de casser l'aspect béton. Il me semble que beaucoup de gens aspirent à voir la nature un peu plus présente en ville ! J'habite dans le quartier des arènes où certes quelques arbres ont été plantés, mais cela ne me semble pas suffisant. Il serait tellement plus agréable de voir de l'herbe, des fleurs et cie à la place de ces t

In [20]:
# Dimensionality of the target project's embedding vector.
print(embeddings.shape)

(1536,)


In [21]:
# One record per winning project: id, raw description and its embedding
# (each description is encoded individually, in DataFrame row order).
embedded_winning_projects = [
    {
        "project_id": row['project_id'],
        "description": row['description'],
        "embedded_description": model.encode(row['description']),
    }
    for _, row in district_winning_projects.iterrows()
]

In [22]:
# Print the project ids in embedding order while collecting the bare vectors;
# the positions in `embedding_winners` therefore match the printed id order.
embedding_winners = []
for winner_record in embedded_winning_projects:
    print(winner_record['project_id'])
    embedding_winners.append(winner_record['embedded_description'])
print(len(embedding_winners))

56
60
52
58
4


In [23]:
# Stack the per-project embeddings into one (n_projects, dim) tensor before
# computing similarities. Passing the Python list of numpy arrays directly makes
# the library call torch.tensor(list), which is slow and emits a UserWarning.
winner_embedding_matrix = torch.stack([torch.as_tensor(vector) for vector in embedding_winners])

# Similarity of the target embedding against every winning project; columns
# follow the order of `embedding_winners`.
similarities = model.similarity(embedded_target_project['embedded_description'], winner_embedding_matrix)

  a = torch.tensor(a)


In [24]:
# One similarity score per winning project, in the order of `embedding_winners`.
print(similarities)

tensor([[0.7539, 0.7389, 0.8527, 0.7960]])


**This works significantly faster and more consistently than the conversational model option, but the similarity values can differ from model to model.**  
Benchmarking the best-performing embedding models would be a good next step.