In [75]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import pickle
import ast
import os
from langchain_groq import ChatGroq

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

In [77]:
test_data = pd.read_csv('drugsComTest_raw.tsv',sep = '\t')
train_data = pd.read_csv('drugsComTrain_raw.tsv',sep = '\t')

In [78]:
df = pd.concat([train_data, test_data], axis=0)

In [79]:
with open('y_pred_glove_bilstm.pkl', 'rb') as fichier:
    BiLSTM_GloVe = pickle.load(fichier)

with open('y_pred_llm2vec_rand_forest.pkl', 'rb') as fichier:
    RandForest_llm2vec = pickle.load(fichier)

In [80]:
df['BiLSTM_GloVe'] = BiLSTM_GloVe
df['RandForest_llm2vec'] = RandForest_llm2vec

In [81]:
df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,BiLSTM_GloVe,RandForest_llm2vec
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,0.997195,0.96
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,0.999868,0.95
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,0.000784,0.04
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,0.99929,0.89
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,1.0,1.0


In [82]:
#models weights
weights = {
    'BiLSTM_GloVe': 0.4,
    'RandForest_llm2vec': 0.6}

In [83]:
df['weighted_prediction'] = sum(df[model] * weight for model, weight in weights.items())

# ACWS FUNCTION

In [None]:
def calculate_final_score(weighted_prediction, useful_count):
    # Normalisation de useful_count entre 0 et 1
    max_useful_count = 1291 #max useful count
    normalized_useful_count = useful_count / max_useful_count
    
    # Fonction de transformation pour weighted_prediction
    # Transformation de [0,1] vers [-5,5]
    base_score = (weighted_prediction * 10) - 5
    
    # Facteur d'influence basé sur useful_count
    # Utilisation d'une fonction sigmoïde modifiée pour une transition douce
    influence_factor = 1 / (1 + np.exp(-10 * (normalized_useful_count - 0.5)))
    
    # Calcul du score final avec pondération
    if normalized_useful_count < 0.1:  # Très bas useful_count
        final_score = base_score
    else:
        # Amplification non-linéaire basée sur useful_count
        amplification = np.power(normalized_useful_count, 0.5)
        final_score = base_score * (1 + (influence_factor * amplification))
        
        # Garantir que le score reste dans [-5, 5]
        final_score = np.clip(final_score, -5, 5)
    
    return round(final_score, 3)

In [85]:
df['final_score'] = df.apply(lambda row: calculate_final_score(row['weighted_prediction'], row['usefulCount']), axis=1)

### Importation data feature extraction

In [None]:
df2 = pd.read_excel('data_processed_final.xlsx')

In [87]:
df2

Unnamed: 0.1,Unnamed: 0,drugName,feature extraction,drugName_y,condition,review,rating,date,usefulCount
0,92703,Lybrel,"[-3, 1, -1, -2, 4, -2, 7]",Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,"December 14, 2009",17
1,48928,Ethinyl estradiol / levonorgestrel,"[-4, 2, 3, 2, 4, 3, 3]",Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8,"December 8, 2016",1
2,75612,L-methylfolate,"[4, 3, 2, 5, 5, 4, 5]",L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10,"March 9, 2017",54
3,227020,Etonogestrel,"[-4, -3, -2, -1, -2, -3, -2]",Etonogestrel,Birth Control,"""Nexplanon does its job. I can have worry free...",9,"August 11, 2014",11
4,96233,Sertraline,"[-3, 2, 2, 4, 4, 2, 4]",Sertraline,Depression,"""1 week on Zoloft for anxiety and mood swings....",8,"May 7, 2011",3
...,...,...,...,...,...,...,...,...,...
62173,9891,Lo Loestrin Fe,"[-3, 2, 2, 2, 4, -2, 4]",Lo Loestrin Fe,Birth Control,"""It works, but stressful experience. This pill...",10,"April 4, 2014",17
62174,144201,Etonogestrel,"[-3, 1, 2, -4, 2, -3, -4]",Etonogestrel,Birth Control,"""This is THE absolute worst birth control and ...",1,"June 7, 2016",2
62175,85053,Ethinyl estradiol / norgestimate,"[-3, -4, -2, -4, -2, -3, -4]",Ethinyl estradiol / norgestimate,Birth Control,"""I just finished my first month of being on th...",5,"December 8, 2015",0
62176,148141,Norethindrone,"[-3, 2, 2, 3, 4, 4, 4]",Norethindrone,Birth Control,"""I&#039;ve tried many, many forms of BCPs. Hav...",9,"July 15, 2015",20


In [88]:
data = df2.merge(df, on='Unnamed: 0', how='left')

In [89]:
data

Unnamed: 0.1,Unnamed: 0,drugName_x,feature extraction,drugName_y,condition_x,review_x,rating_x,date_x,usefulCount_x,drugName_y.1,condition_y,review_y,rating_y,date_y,usefulCount_y,BiLSTM_GloVe,RandForest_llm2vec,weighted_prediction,final_score
0,92703,Lybrel,"[-3, 1, -1, -2, 4, -2, 7]",Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,"December 14, 2009",17,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,0.000784,0.04,0.024313,-4.757
1,48928,Ethinyl estradiol / levonorgestrel,"[-4, 2, 3, 2, 4, 3, 3]",Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8,"December 8, 2016",1,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8.0,"December 8, 2016",1,0.999983,0.92,0.951993,4.520
2,75612,L-methylfolate,"[4, 3, 2, 5, 5, 4, 5]",L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10,"March 9, 2017",54,L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10.0,"March 9, 2017",54,0.999982,1.00,0.999993,5.000
3,227020,Etonogestrel,"[-4, -3, -2, -1, -2, -3, -2]",Etonogestrel,Birth Control,"""Nexplanon does its job. I can have worry free...",9,"August 11, 2014",11,Etonogestrel,Birth Control,"""Nexplanon does its job. I can have worry free...",9.0,"August 11, 2014",11,0.999861,0.96,0.975944,4.759
4,96233,Sertraline,"[-3, 2, 2, 4, 4, 2, 4]",Sertraline,Depression,"""1 week on Zoloft for anxiety and mood swings....",8,"May 7, 2011",3,Sertraline,Depression,"""1 week on Zoloft for anxiety and mood swings....",8.0,"May 7, 2011",3,0.963192,0.90,0.925277,4.253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62173,9891,Lo Loestrin Fe,"[-3, 2, 2, 2, 4, -2, 4]",Lo Loestrin Fe,Birth Control,"""It works, but stressful experience. This pill...",10,"April 4, 2014",17,Lo Loestrin Fe,Birth Control,"""It works, but stressful experience. This pill...",10.0,"April 4, 2014",17,0.999137,0.96,0.975655,4.757
62174,144201,Etonogestrel,"[-3, 1, 2, -4, 2, -3, -4]",Etonogestrel,Birth Control,"""This is THE absolute worst birth control and ...",1,"June 7, 2016",2,Etonogestrel,Birth Control,"""This is THE absolute worst birth control and ...",1.0,"June 7, 2016",2,0.000598,0.10,0.060239,-4.398
62175,85053,Ethinyl estradiol / norgestimate,"[-3, -4, -2, -4, -2, -3, -4]",Ethinyl estradiol / norgestimate,Birth Control,"""I just finished my first month of being on th...",5,"December 8, 2015",0,Ethinyl estradiol / norgestimate,Birth Control,"""I just finished my first month of being on th...",5.0,"December 8, 2015",0,0.331228,0.15,0.222491,-2.775
62176,148141,Norethindrone,"[-3, 2, 2, 3, 4, 4, 4]",Norethindrone,Birth Control,"""I&#039;ve tried many, many forms of BCPs. Hav...",9,"July 15, 2015",20,Norethindrone,Birth Control,"""I&#039;ve tried many, many forms of BCPs. Hav...",9.0,"July 15, 2015",20,0.999943,0.98,0.987977,4.880


In [90]:
data.rename(columns={"drugName_x": "drugName"}, inplace=True)
data.rename(columns={"condition_x": "condition"}, inplace=True)
data.rename(columns={"rating_x": "rating"}, inplace=True)

data = data[['Unnamed: 0', 'drugName','condition','rating', 'final_score', 'feature extraction']]

In [91]:
def ajuster_vecteur(vecteur):
    try:
        vecteur = ast.literal_eval(vecteur)
        if len(vecteur) == 7:
            return [max(-5, min(5, val)) for val in vecteur]
    except (ValueError, SyntaxError):
        return None
    return None
    
data['feature extraction'] = data['feature extraction'].apply(ajuster_vecteur)
data = data.dropna(subset=['feature extraction'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['feature extraction'] = data['feature extraction'].apply(ajuster_vecteur)


# COSINE SIMILARITY FOR RANKING

In [92]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from difflib import get_close_matches

In [93]:
# Pondération pour le score final
weight = 2  # Pondération pour le score final

data["weighted_features"] = data.apply(
    lambda row: row["feature extraction"] + [row["final_score"] * weight], axis=1
)

In [94]:
valeurs_frequentes = data['condition'].value_counts()
resultat = valeurs_frequentes[valeurs_frequentes > 20].index.tolist()


In [95]:
data

Unnamed: 0.1,Unnamed: 0,drugName,condition,rating,final_score,feature extraction,weighted_features
0,92703,Lybrel,Birth Control,5,-4.757,"[-3, 1, -1, -2, 4, -2, 5]","[-3, 1, -1, -2, 4, -2, 5, -9.514]"
1,48928,Ethinyl estradiol / levonorgestrel,Birth Control,8,4.520,"[-4, 2, 3, 2, 4, 3, 3]","[-4, 2, 3, 2, 4, 3, 3, 9.04]"
2,75612,L-methylfolate,Depression,10,5.000,"[4, 3, 2, 5, 5, 4, 5]","[4, 3, 2, 5, 5, 4, 5, 10.0]"
3,227020,Etonogestrel,Birth Control,9,4.759,"[-4, -3, -2, -1, -2, -3, -2]","[-4, -3, -2, -1, -2, -3, -2, 9.518]"
4,96233,Sertraline,Depression,8,4.253,"[-3, 2, 2, 4, 4, 2, 4]","[-3, 2, 2, 4, 4, 2, 4, 8.506]"
...,...,...,...,...,...,...,...
62173,9891,Lo Loestrin Fe,Birth Control,10,4.757,"[-3, 2, 2, 2, 4, -2, 4]","[-3, 2, 2, 2, 4, -2, 4, 9.514]"
62174,144201,Etonogestrel,Birth Control,1,-4.398,"[-3, 1, 2, -4, 2, -3, -4]","[-3, 1, 2, -4, 2, -3, -4, -8.796]"
62175,85053,Ethinyl estradiol / norgestimate,Birth Control,5,-2.775,"[-3, -4, -2, -4, -2, -3, -4]","[-3, -4, -2, -4, -2, -3, -4, -5.55]"
62176,148141,Norethindrone,Birth Control,9,4.880,"[-3, 2, 2, 3, 4, 4, 4]","[-3, 2, 2, 3, 4, 4, 4, 9.76]"


In [186]:
def recommend_drugs(user_condition, user_features, data, top_n=10):
    # Ajouter le score final pondéré de l'utilisateur
    user_features_weighted = user_features + [5 * weight]
    user_vector = np.array(user_features_weighted).reshape(1, -1)

    # Filtrer uniquement les médicaments avec la même condition
    same_condition = data[data["condition"].isin(user_condition)].copy()

    if same_condition.empty:
        print("Aucun médicament trouvé pour la condition donnée.")
        return pd.DataFrame(columns=["drugName", "condition", "similarity"])

    # Calculer les similarités uniquement sur ce sous-ensemble
    drug_vectors = np.array(same_condition["weighted_features"].tolist())
    similarities = cosine_similarity(user_vector, drug_vectors).flatten()

    # Ajouter les similarités au DataFrame filtré
    same_condition["similarity"] = similarities

    # Trier par similarité décroissante
    same_condition = same_condition.sort_values(by="similarity", ascending=False)

    # Supprimer les doublons sur le nom du médicament
    unique_recommendations = same_condition.drop_duplicates(subset=["drugName"], keep="first")

    # Retourner les top_n
    recommendations = unique_recommendations.head(top_n)

    print(recommendations[["drugName", "condition", "similarity","rating" ]])
    return recommendations[["drugName", "condition", "similarity", "rating"]]


In [187]:
conditions = data['condition'].unique().tolist()
conditions

['Birth Control', 'Depression', 'Pain', 'Acne', 'Anxiety']

### Condition extraction

In [188]:
def condition_extraction(query, conditions):
    prompt = f"""You are an expert in extracting health conditions from user requests. 
    1. Identify diseases or health problems mentioned in the user query.
    2. From the list provided, select the condition that best match the disease or health problem described.
    3. Return the result in this format: 
       @<condition>@
    
    List of conditions: {conditions}
    
    User query: {query} """
    
    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        temperature=0,
    )
    response = llm.invoke(prompt).content
    pattern = r'@([^@]+)@'
    matches = re.findall(pattern, response)
    return matches

In [None]:
query = "Over the past few weeks, I've experienced persistent Anxiety, including restlessness, difficulty concentrating"

In [None]:
conditions_user = [get_close_matches(str(i),[str(i) for i in conditions], n=1, cutoff=0.6)[0] 
                   for i in condition_extraction(query, conditions)]

"""To achieve optimal recommendations, the feature vector
components and the final desired score are assigned a maximum value of
5, reflecting the user’s ideal preferences. This enables the system’s
language model to focus specifically on accurately identifying the
condition mentioned in the user’s query. Recommendations are ranked
based on the cosine similarity between the user-defined vector and the
drug vectors in the final dataset."""

user_features = [5, 5, 5, 5, 5, 5, 5]
recommendations = recommend_drugs(conditions_user, user_features, data)

          drugName condition  similarity  rating
51616   Alprazolam   Anxiety    0.992542       9
34277  Mirtazapine   Anxiety    0.992090       9
37235   Clonazepam   Anxiety    0.992001       8
32437        Xanax   Anxiety    0.991834       8
38790     Klonopin   Anxiety    0.991768      10
35286  Venlafaxine   Anxiety    0.991768      10
58215      Lexapro   Anxiety    0.991768      10
15596    Buspirone   Anxiety    0.991701       9
26933   Paroxetine   Anxiety    0.991670      10
31699     Tramadol   Anxiety    0.991581      10


In [288]:
#recommendations.to_excel('Evaluation_Recsys/acne_methodology.xlsx', index=False)

# EVALUATION

### 1. Mean Rating at k

In [289]:
def mean_rating_at_k(df, k=10):
    top_k = df.nlargest(k, 'similarity')  # ou df.head(k) si c’est déjà trié
    return top_k['rating'].mean()

mean_rating_at_k(recommendations)

9.3

### 2. Hit ratio at k

In [290]:
def hit_ratio_at_k(df, k=10, threshold=5):
    top_k = df.nlargest(k, 'similarity')  # ou df.head(k) si déjà trié
    hits = top_k[top_k['rating'] >= threshold]
    return len(hits) / k
    
hit_ratio_at_k(recommendations)

1.0

### 3. NDCG at k

In [292]:
import numpy as np

def ndcg_at_10_thresholded(df, threshold=5):
    # Top 10 recommendations par similarité
    top_k = df.nlargest(10, 'similarity')
    
    # r_i: les ratings top-10, delta = 1 si r_i >= threshold, 0 sinon
    ratings = top_k['rating'].values
    deltas = (ratings >= threshold).astype(int)
    
    # DCG@10 avec le masque delta
    dcg = np.sum(((2 ** ratings - 1) * deltas) / np.log2(np.arange(2, 12)))  # i = 1 à 10 -> log2(i+1) = log2(2) à log2(11)
    
    # IDCG@10 : ratings triés (>= threshold), les meilleurs en premier
    ideal_ratings = df[df['rating'] >= threshold]['rating'].sort_values(ascending=False).values[:10]
    idcg = np.sum((2 ** ideal_ratings - 1) / np.log2(np.arange(2, len(ideal_ratings) + 2)))
    
    return dcg / idcg if idcg != 0 else 0.0


ndcg_at_10_thresholded(recommendations)

0.7979629579110247