In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [2]:
df =pd.read_csv('train_set.csv',index_col=0)
df.head(5)

Unnamed: 0,note,avis,assureur,produit,date_publication,date_exp,avis_en,tokens_en,bigrams_en,tokens_fr,bigrams_fr
0,4.0,la personne au téléphone était clair et sympat...,L'olivier Assurance,auto,06/10/2021,01/10/2021,the person on the phone was clear and friendly...,"['person', 'phone', 'clear', 'friendly', 'expl...","['person_phone', 'phone_clear', 'clear_friendl...","['personne', 'téléphone', 'clair', 'sympathiqu...","['personne_téléphone', 'téléphone_clair', 'cla..."
1,4.0,satisfaitréactivité simplicité prix attractif ...,APRIL Moto,moto,09/07/2021,01/07/2021,satisfiedreactivity simplicity attractive pric...,"['satisfiedreactivity', 'simplicity', 'attract...","['satisfiedreactivity_simplicity', 'simplicity...","['satisfaitréactivité', 'simplicité', 'prix', ...","['satisfaitréactivité_simplicité', 'simplicité..."
2,1.0,assureur à fuir n assure pas ses responsabilit...,SwissLife,vie,15/10/2020,01/10/2020,insurer to flee does not ensure its responsibi...,"['insurer', 'flee', 'ensure', 'responsibility'...","['insurer_flee', 'flee_ensure', 'ensure_respon...","['assureur', 'fuir', 'assure', 'responsabilité...","['assureur_fuir', 'fuir_assure', 'assure_respo..."
3,1.0,voilà 3 mois que la gmf me fait attendre pour ...,GMF,habitation,03/03/2020,01/03/2020,the gmf has been waiting for a water damage fo...,"['gmf', 'waiting', 'water', 'damage', 'month',...","['gmf_waiting', 'waiting_water', 'water_damage...","['voilà', 'mois', 'gmf', 'fait', 'attendre', '...","['voilà_mois', 'mois_gmf', 'gmf_fait', 'fait_a..."
4,3.0,je suis bien avec cet assuranceelle est pratiq...,L'olivier Assurance,auto,28/08/2021,01/08/2021,i am good with this insurance she is practical...,"['good', 'insurance', 'practical', 'least', 'f...","['good_insurance', 'insurance_practical', 'pra...","['bien', 'cet', 'assuranceelle', 'pratique', '...","['bien_cet', 'cet_assuranceelle', 'assuranceel..."


### Information Retrieval

In [3]:
def preprocess(text):
    # Remove special characters and put in lower case
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Tokenization, remove stop words, and stemming
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

In [4]:
def retrieve_information(query, df):
    """IRS system using TF-IDF and cosine similarity to give the best results associated with the query"""
    # Combine relevant informations in one column and preprocess it
    df['information'] =  df['avis_en'] + df['produit'] + df["assureur"].str.lower()
    df['information'] = df['information'].apply(preprocess)
    # Use Tf_idf vectorizer for the combined_text
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['information'])
    # Vectorize the query
    query_vectorized = vectorizer.transform([preprocess(query)])
    # Compute cosine similarity between the query and the tfidf_matrix
    similarity_scores = cosine_similarity(query_vectorized, tfidf_matrix).flatten()
    # Add similarity scores to df and sort by similarity 
    df['similarity'] = similarity_scores
    df = df.sort_values(by='similarity', ascending=False)
    return df[['assureur', 'produit','avis_en','note','similarity']]

# Query
query = "best auto insurance"
result_df = retrieve_information(query, df)
print(" 10 Results:")
display(result_df.head(10))

 10 Results:


Unnamed: 0,assureur,produit,avis_en,note,similarity
21354,Direct Assurance,auto,easy to use and quick you need to have a phone...,5.0,0.495108
1326,Direct Assurance,auto,i am really satisfied with your service top cu...,5.0,0.389005
5688,L'olivier Assurance,auto,i am satisfied with your service prices suit m...,3.0,0.371196
20156,L'olivier Assurance,auto,the best insurance i have ever had100 satisfied,5.0,0.355177
19154,AMV,moto,it is the insurance that offered me the best g...,5.0,0.353148
5293,Direct Assurance,auto,simple and quick best price on car insurance ...,5.0,0.350366
1112,Direct Assurance,auto,i am cool and kind i like insurance companies ...,5.0,0.33203
21050,Direct Assurance,auto,speed professionalism cheaper than an agency ...,4.0,0.327845
19692,L'olivier Assurance,auto,i am satisfied with the servicei was very well...,4.0,0.324359
13766,L'olivier Assurance,auto,very satisfactory service on the online phone ...,4.0,0.317091


In [5]:
display(result_df.head(10).iloc[4]["avis_en"])

'it is the insurance that offered me the best guarantees at the best price i hope i never have a claim to judge their efficiency in any case the staff are available listening and responsive'

In [6]:
query = "best moto insurance"
result_df = retrieve_information(query, df)
print(" 10 Results:")
display(result_df.head(10))

 10 Results:


Unnamed: 0,assureur,produit,avis_en,note,similarity
21442,APRIL Moto,moto,best insurance with good quality of services a...,4.0,0.472762
13736,APRIL Moto,moto,i am satisfied with the price and the services...,5.0,0.439569
1326,Direct Assurance,auto,i am really satisfied with your service top cu...,5.0,0.424429
5688,L'olivier Assurance,auto,i am satisfied with your service prices suit m...,3.0,0.404999
10595,APRIL Moto,moto,no complaints on the prices and services offer...,4.0,0.402496
4138,APRIL Moto,moto,this is the best price i have found for a youn...,5.0,0.395192
20156,L'olivier Assurance,auto,the best insurance i have ever had100 satisfied,5.0,0.387521
20674,APRIL Moto,moto,i am satisfied insurance is not very expensive...,4.0,0.387088
19154,AMV,moto,it is the insurance that offered me the best g...,5.0,0.385307
5293,Direct Assurance,auto,simple and quick best price on car insurance ...,5.0,0.382271


In [7]:
list(result_df[result_df['produit']=='auto'].head(4)['avis_en'])

['i am really satisfied with your service top customer service advantageous prices i recommend 10000 times best insurance best customer relationship',
 'i am satisfied with your service prices suit me i will try to do my best to be able to fulfill my contracts of contracts in the best conditions',
 'the best insurance i have ever had100 satisfied',
 'simple and quick  best price on car insurance walking sobt customer service of good advice and does not hesitate to help choose the insurance that will suit you best i recommend']