In [None]:
import os
from dotenv import load_dotenv

load_dotenv()

In [None]:
from huggingface_hub import InferenceClient
def review_sentiment(text):
  """Return the sentiment label predicted for ``text``.

  A new ``InferenceClient`` bound to the
  "distilbert/distilbert-base-uncased-finetuned-sst-2-english" model is built
  on every call, authenticated with the ``HUGGINGFACE_TOKEN`` environment
  variable (``None`` is passed when the variable is unset).

  Args:
    text: The text handed to ``text_classification``.

  Returns:
    The ``'label'`` entry of the first element returned by
    ``text_classification``.
  """
  token = os.getenv("HUGGINGFACE_TOKEN")
  hf_client = InferenceClient(
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    api_key=token,
  )
  predictions = hf_client.text_classification(text)
  # Only the first prediction's label is kept; any other entries are ignored.
  return predictions[0]['label']

## Scraping

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [None]:
# Listing URL prefix; a page number can be appended after "page=".
base_url = "https://www.trustpilot.com/review/www.travelinsured.com?page="
max_page = 10
# Fetch the listing once (no page number appended) to explore its HTML.
# The timeout keeps the cell from hanging forever if the server never answers.
r = requests.get(base_url, timeout=30)
r.status_code

200

In [None]:
html_content = r.text

In [None]:
print(html_content[:500])

<!DOCTYPE html><html lang="en-US"><head><meta charSet="UTF-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="shortcut icon" type="image/x-icon" href="https://cdn.trustpilot.net/brand-assets/4.3.0/favicons/favicon.ico"/><link rel="manifest" href="/manifest.json"/><meta name="application-name" content="Trustpilot"/><meta name="theme-color" content="#1c1c1c"/><link rel="apple-touch-icon" sizes="180x180" href="https://cdn.trustpilot.net/brand-assets/4.3.0/favicons/a


In [None]:
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
# Collect the <div> elements that carry the review-card classes.
# NOTE(review): the class names have hash-like suffixes and look build-generated;
# they may change when the site is redeployed — verify the selector before re-running.
reviews_div = soup.find_all('div', {'class': "styles_cardWrapper__w4HBQ styles_show__Zsi_m styles_reviewCard__vOUqi"})

In [None]:
reviews_div[1]

<div class="styles_cardWrapper__w4HBQ styles_show__Zsi_m styles_reviewCard__vOUqi"><article class="paper_paper__EGeEb paper_outline__bqVmn card_card__yyGgu card_noPadding__OOiac styles_reviewCard__6j0RQ" data-service-review-card-paper="true"><div class="styles_reviewCardInner__UZk1x"><aside aria-label="Info for Ellen" class="styles_consumerInfoWrapper__MOCv1"><div class="styles_consumerDetailsWrapper__2XThH"><div class="avatar_avatar__QtS0N avatar_orange__cIcPd" data-consumer-avatar="true" style="width:44px;min-width:44px;height:44px;min-height:44px"><span class="typography_heading-xs__osRhC typography_appearance-default__t8iAq typography_disableResponsiveSizing__z3EGy avatar_avatarName__2WZwR">EL</span></div><a class="link_internal__Eam_b link_wrapper__ahpyq styles_consumerDetails__DW9Hp" data-consumer-profile-link="true" href="/users/67b3b39e633676096ff27769" name="consumer-profile" rel="nofollow" target="_self"><span class="typography_heading-xxs__UmE9o typography_appearance-default

In [None]:
section = reviews_div[1].find('section', {'class': "styles_reviewContentwrapper__W9Vqf"})

In [None]:
section

<section aria-disabled="false" class="styles_reviewContentwrapper__W9Vqf"><div class="styles_reviewHeader__xV2js" data-service-review-rating="5"><div class="star-rating_starRating__sdbkn star-rating_medium__Oj7C9"><img alt="Rated 5 out of 5 stars" src="https://cdn.trustpilot.net/brand-assets/4.1.0/stars/stars-5.svg"/></div><div class="typography_body-m__k2UI7 typography_appearance-subtle__PYOVM styles_datesWrapper__2T9ri"><time class="" data-service-review-date-time-ago="true" datetime="2025-02-18T00:09:36.000Z">2 days ago</time></div><div class="styles_reviewLabels__a1QhX styles_reviewLabels____3__"><div class="styles_reviewLabel__I43un"><span><button class="styles_reviewLabelButton__Drv0Q" data-review-label-tooltip-trigger="true"><div class="typography_body-m__k2UI7 typography_appearance-subtle__PYOVM styles_detailsIcon__xmMRm"><svg class="icon_icon__RdICC" fill="currentColor" height="14px" viewbox="0 0 16 16" width="14px" xmlns="http://www.w3.org/2000/svg"><path clip-rule="evenodd" 

Rate

In [None]:
# Read the rating from the alt text of the <img> inside the star-rating <div>.
# Raises AttributeError if either the <div> or the <img> is missing.
rate = section.find('div', {'class': "star-rating_starRating__sdbkn star-rating_medium__Oj7C9"}).find('img').get('alt')

In [None]:
rate

'Rated 5 out of 5 stars'

In [None]:
type(rate)

str

Review

In [None]:
review_div = section.find('div', {'class': "styles_reviewContent__44s_M"})

In [None]:
review_title_a = review_div.a

In [None]:
review_title = review_title_a.h2.get_text()

In [None]:
review_title

'Fortunately we didn’t have to use our…'

In [None]:
type(review_title)

str

In [None]:
# The review body is taken as the node immediately following the title link,
# so this depends on the order of the elements inside the review <div>.
review_content_p = review_title_a.next_sibling

In [None]:
review_content = review_content_p.get_text()

In [None]:
print(review_content)

Fortunately we didn’t have to use our travel insurance but it gave us peace of mind knowing that we were insured.


In [None]:
review_date = review_content_p.next_sibling.get_text()

In [None]:
print(review_date)

Date of experience: November 25, 2024


### Ingestion

Il faut noter que ce sont les avis les plus concrets fournis par le site que l'on a scrapés.

In [None]:
# Listing URL prefix; the scraping loop appends the page number after "page=".
base_url = "https://www.trustpilot.com/review/www.travelinsured.com?page="
# Number of listing pages requested by the scraping loop.
max_page = 46

In [None]:
def get_rate(rate_text: str) -> int:
  match rate_text:
    case "Rated 0 out of 5 stars":
      return 0
    case "Rated 1 out of 5 stars":
      return 1
    case "Rated 2 out of 5 stars":
      return 2
    case "Rated 3 out of 5 stars":
      return 3
    case "Rated 4 out of 5 stars":
      return 4
    case "Rated 5 out of 5 stars":
      return 5
    case _:
      return None

In [None]:
# Empty frame declaring the review columns: rate, title, content, review_date.
TII_reviews_df = pd.DataFrame(columns=["rate", "title", "content", "review_date"])

In [None]:
# Pagination handling: scrape every listing page and build TII_reviews_df.
#
# Pages are requested as 1..max_page inclusive; `range(max_page)` asked for
# page 0 and never requested page `max_page`.
# NOTE(review): assumes the site numbers its pages starting at 1 — confirm.
#
# Rows are accumulated in a plain list and the DataFrame is built once at the
# end: concatenating one row at a time copies the whole frame on every review,
# and appending to the existing frame duplicated rows when the cell was re-run.
review_records = []
for page in range(1, max_page + 1):
    url = f"{base_url}{page}"
    # The timeout prevents one stalled request from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Report the failure instead of silently parsing an error page.
        print(f"Skipping page {page}: HTTP {response.status_code}")
        continue
    soup = BeautifulSoup(response.text, "html.parser")
    reviews_div = soup.find_all('div', {'class': "styles_cardWrapper__w4HBQ styles_show__Zsi_m styles_reviewCard__vOUqi"})

    for r_div in reviews_div:
        section = r_div.find('section', {'class': "styles_reviewContentwrapper__W9Vqf"})
        if section is None:
            # Card without the expected content section: nothing to extract.
            continue

        # Rating, read from the alt text of the star image (None if absent).
        star_div = section.find('div', {'class': "star-rating_starRating__sdbkn star-rating_medium__Oj7C9"})
        star_img = star_div.find('img') if star_div else None
        rate = get_rate(star_img.get('alt')) if star_img else None

        review_title = review_content = review_date = None
        # Review div
        review_div = section.find('div', {'class': "styles_reviewContent__44s_M"})
        if review_div is not None:
            # Title
            review_title_a = review_div.a
            if review_title_a and review_title_a.h2:
                review_title = review_title_a.h2.get_text()
            # Content
            review_content_p = review_div.find('p', {'class': "typography_body-l__v5JLj typography_appearance-default__t8iAq typography_color-black__wpn7m"})
            if review_content_p:
                review_content = review_content_p.get_text()
            # Date: keep only the text after the last ":".
            review_date_p = review_div.find('p', {'class': "typography_body-m__k2UI7 typography_appearance-default__t8iAq"})
            if review_date_p:
                review_date = review_date_p.get_text().split(":")[-1].strip()

        review_records.append({"rate": rate, "title": review_title, "content": review_content, "review_date": review_date})

TII_reviews_df = pd.DataFrame(review_records, columns=["rate", "title", "content", "review_date"])
# Nullable integer dtype keeps ratings as whole numbers even when some are missing.
TII_reviews_df["rate"] = TII_reviews_df["rate"].astype("Int64")


In [None]:
TII_reviews_df.head()

Unnamed: 0,rate,title,content,review_date
0,5,Better Than Thought,We purchased a travel plan from Travel Insured...,"February 14, 2025"
1,5,Fortunately we didn’t have to use our…,Fortunately we didn’t have to use our travel i...,"November 25, 2024"
2,5,Insurance was easy to purchase,Insurance was easy to purchase online & gave m...,"November 01, 2024"
3,5,Peace of mind,Isabel took care of our travel insurance needs...,"January 18, 2025"
4,5,ABC ruise coverage,Policy issued same day with good coverage expl...,"January 28, 2025"


In [None]:
TII_reviews_df.shape

(900, 4)

In [None]:
# Convert the 'review_date' column from text to datetime values.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
TII_reviews_df['review_date'] = pd.to_datetime(TII_reviews_df['review_date'], errors='coerce')

# Show the first rows to check the converted column.
print(TII_reviews_df.head())
#TII_reviews_df.dtypes

  rate                                   title  \
0    5                     Better Than Thought   
1    5  Fortunately we didn’t have to use our…   
2    5          Insurance was easy to purchase   
3    5                           Peace of mind   
4    5                      ABC ruise coverage   

                                             content review_date  
0  We purchased a travel plan from Travel Insured...  2025-02-14  
1  Fortunately we didn’t have to use our travel i...  2024-11-25  
2  Insurance was easy to purchase online & gave m...  2024-11-01  
3  Isabel took care of our travel insurance needs...  2025-01-18  
4  Policy issued same day with good coverage expl...  2025-01-28  


In [None]:
# Rows whose review_date falls in December 2024.
# Only the date is filtered on: despite the name, reviews of every rating are kept.
december_2024_complaints = TII_reviews_df.loc[
    lambda df: (df['review_date'].dt.month == 12) & (df['review_date'].dt.year == 2024)
]

december_2024_complaints

Unnamed: 0,rate,title,content,review_date
15,5,I have used this company before on…,I have used this company before on previous tr...,2024-12-21
18,4,Travel insurance for Vietnam tour,I had to buy travel insurance to go on a NatGe...,2024-12-19
24,4,I’ve used this insurance for quite a…,I’ve used this insurance for quite a few trips...,2024-12-25
25,5,It was an insurance product and the 2nd…,It was an insurance product and the 2nd time I...,2024-12-05
35,5,The process was smooth,The process was smooth. I called answered a fe...,2024-12-13
45,5,Simple and Easy,Simple and easy to apply for and purchase.Just...,2024-12-04
48,5,My flight was delayed for a day due to…,My flight was delayed for a day due to fog at ...,2024-12-16
49,5,Wonderful options and great customer service!,As a travel advisor I really appreciate that m...,2024-12-15
56,5,To the rescue once again.,I had an injury on our cruise that involved a ...,2024-12-26
69,1,Virtually impossible to file a claim,"Virtually impossible to file a claim, everythi...",2024-12-01


In [None]:
# Subset of the reviews whose review_date is in 2022.
TII_reviews_df_2022 = TII_reviews_df.loc[lambda df: df['review_date'].dt.year == 2022]

TII_reviews_df_2022

Unnamed: 0,rate,title,content,review_date
855,2,Made one medical claim that was never…,Made one medical claim that was never paid. N...,2022-09-01
856,5,Our flight was delayed for four days…,Our flight was delayed for four days due to co...,2022-12-28
862,5,Highly recommend,Travel Insured International has wonderful cus...,2022-10-10
864,5,We are lucky to say we have not had to…,We are lucky to say we have not had to make an...,2022-11-25
869,1,I filed a claim in October of 2022 and…,I filed a claim in October of 2022 and still h...,2022-10-02
876,3,Cannot rate without a claim There is no way t...,Thre is no way to rate my experience until I h...,2022-11-20
884,5,The thing that made my experience good…A Piece...,The thing that made my experience good is that...,2022-11-03
895,1,TravelInsured.com - Avoid like the plague,I have a hard time believing that Travel insur...,2022-10-07
896,1,Still waiting almost seven months after…,Still waiting almost seven months after my VAL...,2022-06-04
897,1,I sent in a claim on September 7th and…,I sent in a claim on September 7th and was fin...,2022-10-27


In [None]:
TII_reviews_df_2022.shape

(12, 4)

In [None]:
type(TII_reviews_df['title'][0])

str

Fusion Title-Content

In [None]:
TII_reviews_df['title'][899].endswith("…")

True

In [None]:
TII_reviews_df['title'][899][:-1]

'I was unable to travel to an'

In [None]:
# Merge a review's title and content into one text without repeating the title
def merge_title_content(row):
    """Combine a review's title and body into a single string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with "title" and
            "content" entries.

    Returns:
        * When one field is missing (None or NaN): the other field, unchanged.
          None when both are missing.
        * When the content already starts with the title (case-insensitive,
          ignoring a trailing "…" on the title): the content alone.
        * Otherwise the cleaned title followed by the content, separated by
          ". " — or by a single space when the title already ends with ".",
          "!" or "?", so the punctuation is not doubled.
    """
    title, content = row["title"], row["content"]

    # pd.isna covers both None and NaN; NaN would otherwise crash on the
    # string methods used below.
    title_missing, content_missing = pd.isna(title), pd.isna(content)
    if title_missing or content_missing:
        if not content_missing:
            return content
        return None if title_missing else title

    # Drop the trailing ellipsis marking a truncated title, plus any
    # whitespace around it.
    title_cleaned = title.strip().rstrip("…").rstrip()

    # A truncated title is usually just the beginning of the content: keep the
    # content only. An empty cleaned title also lands here.
    if content.lstrip().lower().startswith(title_cleaned.lower()):
        return content

    # Otherwise concatenate, without adding a period after existing end punctuation.
    separator = " " if title_cleaned.endswith((".", "!", "?")) else ". "
    return f"{title_cleaned}{separator}{content}"

# Apply the merge row by row and store the result in a new "Reviews" column
TII_reviews_df["Reviews"] = TII_reviews_df.apply(merge_title_content, axis=1)

# Check the result
print(TII_reviews_df[["title", "content", "Reviews"]])

                                         title  \
0                          Better Than Thought   
1       Fortunately we didn’t have to use our…   
2               Insurance was easy to purchase   
3                                Peace of mind   
4                           ABC ruise coverage   
..                                         ...   
895  TravelInsured.com - Avoid like the plague   
896   Still waiting almost seven months after…   
897    I sent in a claim on September 7th and…   
898           Don't believe all their promises   
899              I was unable to travel to an…   

                                               content  \
0    We purchased a travel plan from Travel Insured...   
1    Fortunately we didn’t have to use our travel i...   
2    Insurance was easy to purchase online & gave m...   
3    Isabel took care of our travel insurance needs...   
4    Policy issued same day with good coverage expl...   
..                                                 

Les sentiments

In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [None]:
import emoji
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# English stop words and the lemmatizer are built once and reused by clean_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a review into lowercase, lemmatized words without stop words.

    Steps, in order:
      1. emojis are replaced by their textual names, as separate words;
      2. the text is lowercased;
      3. runs of the same word repeated consecutively are collapsed to one;
      4. every character that is not an ASCII letter or whitespace is replaced
         by a space;
      5. stop words are dropped and the remaining words are lemmatized.

    Args:
        text: The raw review text (a string).

    Returns:
        The cleaned words joined by single spaces.
    """
    # Decode emojis. Space delimiters keep the name from being glued to the
    # neighbouring word, and "_" becomes a space so the words of the name
    # survive the punctuation step separately (not as "thumbsup").
    text = emoji.demojize(text, delimiters=(" ", " ")).replace("_", " ")
    # Lowercase first so that repetitions such as "The the" are detected too.
    text = text.lower()
    # Collapse consecutive repetitions of a word, however many times it repeats.
    text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text)
    # Replace punctuation and digits with a space (not an empty string) so
    # "purchase.Just" gives two words instead of "purchasejust".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Remove stop words and lemmatize what is left.
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

In [None]:
# Clean each merged review with clean_text, then classify it with
# review_sentiment (called once per row) and store the label in "Sentiment".
# NOTE(review): clean_text removes stop words before classification; if the
# stop-word list contains negations ("not", "no"), the classifier never sees
# them — confirm this is intended.
TII_reviews_df["Sentiment"] = TII_reviews_df["Reviews"].apply(clean_text).apply(review_sentiment)

In [None]:
# Write the frame to a CSV file in the working directory, without the index column.
# NOTE(review): "Custumers" looks like a typo for "Customers"; left unchanged
# because other code may read the file under this name.
TII_reviews_df.to_csv('Travel-Insured-International_Custumers_Reviews.csv', index=False)