# Scraping data from Trusted Shops

## 1. Packages required

In [None]:
# python version used : python 3.11.5
# !python3 --version

In [1]:
# Import required packages
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json
import ast

## 2. Function to collect reviews raw data for a given supplier

In [1]:
def get_reviews(url, n_pages, output_file, timeout=30):
    """Scrape review pages from a Trusted Shops profile and save them as CSV.

    Pages ``1`` to ``n_pages`` of ``url`` are requested one after another
    (``?page=<i>`` is appended as a query parameter). The reviews found on
    every page are gathered into a single table, indexed by their ``id``
    field and written to ``output_file`` using ``;`` as separator.

    Args:
        url: Address of the review listing to scrape.
        n_pages: Number of result pages to fetch (must be >= 1).
        output_file: Path of the CSV file to create, e.g. ``"file_name.csv"``.
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        ValueError: If ``n_pages`` is lower than 1.
        requests.HTTPError: If a page answers with an HTTP error status.
    """
    if n_pages < 1:
        raise ValueError("n_pages must be at least 1")

    page_frames = []
    for page in range(1, n_pages + 1):
        response = requests.get(url, params={"page": page}, timeout=timeout)
        # Fail early with the HTTP status rather than with an obscure
        # parsing error further down.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html")

        # The targeted data is the JSON held by the first <script> of <body>.
        payload = json.loads(soup.body.script.contents[0])
        page_reviews = payload['props']['pageProps']['reviews']['reviews']
        page_frames.append(pd.DataFrame(page_reviews))

    # Concatenate once at the end instead of re-copying the table per page.
    data = pd.concat(page_frames).set_index("id")

    # Save the collected data into a CSV file.
    data.to_csv(output_file, sep=";")


In [None]:
# Exploratory cell: fetch the first result page and look at which keys the
# "customer" entry of each review carries.
url = "https://www.trustedshops.fr/evaluation/info_XA3F5BBDE34B5AD3FB64015EB823AC7C6.html"
req = requests.get(url)
soup = BeautifulSoup(req.content,"html")

# The targeted data is in the first script element of body.
raw0 = json.loads(soup.body.script.contents[0])
reviews = raw0['props']['pageProps']['reviews']['reviews']
for i, review in enumerate(reviews):
    print(i,":")
    #print(review["reply"]["comment"])
    print(review["customer"].keys())

# Peek at a previously saved raw file.
df = pd.read_csv("momox_raw.csv", sep = ";")
df.head()
# The data would be put into a pandas DataFrame like this:
#data = pd.concat([data, pd.DataFrame(reviews0)])

### 2.1. Test get_reviews(...)

In [None]:
# Smoke test: scrape a single page, then inspect the raw CSV it produced.
url = "https://www.trustedshops.fr/evaluation/info_XA3F5BBDE34B5AD3FB64015EB823AC7C6.html"
n_pages = 1
file = "test0.csv"
get_reviews(url, n_pages, file)
df = pd.read_csv(file, sep=";", index_col=0)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()
df.head(1)


## 3. Function to clean the reviews data collected

In [6]:
def _parse_dict_cell(cell):
    """Return the dict encoded in a raw CSV cell, or ``{}`` when there is none."""
    # Missing cells are read back by pandas as NaN (a float), not as text.
    if not isinstance(cell, str):
        return {}
    parsed = ast.literal_eval(cell)
    return parsed if isinstance(parsed, dict) else {}


def clean_reviews(input_file, output_file):
    """Clean raw review data scraped from Trusted Shops for one supplier.

    The raw CSV (``;`` separated) is loaded and indexed by ``id``. The columns
    ``provider``, ``inModeration`` and ``verificationStatus`` are dropped,
    millisecond timestamps are converted to datetimes, and the dict-like
    ``transaction``, ``customer`` and ``reply`` cells are flattened into
    ``transaction``, ``customerCity``, ``SupplierReply`` and
    ``SupplierReplyDate``. The result is written to ``output_file``.

    Args:
        input_file: Path of the raw data CSV file.
        output_file: Path of the cleaned CSV file to create.
    """
    # Load the raw data to clean and index it by review id.
    df = pd.read_csv(input_file, sep=";")
    df = df.set_index("id")

    # Remove irrelevant columns.
    df = df.drop(["provider", "inModeration", "verificationStatus"], axis=1)

    # Convert the creation timestamp (milliseconds) to datetime.
    df["createdAt"] = pd.to_datetime(df["createdAt"], unit="ms")

    # The transaction cell is a dict holding one millisecond timestamp. An
    # empty string (rather than None/NaN) marks "no value" so the column keeps
    # exact Python ints instead of being upcast to float before conversion.
    transactions = df["transaction"].apply(_parse_dict_cell)
    transaction_ms = transactions.apply(
        lambda t: int(next(iter(t.values()))) if t else ""
    )
    df["transaction"] = pd.to_datetime(transaction_ms, unit="ms")

    # Extract the customer city when available.
    customers = df["customer"].apply(_parse_dict_cell)
    df["customerCity"] = customers.apply(lambda c: c.get("city", ""))

    # Extract the supplier's answer and the date it was written.
    replies = df["reply"].apply(_parse_dict_cell)
    df["SupplierReply"] = replies.apply(lambda r: r.get("comment", ""))
    reply_ms = replies.apply(
        lambda r: int(r["createdAt"]) if "createdAt" in r else ""
    )
    df["SupplierReplyDate"] = pd.to_datetime(reply_ms, unit="ms")

    # Remove the now-flattened source columns.
    df = df.drop(["customer", "reply"], axis=1)

    # Save the cleaned data into a CSV file.
    df.to_csv(output_file, sep=";")

## 4. Function that collects reviews into a raw-data csv file, cleans them and saves the result into a clean-data csv file

In [None]:
def get_and_clean_reviews(url, n_pages, raw_data, cleaned_data):
    """Scrape reviews into a raw CSV file, then write a cleaned copy of it.

    Step 1 fetches ``n_pages`` result pages from ``url`` and stores them in
    the CSV file ``raw_data``. Step 2 reads ``raw_data`` back, cleans it and
    stores the result in the CSV file ``cleaned_data``.
    """
    # Step 1: collect the raw data.
    get_reviews(url=url, n_pages=n_pages, output_file=raw_data)
    # Step 2: clean what has just been collected.
    clean_reviews(input_file=raw_data, output_file=cleaned_data)

### 4.1 test get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)

In [None]:
# Smoke test: scrape and clean a single page, then inspect the cleaned CSV.
url = "https://www.trustedshops.fr/evaluation/info_XA3F5BBDE34B5AD3FB64015EB823AC7C6.html"
n_pages = 1
raw_data = "test0_raw.csv"
cleaned_data = "test0_cleaned.csv"
get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
df = pd.read_csv(cleaned_data, sep=";", index_col=0)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()
df.head(2)

## 5. Collect and clean reviews data

In [None]:
# Collect and clean the reviews of "RAYON D'OR BAGAGES".
url = "https://www.trustedshops.fr/evaluation/info_X5877FAE851EE366564CE1EC1604DF2B0.html"
n_pages = 55
raw_data = "rayonOrBags_raw.csv"
cleaned_data = "rayonOrBags.csv"
get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
df1 = pd.read_csv(cleaned_data, sep=";", index_col=0)
# info() already prints its summary and returns None: no print() around it.
df1.info()
df1.head()

In [None]:
# Collect and clean the reviews of "LA REDOUTE".
url = "https://www.trustedshops.fr/evaluation/info_XA3F5BBDE34B5AD3FB64015EB823AC7C6.html"
n_pages = 1738
raw_data = "redoute_raw.csv"
cleaned_data = "redoute.csv"
get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
df2 = pd.read_csv(cleaned_data, sep=";", index_col=0)
# info() already prints its summary and returns None: no print() around it.
df2.info()
df2.head()

In [None]:
# Collect and clean the reviews of MECATECHNIC FR.
url = "https://www.trustedshops.fr/evaluation/info_X7CF89FDC3EBA9A38117A700B8AA37DCB.html"
n_pages = 430
raw_data = "mecatech_raw.csv"
cleaned_data = "mecatech.csv"
get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
df3 = pd.read_csv(cleaned_data, sep=";", index_col=0)
# info() already prints its summary and returns None: no print() around it.
df3.info()
df3.head()

In [None]:
# Collect and clean the reviews of momox-shop.
url = "https://www.trustedshops.fr/evaluation/info_X5D6CA473CA05A0D41F1334B0783BEA80.html"
n_pages = 890
raw_data = "momox_raw.csv"
cleaned_data = "momox.csv"
get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
# NOTE(review): df3 is also the name used for another shop's data elsewhere
# in this notebook, so running this cell overwrites it.
df3 = pd.read_csv(cleaned_data, sep=";", index_col=0)
# info() already prints its summary and returns None: no print() around it.
df3.info()
df3.head()

In [None]:
# Collect and clean the reviews of Back Market France.
url = "https://www.trustedshops.fr/evaluation/info_X194F6470570B4368A91A3C7230E9014C.html"
n_pages = 769
raw_data = "backMarket_raw.csv"
cleaned_data = "backMarket.csv"
get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
# NOTE(review): df3 is also the name used for another shop's data elsewhere
# in this notebook, so running this cell overwrites it.
df3 = pd.read_csv(cleaned_data, sep=";", index_col=0)
# info() already prints its summary and returns None: no print() around it.
df3.info()
df3.head()

In [2]:
# Reviews stored under the "showRoomPrive" file names. The scraping call is
# commented out, so this cell only reloads an existing cleaned CSV file.
url = "https://www.trustedshops.fr/evaluation/info_X9E5F2816C0E395DA80950236909BEBC1.html"
n_pages = 575
raw_data = "showRoomPrive_raw.csv"
cleaned_data = "showRoomPrive.csv"
#get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
# NOTE(review): df3 is also the name used for another shop's data elsewhere
# in this notebook, so running this cell overwrites it.
df3 = pd.read_csv(cleaned_data, sep=";", index_col=0)
# info() already prints its summary and returns None: no print() around it.
df3.info()
df3.head()

<class 'pandas.core.frame.DataFrame'>
Index: 11491 entries, rev-ae4c6190-a467-490e-8e60-cbade430aaa2 to rev-353ef5d0-9a89-4871-a10d-6db56dbad020
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rating             11491 non-null  int64  
 1   title              7356 non-null   object 
 2   comment            7275 non-null   object 
 3   createdAt          11491 non-null  object 
 4   transaction        11491 non-null  object 
 5   totalLikeCount     82 non-null     float64
 6   updatedAt          2 non-null      float64
 7   customerCity       1019 non-null   object 
 8   SupplierReply      11490 non-null  object 
 9   SupplierReplyDate  11490 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 987.5+ KB
None


Unnamed: 0_level_0,rating,title,comment,createdAt,transaction,totalLikeCount,updatedAt,customerCity,SupplierReply,SupplierReplyDate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rev-ae4c6190-a467-490e-8e60-cbade430aaa2,5,J'aime les choix des marchands et les…,J'aime les choix des marchands et les petits p...,2023-09-17,2023-09-11,,,,"Bonjour Liliane,\n\nJe vous remercie d'avoir p...",2023-09-18
rev-fd950caf-cf2c-4c93-8106-4fbc43ca6ae3,4,J'adore la marque Rowenta,"J'adore la marque Rowenta, tous mes appareils ...",2023-09-26,2023-09-09,,,Paris,"Bonjour Florence, \n\nVous nous faites part de...",2023-09-26
rev-d2452f37-1518-49c8-b21a-6ee6723940b3,3,Bonjour,"Bonjour, vous pouvez vous améliorer sur le cho...",2023-10-10,2023-09-26,,,,"Bonjour Florence,\n\nJe vous remercie pour avo...",2023-10-10
rev-bb15aed9-b61e-4549-aa8a-1c1783e2ce86,3,Achat remboursement différent du montant de l’...,"Bonjour, j’ai fait deux commandes sur le site....",2023-10-11,2023-09-28,,,Paris,"Bonjour Thanh,\n\nVous nous faites part d'un r...",2023-10-11
rev-fd7a8359-2d2d-4227-bd2e-a9b21cfaefe3,4,Jolie et livré rapidement.,Jolie et livré rapidement.\nJ'ai déjà eu un qu...,2023-09-20,2023-09-10,,,,"Bonjour Marine, \n\nMerci infiniment pour votr...",2023-09-21


In [7]:
# Run only the cleaning step: read "redoute_raw.csv" and (re)write "redoute.csv".
clean_reviews("redoute_raw.csv", "redoute.csv")

In [8]:
# Load the cleaned La Redoute file. Unlike the other loading cells, no
# index_col is passed here, so "id" comes back as an ordinary column and the
# frame gets a default integer index.
df = pd.read_csv("redoute.csv", sep = ";")
df.head()

Unnamed: 0,id,rating,title,comment,createdAt,transaction,updatedAt,totalLikeCount,customerCity,SupplierReply,SupplierReplyDate
0,rev-cc907a30-4fb0-4e55-a4a1-307b89eb0d99,4,Facilte,"Facilte, sollicitations dosées...qualité..on p...",2023-10-05 14:18:42,2023-09-23,,,,,
1,rev-b54467c4-4769-42e3-a74a-c72c000059f9,4,Jolie table mais plus pour 6 que pour 8,Livraison impeccable avec possibilité de prend...,2023-10-02 13:08:27,2023-09-11,,,,,
2,rev-b480f537-524e-491e-9f85-74405332c047,4,Achat d'un canapé La Redoute intérieurs…,Achat d'un canapé La Redoute intérieurs le 6/9...,2023-09-18 16:52:21,2023-09-06,,,,,
3,rev-f04626f5-846d-41ca-9866-46573746f75b,4,excellent site mais des changements qui compl...,je commande depuis longtemps chez la redoute; ...,2023-10-13 08:57:16,2023-10-02,,,,,
4,rev-d3bb4eb5-ea0f-45db-afb5-313eaeaed6cb,5,"Comme d'habitude, aucun souci et rapidité d'ex...","Comme d'habitude, aucun souci et rapidité d'ex...",2023-09-21 11:09:15,2023-09-15,,,,,
