# Scraping data from Trusted Shops

## 1. Packages required

In [19]:
# python version used : python 3.11.5
# !python3 --version

In [1]:
# Import required packages
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json
import ast

## 2. Function to collect reviews raw data for a given supplier

In [2]:
def get_reviews(url, n_pages, output_file):
    """Scrape the reviews listed on the first ``n_pages`` result pages at ``url``.

    Each page is requested as ``url?page=<i>`` for i = 1..n_pages. The reviews
    are read from the JSON held in the first ``<script>`` of the page body,
    under ``props -> pageProps -> reviews -> reviews``.

    Args:
        url: Address of the review listing (without the ``page`` query).
        n_pages: Number of result pages to fetch; must be at least 1.
        output_file: Path of the ';'-separated CSV file to write, indexed
            by the review ``id``.

    Raises:
        ValueError: If ``n_pages`` is smaller than 1.
        requests.HTTPError: If a page answers with an HTTP error status.
    """
    if n_pages < 1:
        raise ValueError("n_pages must be at least 1")

    # Collect one frame per page and concatenate once at the end: calling
    # pd.concat inside the loop re-copies every previous row on each page.
    pages = []
    for page in range(1, n_pages + 1):
        # A timeout keeps a stalled connection from blocking the whole run.
        response = requests.get(f"{url}?page={page}", timeout=30)
        # Fail on the HTTP error itself rather than on a confusing parsing
        # error further down.
        response.raise_for_status()

        # NOTE: "html" names a markup type rather than a specific parser, so
        # bs4 uses whichever HTML parser is installed.
        soup = BeautifulSoup(response.content, "html")

        # The targeted data sits in the first script of the body.
        payload = json.loads(soup.body.script.contents[0])
        reviews = payload['props']['pageProps']['reviews']['reviews']
        pages.append(pd.DataFrame(reviews))

    # Use the review id as index.
    data = pd.concat(pages).set_index("id")

    # Save the collected data to a csv file.
    data.to_csv(output_file, sep=";")


### 2.1. Test get_reviews(...)

In [None]:
# Smoke test: scrape a single results page into test0.csv, then reload it for inspection.
url = "https://www.trustedshops.fr/evaluation/info_XA3F5BBDE34B5AD3FB64015EB823AC7C6.html"
n_pages = 1
file = "test0.csv"
get_reviews(url, n_pages, file)
# The raw file is ';'-separated and its first column is the review id.
df = pd.read_csv(file, sep = ";", index_col = 0)
# DataFrame.info() prints its summary itself and returns None, so print() also emits "None".
print(df.info())
df.head(1)


## 3. Function to clean the reviews data collected

In [3]:
def _parse_dict_cell(cell):
    """Decode a CSV cell holding the ``repr`` of a dict; return {} otherwise."""
    # Empty cells come back from read_csv as float NaN, not as str.
    if not isinstance(cell, str):
        return {}
    value = ast.literal_eval(cell)
    return value if isinstance(value, dict) else {}


def _to_millis(value):
    """Return ``value`` as an int, or None when the timestamp is absent."""
    return None if value is None else int(value)


def _epoch_ms_to_date(values, index):
    """Convert epoch-millisecond values (None = missing) to dates (NaT = missing)."""
    # float64 represents missing entries as NaN, which to_datetime maps to NaT.
    millis = pd.Series(values, index=index, dtype="float64")
    return pd.to_datetime(millis, unit="ms").dt.date


def clean_reviews(input_file, output_file):
    """Clean the raw reviews CSV of one supplier and save the result.

    The nested ``transaction``, ``customer`` and ``reply`` columns (stored in
    the CSV as the text form of Python dicts) are flattened into
    ``transaction`` (date), ``customerCity``, ``SupplierReply`` and
    ``SupplierReplyDate``. Empty or missing cells yield empty values instead
    of raising.

    Args:
        input_file: Path of the ';'-separated raw data CSV ("xx...xx.csv").
        output_file: Path of the ';'-separated cleaned CSV to write
            ("yy..yy.csv").
    """
    # Load the raw data and use the review id as index.
    df = pd.read_csv(input_file, sep=";").set_index("id")

    # Remove irrelevant columns; tolerate a file where one of them is absent.
    df = df.drop(
        columns=["provider", "inModeration", "verificationStatus"],
        errors="ignore",
    )

    # Convert the creation timestamp (epoch milliseconds) to a date.
    df["createdAt"] = pd.to_datetime(df["createdAt"], unit="ms").dt.date

    # Extract the transaction timestamp and convert it to a date.
    # NOTE(review): the first (presumably only) value of the dict is taken as
    # the timestamp; the key name is not visible here, confirm against raw data.
    transactions = [_parse_dict_cell(cell) for cell in df["transaction"]]
    df["transaction"] = _epoch_ms_to_date(
        [_to_millis(next(iter(t.values()), None)) for t in transactions],
        df.index,
    )

    # Extract the customer city when available (looked up by key rather than
    # guessed from the number of fields).
    df["customerCity"] = [
        _parse_dict_cell(cell).get("city", "") for cell in df["customer"]
    ]

    # Extract the supplier's answer and its date.
    replies = [_parse_dict_cell(cell) for cell in df["reply"]]
    df["SupplierReply"] = [reply.get("comment", "") for reply in replies]
    df["SupplierReplyDate"] = _epoch_ms_to_date(
        [_to_millis(reply.get("createdAt")) for reply in replies],
        df.index,
    )

    # Remove the nested columns that have now been flattened.
    df = df.drop(columns=["customer", "reply"])

    # Save cleaned data into a csv file.
    df.to_csv(output_file, sep=";")

## 4. Function that collects reviews data into a raw_data csv file, cleans it and saves the result into a cleaned_data csv file

In [9]:
def get_and_clean_reviews(url, n_pages, raw_data, cleaned_data):
    """Scrape reviews into a raw CSV file, then clean them into a second CSV file.

    Args:
        url: Address of the review listing passed on to ``get_reviews``.
        n_pages: Number of result pages passed on to ``get_reviews``.
        raw_data: Path of the csv file receiving the scraped raw data.
        cleaned_data: Path of the csv file receiving the cleaned data.
    """
    # Step 1: scrape the pages and store the raw reviews.
    get_reviews(url=url, n_pages=n_pages, output_file=raw_data)
    # Step 2: clean the raw file that was just written.
    clean_reviews(input_file=raw_data, output_file=cleaned_data)

### 4.1 test get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)

In [12]:
# Test: scrape one results page, clean it, then reload the cleaned CSV for inspection.
url = "https://www.trustedshops.fr/evaluation/info_XA3F5BBDE34B5AD3FB64015EB823AC7C6.html"
n_pages = 1
raw_data = "test0_raw.csv"
cleaned_data = "test0_cleaned.csv"
get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
# The cleaned file is ';'-separated and its first column is the review id.
df = pd.read_csv(cleaned_data, sep = ";", index_col = 0)
# DataFrame.info() prints its summary itself and returns None, hence the trailing "None" in the output.
print(df.info())
df.head(2)

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, rev-cc907a30-4fb0-4e55-a4a1-307b89eb0d99 to rev-c6f732ca-6c82-4ee0-a508-3af3ccc1db4b
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rating             20 non-null     int64  
 1   title              20 non-null     object 
 2   comment            20 non-null     object 
 3   createdAt          20 non-null     object 
 4   transaction        20 non-null     object 
 5   customerCity       3 non-null      object 
 6   SupplierReply      0 non-null      float64
 7   SupplierReplyDate  0 non-null      float64
dtypes: float64(2), int64(1), object(5)
memory usage: 1.4+ KB
None


Unnamed: 0_level_0,rating,title,comment,createdAt,transaction,customerCity,SupplierReply,SupplierReplyDate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
rev-cc907a30-4fb0-4e55-a4a1-307b89eb0d99,4,Facilte,"Facilte, sollicitations dosées...qualité..on p...",2023-10-05,2023-09-23,,,
rev-b54467c4-4769-42e3-a74a-c72c000059f9,4,Jolie table mais plus pour 6 que pour 8,Livraison impeccable avec possibilité de prend...,2023-10-02,2023-09-11,,,


## 5. Collect and clean reviews data

In [14]:
# collect reviews data of "RAYON D'OR BAGAGES"
# Scrapes 55 result pages, writes the raw and cleaned CSV files, then reloads the cleaned one.
url = "https://www.trustedshops.fr/evaluation/info_X5877FAE851EE366564CE1EC1604DF2B0.html"
n_pages = 55
raw_data = "rayonOrBags_raw.csv"
cleaned_data = "rayonOrBags_clean.csv"
get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
# The cleaned file is ';'-separated and its first column is the review id.
df1 = pd.read_csv(cleaned_data, sep = ";", index_col = 0)
# DataFrame.info() prints its summary itself and returns None, hence the trailing "None" in the output.
print(df1.info())
df1.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1084 entries, rev-a2bf124b-dfd2-4ea9-ad92-75a7ace2ed61 to rev-2efc7fd6-311c-4f90-8b4d-9748aec7fc09
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rating             1084 non-null   int64  
 1   title              734 non-null    object 
 2   comment            727 non-null    object 
 3   createdAt          1084 non-null   object 
 4   transaction        919 non-null    object 
 5   totalLikeCount     15 non-null     float64
 6   updatedAt          2 non-null      float64
 7   customerCity       79 non-null     object 
 8   SupplierReply      91 non-null     object 
 9   SupplierReplyDate  91 non-null     object 
dtypes: float64(2), int64(1), object(7)
memory usage: 93.2+ KB
None


Unnamed: 0_level_0,rating,title,comment,createdAt,transaction,totalLikeCount,updatedAt,customerCity,SupplierReply,SupplierReplyDate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rev-a2bf124b-dfd2-4ea9-ad92-75a7ace2ed61,5,Objet Conforme et prix imbattable,"Franchement, vu le prix, j'ai pensé à une arna...",2023-10-06,,,,,,
rev-3e01aed1-74ca-4d1f-91c6-7b36c904d68a,4,Livraison très rapide de l'article…,Livraison très rapide de l'article commandé !\...,2023-09-28,,,,NUITS SAINT GEORGES,,
rev-ff51282e-0601-47e4-8384-c2d598b2fae6,5,Valise cabine Samsonite silver,Nous avons reçu notre valise dans un délai inf...,2023-09-27,,,,ROYAN,,
rev-98884d0e-8be6-4080-a3d2-2c10b7ea8db1,5,Prise en compte quasi immédiate de la…,Prise en compte quasi immédiate de la non livr...,2023-10-09,,,,,,
rev-096046f4-f70e-4828-9281-6edb59b64ba1,5,Tout est nickel,Tout est nickel \nJ’ai fait mon choix sur le s...,2023-10-10,,,,,,


In [None]:
# collect reviews data of "LA REDOUTE"
url = "https://www.trustedshops.fr/evaluation/info_XA3F5BBDE34B5AD3FB64015EB823AC7C6.html"
n_pages = 1738
raw_data = "redoute_raw.csv"
cleaned_data = "redoute_clean.csv"
# NOTE: the scraping call below is commented out, so this cell only works if
# redoute_clean.csv already exists; uncomment the call to (re)generate it.
#get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
# The cleaned file is ';'-separated and its first column is the review id.
df2 = pd.read_csv(cleaned_data, sep = ";", index_col = 0)
# DataFrame.info() prints its summary itself and returns None, so print() also emits "None".
print(df2.info())
df2.head()

In [None]:
# collect reviews data of MECATECHNIC FR
# Scrapes 430 result pages, writes the raw and cleaned CSV files, then reloads the cleaned one.
url = "https://www.trustedshops.fr/evaluation/info_X7CF89FDC3EBA9A38117A700B8AA37DCB.html"
n_pages = 430
raw_data = "mecatech_raw.csv"
cleaned_data = "mecatech_clean.csv"
get_and_clean_reviews(url, n_pages, raw_data, cleaned_data)
# The cleaned file is ';'-separated and its first column is the review id.
df3 = pd.read_csv(cleaned_data, sep = ";", index_col = 0)
# DataFrame.info() prints its summary itself and returns None, so print() also emits "None".
print(df3.info())
df3.head()