# TrustPilot Scraper | Results Analysis

This notebook analyses the review data collected by the TrustPilot scraper developed in the previous notebook.

## 0.0 Import Libraries

In [1]:
# Data manipulation & stats
import pandas as pd
import numpy as np
import re
import csv

# Data visualisation
import matplotlib.pyplot
%matplotlib inline
import seaborn as sns

# Standard libraries
import os
import datetime
import time
from tqdm import tqdm
import random

## 1.0 Setup Config

### 1.1 Local paths

In [2]:
# Resolve the repository root from the directory the notebook is run from.
notebooks_dir_path = os.getcwd()

# Only strip the final path component, and only when it really is the
# "notebooks" folder. A plain str.replace("/notebooks", "") would also remove
# that text from any parent directory name and assumes "/" separators.
if os.path.basename(notebooks_dir_path) == "notebooks":
    repo_dir_path = os.path.dirname(notebooks_dir_path)
else:
    repo_dir_path = notebooks_dir_path

# All scraped data lives in <repo>/data
data_dir_path = os.path.join(repo_dir_path, "data")

In [3]:
# Local data paths: every non-hidden entry of the data directory is treated as
# one scraped category.
category_dirs = os.listdir(data_dir_path)
category_dirs = [directory for directory in category_dirs if not directory.startswith(".")]
print(f"Categories scraped: \n{category_dirs}")

# Category paths
category_dir_paths = [os.path.join(data_dir_path, directory) for directory in category_dirs]
# Keep only the categories whose full companies CSV actually exists on disk.
category_df_paths = [
    path
    for path in (os.path.join(directory, "companies_df_full.csv") for directory in category_dir_paths)
    if os.path.exists(path)
]

# Read in data: one DataFrame per category, keyed by category name
df_dict = {}
for category_df_path in category_df_paths:
    # The category name is the folder holding the CSV. Deriving it from the path
    # itself (instead of a fixed index into a "/"-split) keeps this working
    # regardless of where the repository is checked out.
    category_name = os.path.basename(os.path.dirname(category_df_path))
    print(category_name)

    # Read the file row by row with csv.reader so that a malformed record stops
    # the read with a message instead of aborting the whole notebook.
    # newline="" lets the csv module handle line endings inside quoted fields.
    data = []
    with open(category_df_path, "r", encoding="utf-8", newline="") as file:
        reader = csv.reader(file)
        try:
            for row in reader:
                data.append(row)
        except csv.Error as e:
            # Rows read before the error are kept; the rest of the file is lost.
            print(f"Error reading CSV file: {e}")

    # Nothing usable was read (empty file or error on the first record):
    # there is no header row to promote, so skip this category.
    if not data:
        print(f"No rows read from {category_df_path}; skipping")
        continue

    # Convert the list of lists (data) to a pandas DataFrame
    category_df = pd.DataFrame(data)

    # The first row holds the column names
    category_df.columns = category_df.iloc[0]
    category_df = category_df[1:]  # Skip the first row since it's now the header

    df_dict[category_name] = category_df
    df_clean_path = os.path.join(data_dir_path, category_name, "clean_categories_data.csv")
    category_df.to_csv(df_clean_path)


Categories scraped: 
['shipping_logistics', 'home_garden', 'travel_vacation', 'animals_pets', 'beauty_wellbeing', 'events_entertainment', 'restaurants_bars', 'money_insurance', 'electronics_technology', 'food_beverages_tobacco', 'vehicles_transportation', 'shopping_fashion']
home_garden
travel_vacation
animals_pets
beauty_wellbeing
events_entertainment
money_insurance
electronics_technology
food_beverages_tobacco
vehicles_transportation
shopping_fashion


## 2.0 Category data analysis

### 2.1 Data cleaning

In [47]:
# Report the (rows, columns) shape of every raw category DataFrame
for name, frame in df_dict.items():
    print(f"{name}: {frame.shape}")

home_garden: (1280, 11)
travel_vacation: (20830, 14)
animals_pets: (6561, 11)
beauty_wellbeing: (9095, 11)
events_entertainment: (2971, 11)
money_insurance: (21048, 11)
electronics_technology: (32094, 11)
food_beverages_tobacco: (11041, 11)
vehicles_transportation: (23312, 11)
shopping_fashion: (30315, 11)


In [55]:
# Work on an independent copy so the cleaning steps never modify the frame
# stored in df_dict.
df = df_dict["travel_vacation"].copy(deep=True)


In [57]:
# Per-column count of missing values, framed by a 100-character rule
rule = "-" * 100
print(rule, "Number of None records by column", rule, df.isnull().sum(), sep="\n")

----------------------------------------------------------------------------------------------------
Number of None records by column
----------------------------------------------------------------------------------------------------
0
company_name            0
review_link            76
company_score         208
num_reviews           208
categories_page       208
address               208
is_uk                   0
date                  208
score                 208
review                208
reviews_page          340
company_name_clean      0
address_clean           0
review_clean            0
dtype: int64


In [58]:
# Address used when a company has no non-blank address at all
default_address = "united kingdom"

# Drop rows that have a missing value in any column
df = df.dropna()

# Drop duplicates
df = df.drop_duplicates()

# Lower-cased copies of the free-text columns (missing values become "")
for col_str in ["company_name", "address", "review"]:
    df[f"{col_str}_clean"] = df[col_str].fillna("").str.lower()

# Clean is_uk column: a company is UK-based if ANY of its addresses mentions
# "uk" or "united kingdom" as a whole word. Word boundaries matter here - a
# plain substring test would also flag e.g. "ukraine" or "milwaukee".
# NOTE(review): is_uk is derived from the scraped addresses before the
# blank-address default below is applied, so a company with only blank addresses
# ends up with address_clean "united kingdom" but is_uk False - confirm intended.
uk_address_pattern = r"\b(?:uk|united kingdom)\b"
address_mentions_uk = df["address_clean"].str.contains(uk_address_pattern, regex=True)
df["is_uk"] = address_mentions_uk.groupby(df["company_name_clean"]).transform("any")


def _most_common_non_blank(addresses):
    """Return the most frequent non-blank address, or the default if there is none."""
    non_blank = addresses[addresses.str.strip() != ""]
    modes = non_blank.mode()
    return modes.iloc[0] if not modes.empty else default_address


# Find most common address for each company - that isn't blank
most_common_address = (
    df.groupby("company_name_clean")["address_clean"]
    .agg(_most_common_non_blank)
    .reset_index()
)
most_common_address.columns = ["company_name_clean", "most_common_address_clean"]

# Merge and replace the most common address (the merge also resets the row index)
df = pd.merge(df, most_common_address, on='company_name_clean', how='left')
df["address_clean"] = df["most_common_address_clean"]
df = df.drop("most_common_address_clean", axis=1)

# Preview one row per company
df.drop_duplicates(subset="company_name").head(100)

Unnamed: 0,company_name,review_link,company_score,num_reviews,categories_page,address,is_uk,date,score,review,reviews_page,company_name_clean,address_clean,review_clean
0,P&O Cruises,https://uk.trustpilot.com/review/pocruises.com,2.7,1966,3,united kingdom,True,2023-11-12,1,Transatlantic on Arvia returning to Manchester...,1,p&o cruises,united kingdom,transatlantic on arvia returning to manchester...
980,British Airways,https://uk.trustpilot.com/review/www.britishai...,1.4,9597,4,,False,2023-11-20,1,Four flights on round trip Edinburgh to Montre...,1,british airways,united kingdom,four flights on round trip edinburgh to montre...
1900,Premier Inn,https://uk.trustpilot.com/review/www.premierin...,2.3,4593,4,united kingdom,True,2023-11-20,1,Stayed at premier in Newark for 2 nights 20th ...,1,premier inn,united kingdom,stayed at premier in newark for 2 nights 20th ...
2839,lastminute.com,https://uk.trustpilot.com/review/www.lastminut...,2.6,53330,4,united states,False,2023-11-22,1,Worst travel agency We booked ticket over phon...,1,lastminute.com,united states,worst travel agency we booked ticket over phon...
3119,Travelodge,https://uk.trustpilot.com/review/www.travelodg...,2.3,3094,4,united kingdom,True,2022-12-06,1,I stayed in this hotel for around 6 full weeks...,1,travelodge,united kingdom,i stayed in this hotel for around 6 full weeks...
4079,Trusted Travel,https://uk.trustpilot.com/review/trustedtravel...,2.6,2764,5,"10 chiswick court, fy39tw, blackpool | lancash...",True,2023-11-09,1,I Urge Do not use this bogus booking company.W...,1,trusted travel,"10 chiswick court, fy39tw, blackpool | lancash...",i urge do not use this bogus booking company.w...
4319,Easyjet,https://uk.trustpilot.com/review/www.easyjet.com,1.4,15923,6,united kingdom,True,2023-11-19,1,DO NOT RENT a car through easyJet appI had to ...,1,easyjet,united kingdom,do not rent a car through easyjet appi had to ...
5119,Gotogate.co.uk,https://uk.trustpilot.com/review/www.gotogate....,2.8,29590,9,united kingdom,True,2023-11-19,5,Gave the best price for tickets to New York an...,1,gotogate.co.uk,united kingdom,gave the best price for tickets to new york an...
5239,Virgin Atlantic,https://uk.trustpilot.com/review/www.virgin-at...,1.7,1665,11,united kingdom,True,2023-11-15,1,Terrible experience.I bought 2 tickets from Br...,1,virgin atlantic,united kingdom,terrible experience.i bought 2 tickets from br...
6079,Ryanair,https://uk.trustpilot.com/review/www.ryanair.com,1.4,19489,12,united kingdom,True,2023-11-21,1,Theft - they have everything organized to stea...,1,ryanair,united kingdom,theft - they have everything organized to stea...


In [64]:
# How often each raw address string occurs for every company
address_counts = df.groupby("company_name_clean")["address"].value_counts()
address_counts.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
company_name_clean,address,Unnamed: 2_level_1
a1travel.com,,641
aer lingus,united kingdom,900
big bus tours london,"london, united states",480
british airways,,920
city sightseeing,"city sightseeing ltd suite 8, floor 3 grosvenor house , b97 4dl, redditch, worcestershire, united kingdom",440
dream world travel,"central chambers, suite21 ealing broadway, london., w5 2nr, uk, london, united kingdom",780
easyjet,united kingdom,800
ebookers,,833
emirates,united states,620
esky travel,united kingdom,120


In [115]:
# Terms that suggest the reviewer was contacted (or tried to make contact) by message
keywords = ['whatsapp', 'whats app', 'message', 'texted', 'text', 'sms', 'messaged']

# Lower-case the review text; missing reviews become "" so they simply don't match
df["review_clean"] = df["review"].fillna("").str.lower()

# Anchor every keyword at the start of a word: "text", "texts" and "texting"
# still match, but "text" buried inside another word (e.g. "context") does not.
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + ")"
df["contains_keyword"] = df["review_clean"].str.contains(pattern, regex=True)

In [118]:
# Export the reviews that mention a messaging keyword AND belong to a UK company
mentions_keyword = df["contains_keyword"].eq(True)
uk_company = df["is_uk"].eq(True)
df.loc[mentions_keyword & uk_company].to_csv("travel_vacation_example_df.csv")

(475, 14)