In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
review_list = []

In [3]:
# define a function to scrape the reviews from each page, input is a tuple from start page to end page
def scrape_reviews(page_range: tuple):
    for page_number in range(page_range[0], page_range[1]+1):
        try:
            # Create a new instance of the Chrome driver
            driver = webdriver.Chrome()
            driver.get("https://www.glassdoor.co.uk/Reviews/Zara-Reviews-E17544_P"+str(page_number)+".htm?filter.iso3Language=eng")
            
            # Find all elements with matching XPath
            review_elements = driver.find_elements(By.XPATH, '//*[starts-with(@id, "empReview_")]')
            
            # Iterate through each element and extract the text
            for element in review_elements:
                review = element.text
                # append each review to a list
                review_list.append(review)
            #close the driver
            driver.quit()
        except:
            # if a unknown error occurs, try again
            driver = webdriver.Chrome()
            driver.get("https://www.glassdoor.co.uk/Reviews/Zara-Reviews-E17544_P"+str(page_number)+".htm?filter.iso3Language=eng")
            ids = driver.find_elements(By.XPATH, '//*[starts-with(@id, "empReview_")]')
            for element in ids:
                review = element.text
            review_list.append(review)
            driver.quit()
            # continue to the next page
            continue

In [6]:
scrape_reviews((65,100))

In [8]:
# convert the list to a dictionary
review_dict = {}
for i in range(len(review_list)):
    review_dict[i+1] = review_list[i]
# convert the dictionary to a csv file
df = pd.DataFrame.from_dict(review_dict, orient='index', columns=['review'])

In [57]:
# create the following columns in the df: overall rates, status(whether the reviewer is a current employee or not),title,date,position,location,pros,cons.
# split the review column into the above columns
df['overall_rates'] = df['review'].str.split('\n', expand=True)[0]
df['status'] = df['review'].str.split('\n', expand=True)[2]
df['title'] = df['review'].str.split('\n', expand=True)[3]
# the date,position,location information is all in a single split row, so we need to split it again
# the date and the position split by a '-', and the position and the location split by a string ' in '
df['date'] = df['review'].str.split('\n', expand=True)[4].str.split('-', expand=True)[0]
df['position'] = df['review'].str.split('\n', expand=True)[4].str.split('-', expand=True)[1].str.split(' in ', expand=True)[0]
df['location'] = df['review'].str.split('\n', expand=True)[4].str.split('-', expand=True)[1].str.split(' in ', expand=True)[1]

df['pros'] = df['review'].str.split('\n', expand=True)[9]
df['cons'] = df['review'].str.split('\n', expand=True)[11]


In [61]:
# if the pros and cons have "-" in front of the text, remove it
df['pros'] = df['pros'].str.replace('-','')
df['cons'] = df['cons'].str.replace('-','')
# fill the NaN values with 'None'
df = df.fillna('None')

In [62]:
# create a new df withou the review column
df_reviews = df.drop(columns=['review'])
# eda of the df_reviews
df_reviews.info()
df_reviews.describe()
df_reviews.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 918 entries, 1 to 918
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   overall_rates  918 non-null    object
 1   status         918 non-null    object
 2   title          918 non-null    object
 3   date           918 non-null    object
 4   position       918 non-null    object
 5   location       918 non-null    object
 6   pros           918 non-null    object
 7   cons           918 non-null    object
dtypes: object(8)
memory usage: 96.8+ KB


Unnamed: 0,overall_rates,status,title,date,position,location,pros,cons
1,5.0,Former Employee,Retail assistant review,8 May 2023,Retail Assistant,,Was fun I enjoyed working there.,Rude customers during the sales
2,2.0,"Current Employee, more than 3 years",Avoid!!,23 May 2023,Sales Associate,"Manchester City Centre, England",1. Making different people 2. Potential for ...,1. Overworking yourself 2. Long hours 3. Manag...
3,3.0,Former Employee,Not the worst but not the best either,22 May 2023,Retail Sales Assistant,,Not bad salary Not too stressful,Uniform Enviornment Staff and managers not ver...
4,3.0,Former Employee,Experience at Zara,28 Jan 2023,Sales Assistant,,I was able to broaden my skills whilst workin...,I feel that there were a handful of employees ...
5,3.0,Former Employee,law salaries,26 May 2023,Sales Associate,"Reading, England","career growth, good team, good manager",always very busy and not enough staff


In [None]:
df_reviews.to_csv('F:/Meta/Web/zara_reviews.csv', index=False, encoding='utf-8-sig')


In [76]:
# get most frequent words in pros and cons
from collections import Counter
import nltk
nltk.download('stopwords')
import re

# define a function to get the most frequent words in a column
def get_most_frequent_words(column):
    # create a list to store all the words
    all_words = []
    # iterate through each row in the column
    for row in column:
        # split the text by space
        words = row.split(' ')
        # iterate through each word in the row
        for word in words:
            # if the word is not a stopword, append it to the list
            if word not in stopwords.words('english'):
                all_words.append(word)
    # get the top 10 most frequent words
    most_frequent_words = Counter(all_words).most_common(10)
    # return the most frequent words
    return most_frequent_words

# get the most frequent words in pros and cons
most_frequent_words_in_pros = get_most_frequent_words(df_reviews['pros'])
most_frequent_words_in_cons = get_most_frequent_words(df_reviews['cons'])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sunwe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [77]:
most_frequent_words_in_cons

[('work', 229),
 ('I', 224),
 ('', 196),
 ('hours', 150),
 ('management', 127),
 ('staff', 127),
 ('managers', 127),
 ('get', 112),
 ('time', 104),
 ('working', 97)]

In [78]:
most_frequent_words_in_pros

[('good', 232),
 ('Good', 184),
 ('discount', 165),
 ('work', 163),
 ('pay', 121),
 ('', 110),
 ('nice', 104),
 ('people', 95),
 ('I', 93),
 ('staff', 84)]