In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np

## Web Scraping

In [2]:
# Base URLs of the Australian and New Zealand Westfield sites.
# Centre slugs and offer paths are appended to these further down.
URL_main = 'https://www.westfield.com.au'
URL_main_nz = 'https://www.westfield.co.nz'
# Example URLs (kept for reference only, not used by the code):
#URL_centre = 'https://www.westfield.com.au/bondijunction/whats-happening'
#URL_offers = 'https://www.westfield.com.au/bondijunction/offer/5lV2u2FyI19BiEP3MKFhyX/midseasonsale'

In [22]:
def get_all_centres(timeout=30):
    """Fetch the list of centres from the Scentre Group GraphQL API.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the API before giving up (default 30). Without
        a timeout ``requests`` would wait forever on a stalled connection.

    Returns
    -------
    pandas.DataFrame
        One row per centre with the queried fields (title, suburb, state,
        country, slug, status) plus two derived columns:
        ``CentreLink`` (site base URL + slug) and ``WhatsonLink``
        (``CentreLink`` + ``/whats-happening``).

    Raises
    ------
    Exception
        If the API answers with a non-200 status code, or answers 200 but
        the payload carries no ``data.centres`` (GraphQL-level failure).
    """
    #Query to send to graphql api
    query_centre="""
                query{
                    centres{
                        title
                        suburb
                        state
                        country
                        slug
                        status
                          }
                     }
                """
    #Make a call to Scentre group's open graphql API that holds the centres information
    response = requests.post('https://api.scentregroup.io/v1/graphql',
                             json={'query': query_centre},
                             timeout=timeout)

    if response.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, query_centre))

    graphql_json = response.json() #results will be in json format

    # GraphQL servers can report failures with HTTP 200 and an "errors" list
    # (with "data" absent or null), so check the payload explicitly instead
    # of letting the drill-down below fail with a KeyError/TypeError.
    centres = (graphql_json.get('data') or {}).get('centres')
    if centres is None:
        raise Exception("Query returned no centres data. Errors: {}".format(graphql_json.get('errors')))

    df_centres = pd.DataFrame(centres)

    # Australian centres live on the .com.au site, all others on the .co.nz site.
    is_australian = df_centres['country'] == 'Australia'
    df_centres['CentreLink'] = np.where(is_australian,
                                        URL_main + '/' + df_centres['slug'],
                                        URL_main_nz + '/' + df_centres['slug'])
    # The what's-happening page always sits directly under the centre page.
    df_centres['WhatsonLink'] = df_centres['CentreLink'] + '/whats-happening'

    return df_centres

In [24]:
# Fetch the centre directory once; the what's-on scraping below iterates over this frame.
df_centres=get_all_centres()

In [25]:
# Preview the first five centres, including the derived CentreLink / WhatsonLink columns.
df_centres.head(5)

Unnamed: 0,title,suburb,state,country,slug,status,CentreLink,WhatsonLink
0,Doncaster,Doncaster,VIC,Australia,doncaster,Open,https://www.westfield.com.au/doncaster,https://www.westfield.com.au/doncaster/whats-h...
1,Newmarket,Newmarket,AUCKLAND,New Zealand,newmarket,Open,https://www.westfield.co.nz/newmarket,https://www.westfield.co.nz/newmarket/whats-ha...
2,Whitford City,Hillarys,WA,Australia,whitfordcity,Open,https://www.westfield.com.au/whitfordcity,https://www.westfield.com.au/whitfordcity/what...
3,West Lakes,West Lakes,SA,Australia,westlakes,Open,https://www.westfield.com.au/westlakes,https://www.westfield.com.au/westlakes/whats-h...
4,Warringah Mall,Brookvale,NSW,Australia,warringahmall,Open,https://www.westfield.com.au/warringahmall,https://www.westfield.com.au/warringahmall/wha...


In [26]:
# Find what's on at a given centre
def get_today_whatson_list(df_centres, timeout=30):
    """Collect the links listed under "today" on each centre's what's-happening page.

    Parameters
    ----------
    df_centres : pandas.DataFrame
        Must contain the columns ``WhatsonLink`` (page to fetch), ``title``
        (centre name) and ``country`` (selects which site base URL is
        prefixed to each link).
    timeout : float, optional
        Seconds to wait for each page request (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``Hyperlinks`` and ``Centres``, de-duplicated (first
        occurrence kept) with a fresh 0..n-1 index. Empty, but with both
        columns present, when ``df_centres`` has no rows or no page
        contains any links.
    """
    list_on_today = []
    list_centre_name = []

    # Iterate by position rather than by label so a filtered frame (whose
    # index is no longer 0..n-1) works too.
    rows = zip(df_centres['WhatsonLink'], df_centres['title'], df_centres['country'])
    for url, centre_name, country in rows:
        page_centre = requests.get(url, timeout=timeout)
        soup_centre = BeautifulSoup(page_centre.content, 'lxml')
        link_today = soup_centre.find(id='today-list')

        # A page without a "today-list" element has nothing to contribute;
        # skip it instead of crashing on None.find_all.
        if link_today is None:
            continue

        base_url = URL_main if country == 'Australia' else URL_main_nz
        for link in link_today.find_all('a', href=True): #find all link tags
            list_on_today.append(base_url + link.get('href'))
            list_centre_name.append(centre_name)

    # Build and de-duplicate the frame once, after all centres are scraped,
    # rather than on every iteration of the loop.
    df_on_today = pd.DataFrame({'Hyperlinks': list_on_today, 'Centres': list_centre_name})
    return df_on_today.drop_duplicates(keep='first').reset_index(drop=True)

In [27]:
# Scrape every centre's what's-happening page (one HTTP request per row of df_centres).
df_whatson=get_today_whatson_list(df_centres=df_centres)

In [28]:
# Preview the first five scraped links and the centre each belongs to.
df_whatson.head(5)

Unnamed: 0,Hyperlinks,Centres
0,https://www.westfield.com.au/doncaster/offer/5...,Doncaster
1,https://www.westfield.com.au/doncaster/offer/2...,Doncaster
2,https://www.westfield.com.au/doncaster/offer/O...,Doncaster
3,https://www.westfield.com.au/doncaster/offer/4...,Doncaster
4,https://www.westfield.com.au/doncaster/offer/1...,Doncaster


In [32]:
# Go to each what's on link and scrape relevant content
def scrape_content_from_link(df_link):
    list_centre=[]
    list_retailer=[]
    list_time=[]
    list_header=[]
    list_para=[] 
    list_link=[]
    
    for i in range(len(df_link)):
        URL_offers = df_link['Hyperlinks'][i]
        centres_name = df_link['Centres'][i]
        page_offer = requests.get(URL_offers)
        soup_offer = BeautifulSoup(page_offer.content, 'lxml')
    
        retailer = soup_offer.find('h3', 
            attrs={'class': "index__small___1e5XG typography__headingSmall___yBDIq typography__heading___3ravE index__heading___2Wcro"})
        time_range = soup_offer.find('li', 
            attrs={'class': "index__occurrence___3etDh index__occurrenceHasIcon___2_Kxf"})
        header = soup_offer.find('h1', 
            attrs={'class':"index__large___NFuOE typography__headingLarge___20L47 typography__heading___3ravE"})
        paragraph = soup_offer.find('p', 
            attrs={'class': "index__large___1gqog typography__paragraphLarge___2lmtJ typography__bodyFont___1exgA"})
    
        if retailer is None: #Need to do this for retailer as webpages have ununiformed layout
            list_retailer.append('Missing')
        else:
            list_retailer.append(retailer.text)
    
        if time_range is None: #Need to do this for time as some posts are not time bound
            list_time.append('Missing')
        else:
            list_time.append(time_range.text)
        
        list_header.append(header.text)
        list_para.append(paragraph.text)
        list_centre.append(centres_name)
        list_link.append(URL_offers)
        
    df_content=pd.DataFrame({'Centres':list_centre,'Retailer':list_retailer,'EffectiveTime':list_time, 
                             'link':list_link, 'Header': list_header, 'Paragraph':list_para}) 
    
    #impute the missing retailer with the text before ":" in the header
    df_content['Retailer'][df_content['Retailer']=='Missing']= df_content['Header'][df_content['Retailer']=='Missing'].str.split(':').str[0]
    df_content['Vebatim']=df_content['Header']+'. '+df_content['Paragraph']
    
    return df_content

In [33]:
# Scraping the full list takes too long, so the full run is left disabled:
#df_offers = scrape_content_from_link(df_link=df_whatson)

# Chose 3 centres as a test for now, this will have approximately 140+ posts.
# Filter and renumber in one chained expression so the scraper sees a 0..n-1 index.
df_sample = df_whatson.loc[
    df_whatson['Centres'].isin(['Sydney','Parramatta','Bondi Junction'])
].reset_index(drop=True)

df_offers = scrape_content_from_link(df_link=df_sample)

In [34]:
# Preview the first five scraped posts.
df_offers.head(5)

Unnamed: 0,Centres,Retailer,EffectiveTime,link,Header,Paragraph,Vebatim
0,Sydney,The Other Art Fair pop-up exhibition,"Tue 21st Apr, 10:00am - 11:55pm",https://www.westfield.com.au/sydney/event/019i...,The Other Art Fair pop-up exhibition,We’ve partnered with the team at The Other Art...,The Other Art Fair pop-up exhibition. We’ve pa...
1,Sydney,Review,20th Mar - 23rd Mar,https://www.westfield.com.au/sydney/offer/4EIF...,Review: 25% off store wide,25% Off Store Wide,Review: 25% off store wide. 25% Off Store Wide
2,Sydney,alice McCALL,19th Mar - 26th Mar,https://www.westfield.com.au/sydney/offer/12Ti...,alice McCALL: Mid Season Sale - Take a further...,"Shop alice McCALL's Mid Season Sale, take a fu...",alice McCALL: Mid Season Sale - Take a further...
3,Sydney,Kookai,19th Mar - 28th Mar,https://www.westfield.com.au/sydney/offer/6ecx...,KOOKAI: Mid Season Sale,Kookai Mid Season Sale - Up To 50% Off Selecte...,KOOKAI: Mid Season Sale . Kookai Mid Season Sa...
4,Sydney,Swarovski,19th Mar - 27th Mar,https://www.westfield.com.au/sydney/offer/6Qkp...,Swarovski: travel jewellery box gift with purc...,Receive your free Travel Jewellery Box when yo...,Swarovski: travel jewellery box gift with purc...


In [35]:
# Persist the sample scrape to disk as a semicolon-separated file (index column included).
df_offers.to_csv('Whats_on_SYD_BON_PAR.csv',sep=';')