In [None]:
from bs4 import BeautifulSoup 
import requests as r
import pandas as pd
import re
from fake_useragent import UserAgent
import time
from datetime import datetime
import tkinter as tk
from tkinter import messagebox

In [None]:
def _labelled_entry(parent, caption, row, label_grid=None, entry_grid=None):
    """Place a caption label (column 0) and a text entry (column 1) on one grid row.

    `label_grid` / `entry_grid` are optional dicts of extra `grid()` options.
    Returns the `(label, entry)` widgets.
    """
    caption_widget = tk.Label(parent, text=caption)
    caption_widget.grid(row=row, column=0, **(label_grid or {}))
    entry_widget = tk.Entry(parent)
    entry_widget.grid(row=row, column=1, **(entry_grid or {}))
    return caption_widget, entry_widget


# Main window: full screen width, 95% of the screen height.
root = tk.Tk()
width = root.winfo_screenwidth()
height = root.winfo_screenheight()
root.geometry(f"{width}x{int(height*0.95)}")
root.title("Main")
root.option_add("*font", "TkDefaultFont 16")

header = tk.Label(root, text="Car Price Scraper", font=("Arial", 32))
header.pack(padx=10, pady=30)

y_pad = 30
# Filled in by the submit callback once the inputs validate; stays None otherwise.
input_data = None

# Input form: one labelled entry per scraper limit.
frame = tk.Frame(root, pady=y_pad)
label, price = _labelled_entry(
    frame, "Maximum Car Price (SGD) ", 0, label_grid={'sticky': 'e'}
)
label2, rows = _labelled_entry(
    frame, "Maximum No. of Rows of Data ", 1,
    label_grid={'sticky': 'e'}, entry_grid={'pady': y_pad},
)
label3, duration = _labelled_entry(
    frame, "Maximum Time Spent Scraping (minutes) ", 2, entry_grid={'sticky': 'e'}
)


def check_sub():
    """Validate the three form entries and, if they are acceptable, store them and close the window.

    Reads the `rows`, `duration` and `price` entries (in that order). On success the
    module-level `input_data` is set to `[rows, duration, price]` as floats and the
    root window is destroyed. On failure a warning dialog is shown and `input_data`
    is left untouched.
    """
    global input_data
    data = [field.get().strip() for field in (rows, duration, price)]

    # isdecimal() (rather than isdigit()) only accepts characters that float() can
    # parse; isdigit() also accepts e.g. superscript digits, which made float() raise.
    # An empty string is not decimal, so this also covers missing input.
    if not all(value.isdecimal() for value in data):
        messagebox.showwarning(title="Invalid Input", message="All inputs are required and must contain numeric digits ONLY")
        return

    values = [float(value) for value in data]
    if any(value == 0 for value in values):
        messagebox.showwarning(title="Contains Zero", message="Inputs must be positive")
        return
    if values[1] > 240:  # duration is entered in minutes; 240 min = 4 hours
        messagebox.showwarning(title="Long Period", message="Scraper cannot run longer than 4 hours")
        return

    input_data = values
    root.destroy()

# Submit button: runs the validation callback when pressed.
sub = tk.Button(frame, command=check_sub, text="Submit")
sub.grid(pady=2 * y_pad, column=2, row=3)
frame.pack()

# Run the Tk event loop; check_sub destroys the root window once the inputs validate.
root.mainloop()

In [None]:
class InputError(Exception):
    """Raised when the input form did not produce a complete set of values."""


# input_data is still None when the window was closed without a valid submission.
if input_data is None:
    raise InputError("Full Input Data Not Entered By User")
# Version 3 of the used-car scraper.
# Built with bs4 and requests rather than scrapy to keep it low-level and simple, without having to rely on virtual envs.
# This lets the code run 'in place': no separate project code (spider definitions etc.) has to be set up.

In [None]:
# First listing page (100 used cars per page); only used here to read the price-range menu.
url='https://www.sgcarmart.com/used_cars/listing.php?BRSR=0&RPG=100&AVL=2&VEH=0'
request=r.get(url,timeout=30)  # timeout so a stalled connection cannot hang the notebook
soup=BeautifulSoup(request.text,'html.parser')

max_price=input_data[-1]  # maximum price the user is willing to pay (last form value)

# The site segments listings by price range (one link per range), so every range whose
# upper bound is within the budget has to be visited separately.
price_menu=soup.find('div',class_='price_child dropdown_child')
if price_menu is None:
    raise RuntimeError('Price range menu not found on the listing page; the page layout may have changed')

start_urls=[]
for e in price_menu.find_all('a')[1:]:  # the first link is skipped, as in the original code
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    amounts=re.findall(r'\d+,?\d+',str(e.text))
    if not amounts:
        continue  # link text without a price figure; nothing to compare against
    max_range=int(amounts[-1].replace(',',''))  # upper bound = last figure in the link text

    if max_price>=max_range:
        start_urls.append('https://www.sgcarmart.com/used_cars/'+(e.get('href')))
print(start_urls)
title=(f'Cars_below_{max_price/1000:.0f}k_v2') # title for excel sheet (referenced at end)

In [None]:
# User-agent generator; the scraping loop draws a random agent string from it (ua.random)
# for every listing page it requests.
ua=UserAgent() # initialises a user agent

In [None]:
REQUEST_TIMEOUT=30  # seconds; without a timeout a stalled connection would block forever


def _int_or_none(text,digits_only=False):
    """Convert scraped text to int, returning None when it cannot be parsed.

    With `digits_only=True` every non-digit character (units, separators) is dropped first.
    """
    try:
        if digits_only:
            text=''.join(re.findall(r'\d',text))
        return int(text)
    except (TypeError,ValueError):
        return None


def _budget_exhausted():
    """True once the time budget (minutes) or the row budget has been used up."""
    elapsed_min=(time.time()-c_time)/60
    return elapsed_min>overall_min_time or len(overall)>=overall_nrow


# Result table: one row per scraped listing.
overall=pd.DataFrame(columns=['Car_name','Price','Car_type','Car_transmission','Reg_date','Manufacture_date',
'Engine_cap_cc','Car_weight_kg','Brake_hp','COE_term_len','No_owners','Car_url','Description'])
c_time=time.time()  # scrape start time, reference point for the time budget
stop=False  # set once a budget is exhausted; ends the page loop and the outer loop

overall_min_time = int(input_data[1])  # maximum scraping time in minutes
overall_nrow = int(input_data[0])  # maximum number of rows to collect

for url in start_urls:  # one start url per price range
    if stop:
        break
    while True:  # one iteration per result page of the current price range
        # A fresh random user agent per page request, so that repeated requests do not all
        # carry the same agent; only agents containing 'Windows' are accepted.
        agent=str(ua.random)
        while 'Windows' not in agent:
            agent=str(ua.random)
        request_headers={'User-Agent':agent}

        try:
            request=r.get(url,headers=request_headers,timeout=REQUEST_TIMEOUT)
        except r.RequestException:  # request failed: give up on this price range
            print('Code Exited')
            break
        soup=BeautifulSoup(request.text,'html.parser')

        for e in soup.find_all('table',{'style':'table-layout: fixed;'}):
            # Only tables whose title link points at a car detail page are listings.
            strong=e.find('strong')
            link=strong.find('a') if strong is not None else None
            href=link.get('href') if link is not None else None
            if not href or not href.startswith('info.php'):
                continue

            name=link.text
            car_url='https://www.sgcarmart.com/used_cars/'+href
            try:
                # the detail page holds information that is not on the listing page
                c_request=r.get(car_url,headers=request_headers,timeout=REQUEST_TIMEOUT)
            except r.RequestException as err:
                print(f'Skipping {car_url}: {err}')
                if _budget_exhausted():
                    stop=True
                    break
                continue
            c_soup=BeautifulSoup(c_request.text,'html.parser')

            # The last cell of the last 'row_bg' row is read as the registration date,
            # followed by the remaining COE term in parentheses.
            try:
                reg_cell=c_soup.find_all('tr',class_='row_bg')[-1].find_all('td')[-1].text.strip()
            except (IndexError,AttributeError):
                reg_cell=''
            reg_date=reg_cell.split('(')[0]
            coe_term=re.findall(r'\((.*) C.*\)',reg_cell)
            coe_term_len=coe_term[0] if coe_term else ''

            # Reset per car: a field missing on this page must not inherit the previous
            # car's value (or be undefined for the first car).
            no_owners=engine_cap=weight=transmission=made=None
            for x in c_soup.find_all(class_='eachInfo'):
                info_html=str(x).title()
                info=x.find(class_='row_info')
                value=info.text.strip() if info is not None else None
                if 'No. Of Owners' in info_html:
                    no_owners=_int_or_none(value)
                elif 'Engine Cap' in info_html:
                    engine_cap=_int_or_none(value,digits_only=True)
                elif 'Curb Weight' in info_html:
                    weight=_int_or_none(value,digits_only=True)
                elif 'Transmission' in info_html:
                    transmission=value
                elif 'Manufactured' in info_html:
                    made=_int_or_none(value)

            # Price text with its first character and the thousands separators removed;
            # kept as a string here and converted in a later cell.
            listing_price=(e.find('td',{'class':'font_red'}).text.strip()[1:].replace(',',''))
            car_type=(e.find('a',class_="link_black nounderline").text)
            description_div=e.find('div',class_='two_line_text')
            description=description_div.text if description_div is not None else ''

            # Rows are appended one at a time so that partial results stay available in
            # `overall` if the run is interrupted.
            new_row={'Car_name':name,'Price':listing_price,'Car_type':car_type,'Car_transmission':transmission,
                     'Reg_date':reg_date,'Manufacture_date':made,'Engine_cap_cc':engine_cap,'Car_weight_kg':weight,
                     'Brake_hp':None,  # not scraped; listed so every column has an explicit value
                     'COE_term_len':coe_term_len,'No_owners':no_owners,'Car_url':car_url,'Description':description}
            overall.loc[len(overall)]=new_row
            if _budget_exhausted():
                stop=True
                break

        if stop:  # leave the page loop too, otherwise scraping continues on the next page
            break

        # Follow the last page-bar link when its text starts with 'N'; the next loop
        # iteration fetches it.
        page_links=soup.find_all('a',class_='pagebar')
        if page_links and page_links[-1].text.strip()[:1].upper()=='N':
            url='https://www.sgcarmart.com'+(page_links[-1].get('href'))
        else:
            break
        

In [None]:
# Convert the scraped price text to numbers. This is done after scraping so that one
# unparseable price cannot abort the scrape itself; errors='coerce' turns such values into
# NaN instead of raising, so the remaining rows stay usable.
overall['Price']=pd.to_numeric(overall['Price'],errors='coerce')


In [None]:
def _reg_year(reg_date):
    """Return the part of a registration date string after its last '-'."""
    return reg_date.split('-')[-1]


# Keep only the year of registration, which generally tallies with when the car was made
# (a later year means a more recently made car).
overall['Reg_year']=overall['Reg_date'].apply(_reg_year)

In [None]:
def coe_len(term):
    """Convert a COE term such as '5yrs 3mths 10days' into a number of days.

    Uses 365 days per year and 30 days per month. Each unit is optional; a unit that is
    absent, or present without a number in front of it, counts as 0. Non-string input
    (e.g. a missing value) also yields 0.

    Returns:
        int: total number of days.
    """
    if not isinstance(term, str):
        return 0

    total = 0
    for unit, unit_days in (('yr', 365), ('mth', 30), ('day', 1)):
        # The first "<digits><unit>" occurrence is used; 'yr' also matches 'yrs', etc.
        found = re.search(r'(\d+)' + unit, term)
        if found:
            total += int(found.group(1)) * unit_days
    return total

# Remaining COE term of every listing, expressed in days.
coe_terms=overall['COE_term_len']
overall['COE_term_days']=coe_terms.apply(coe_len)

In [None]:
# Price per remaining COE day: a measure of how 'worth it' the car is. Lower is better, as
# the same amount buys a longer period of ownership. Rows with a COE_term_days of 0 (term
# not parsed) do not give a meaningful ratio.
overall['dollar_p_day']=overall['Price']/overall['COE_term_days']
print(overall.shape)
# sample(5) raises when fewer than 5 rows were scraped (the row limit is user-chosen), so
# the sample size is capped at the number of rows available.
overall.sample(min(5,len(overall)))

In [None]:
# Summary statistics of the scraped prices, shown as the cell output.
price_summary=overall['Price'].describe()
price_summary

In [None]:
# Timestamp for the output filename. '-' is used between hours and minutes because ':' is
# not allowed in Windows filenames.
file_save_time = datetime.now().strftime("%d-%m-%Y-%H-%M")
# Write the table to an Excel sheet in the current working folder, without the index column.
overall.to_excel(f'{title}_{file_save_time}.xlsx',index=False)