In [4]:
import datetime
import functools
import os
import random
import time
from contextlib import contextmanager

import bs4
import pandas as pd
import requests
import sqlalchemy
import urllib3
from nordvpn_connect import initialize_vpn, rotate_VPN, close_vpn_connection
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [5]:
def change_ip(country="Poland"):
    """Connect to a NordVPN server in the requested country.

    Args:
        country: Location passed to ``initialize_vpn``. Defaults to
            "Poland", the value that used to be hard-coded here.

    Returns:
        None. The VPN connection is switched as a side effect.
    """
    settings = initialize_vpn(country)  # prepare NordVPN for the requested location
    rotate_VPN(settings)  # actually connect to a server

In [6]:
@contextmanager
def database(url):
    """Context manager that yields a SQLAlchemy engine for ``url``.

    Args:
        url: Connection URL passed to ``sqlalchemy.create_engine``.

    Yields:
        The engine created for ``url``. It is disposed when the ``with``
        block exits, whether normally or through an exception.
    """
    engine = sqlalchemy.create_engine(url)
    try:
        yield engine
    finally:
        # Always release the engine, even if the caller's block raised.
        engine.dispose()

In [7]:
def read_last_page_db(db_url=None):
    """Return the highest "Page Number" stored in the ``cars`` table.

    Args:
        db_url: Optional connection URL. When omitted, the ``COPART_DB_URL``
            environment variable is used if it is set, otherwise the legacy
            local URL below.

    Returns:
        The first column of the single row produced by
        ``SELECT MAX("Page Number") FROM cars`` (presumably ``None`` when the
        table is empty, since MAX over no rows is NULL).
    """
    if db_url is None:
        # FIXME(security): the fallback still embeds a password in source.
        # Set COPART_DB_URL, rotate the credential and drop this literal.
        db_url = os.environ.get(
            "COPART_DB_URL",
            "postgresql://postgres:Congitos211!!!@localhost:5432/copart",
        )

    with database(db_url) as db:
        # Engine.execute() with a raw string no longer exists in SQLAlchemy 2.0;
        # an explicit connection plus text() works on both 1.x and 2.x.
        with db.connect() as connection:
            result = connection.execute(
                sqlalchemy.text('SELECT MAX("Page Number") FROM cars')
            )
            row = result.fetchone()  # the aggregate yields exactly one row
            return row[0]

In [8]:
def check_for_null(df):
    """Drop rows containing null values, which may appear when scraping goes wrong.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, now without any row that had a null value.
    """
    # In place on purpose: callers holding a reference to ``df`` see the
    # cleaned frame as well, exactly as before.
    df.dropna(inplace=True)
    return df

In [32]:
def timed(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: Callable to wrap.

    Returns:
        A wrapper that forwards every positional and keyword argument to
        ``func``, prints "<name> executed in <seconds>s" and returns
        ``func``'s result.
    """
    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement cannot go negative
        # if the wall clock is adjusted while func runs.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(func.__name__ + ' executed in ' + str(elapsed) + 's')
        return result
    return wrapper

In [37]:
@timed
def extract(start_page=4969538):
    """Scrape consecutive lot pages from bids-history.com into a DataFrame.

    Pages are requested one after another starting at ``start_page``. The
    first ``IndexError`` raised while fetching or parsing a page is treated as
    being blocked by the server: the IP is changed, scraping stops, and the
    records collected so far are converted and returned.

    Args:
        start_page: First page number to request. Defaults to the value that
            used to be hard-coded in the body.

    Returns:
        The DataFrame built by ``converting_dictionary_to_dataframe`` from the
        list of per-page dictionaries.
    """
    page_number = start_page
    dict_list = []
    while True:
        try:
            soup = extract_data_and_create_soup(page_number)
            soup_dict = dictionary_with_data_and_features(soup, page_number)
            # Add fields that are not part of the ".col-6" key/value grid
            lot_number(soup_dict, soup)
            web_adress(soup_dict)
            car_year(soup_dict, soup)
            car_model(soup_dict, soup)
        except IndexError:
            # In case of being blocked by the server, change IP and stop
            change_ip()
            print('Number of records stored ' + str(len(dict_list)))
            return converting_dictionary_to_dataframe(dict_list)
        # Only fully populated records are kept
        dict_list.append(soup_dict)
        page_number += 1

In [11]:
def transform(df):
    """Clean the scraped columns of ``df`` and return it.

    Args:
        df: DataFrame with the scraped auction columns; the helper functions
            called here modify it in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Monetary columns go through convert_to_num one by one
    for money_column in ('Final bid:',
                         'Estimated Repair Cost:',
                         'Estimated Retail Value:'):
        convert_to_num(money_column, df)

    # location() must run before doc_type(): it reads 'Doc Type:', which
    # doc_type() then overwrites.
    location(df)
    doc_type(df)

    odometer_to_km(df)
    convert_date_str_to_date(df)

    # engine_type() and cilinders() are not applied yet.
    return df

In [12]:
def load():
    """Placeholder for the load step; it currently does nothing.

    Intended to load the data in batches into the SQL database and close the
    connection.
    """
    return None

In [13]:
def etl():
    """Scrape bids-history.com and return the cleaned auction data.

    Args:
        None.

    Returns:
        The DataFrame produced by ``transform``. Loading into the SQL
        database is not done here.
    """
    extracted = extract()
    if isinstance(extracted, pd.DataFrame):
        # extract() already returns a frame indexed by "Page Number";
        # converting it a second time fails in set_index because the column
        # no longer exists.
        df = extracted
    else:
        df = converting_dictionary_to_dataframe(extracted)
    #df = check_for_null(df)
    df = transform(df)
    return df
    

In [14]:
def converting_dictionary_to_dataframe(dictionary):
    """Build a DataFrame from the scraped records and index it by page number.

    Args:
        dictionary: Data handed to ``pd.DataFrame.from_dict`` with
            ``orient='columns'``; it must produce a "Page Number" column.

    Returns:
        Pandas DataFrame whose index is the "Page Number" column.
    """
    return pd.DataFrame.from_dict(dictionary, orient='columns').set_index("Page Number")

In [15]:
def extract_data_and_create_soup(page_number, timeout=30):
    """Request one lot page and parse the response into a soup.

    Args:
        page_number: Lot page identifier appended to the base URL.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without it a stalled connection would block the scraper forever.

    Returns:
        bs4.BeautifulSoup object built from the response text with the
        'lxml' parser.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = "https://bids-history.com/lot/" + str(page_number) + "/"
    # NOTE(review): certificate verification is switched off for this request.
    res = requests.get(url, verify=False, timeout=timeout)
    # If the text is 'Not Found' and page_number is high, no more data is
    # available and the update should stop. Page numbers start from 69600.
    return bs4.BeautifulSoup(res.text, 'lxml')

In [16]:
def dictionary_with_data_and_features(soup, page_number):
    """Turn the ".col-6" cells of a page into a feature -> value dictionary.

    The cells alternate: feature name, value, feature name, value, ...
    Newlines are removed from every cell text. A trailing feature name with
    no value is dropped.

    Args:
        soup: Object whose ``select(".col-6")`` returns elements with a
            ``text`` attribute (a bs4.BeautifulSoup in practice).
        page_number: Stored under the "Page Number" key.

    Returns:
        Dictionary with features as keys and data as values, with
        "Page Number" first.
    """
    cell_texts = [cell.text.replace("\n", "") for cell in soup.select(".col-6")]
    keys = ["Page Number"] + cell_texts[0::2]   # even positions hold feature names
    values = [page_number] + cell_texts[1::2]   # odd positions hold their values
    return dict(zip(keys, values))

In [17]:
def convert_to_num(column_name, df):
    """Split a monetary column into an integer amount and a 'Currency' column.

    Each value is split once on whitespace (e.g. "$2,450 USD" becomes
    "$2,450" and "USD"); commas and dollar signs are stripped from the first
    part before it is converted to int.

    Args:
        column_name: Name of the column in ``df`` holding the amounts.
        df: DataFrame to update.

    Returns:
        Nothing. ``df[column_name]`` is replaced by the integer amounts and
        ``df['Currency']`` is set to the second part of each value.
    """
    parts = df[column_name].str.split(n=1, expand=True)
    # Assign the result back instead of calling replace(..., inplace=True) on
    # a column selection: that form does not update df under Copy-on-Write.
    amounts = parts[0].str.replace(r"[,$]", "", regex=True)
    df[column_name] = amounts.astype(int)
    df['Currency'] = parts[1]

In [18]:
 def odometer_to_km(df):
     """Convert the 'Odometer:' column from miles text to whole kilometres.

     Values such as "65,802 mi" have the commas and the "mi" unit removed,
     are converted to int and multiplied by 1.60934; the result is truncated
     to int.

     Args:
         df: DataFrame with an 'Odometer:' column of strings.

     Returns:
         Nothing. ``df['Odometer:']`` is replaced by the kilometre values.
     """
     # Assign the result back instead of calling replace(..., inplace=True) on
     # a column selection: that form does not update df under Copy-on-Write.
     miles_text = df['Odometer:'].str.replace(r",|mi", "", regex=True).str.strip()
     df['Odometer:'] = (miles_text.astype(int) * 1.60934).astype(int)  # miles to KM

In [19]:
def _parse_auction_timestamp(text):
    """Parse a normalised auction date such as "SEP 1 2020 10 AM"."""
    try:
        # %I (12-hour clock) is required for %p to take effect; with %H the
        # AM/PM marker is ignored and "1 PM" would be read as 01:00.
        return datetime.datetime.strptime(text, '%b %d %Y %I %p')
    except ValueError:
        # Same layout with minutes, e.g. "SEP 1 2020 10:30 AM"
        return datetime.datetime.strptime(text, '%b %d %Y %I:%M %p')


def convert_date_str_to_date(df):
    """Convert the 'Auction Date:' strings to datetime values.

    Input values look like "Sept. 1, 2020, 10 a.m.". Long month spellings are
    shortened to three letters, commas and dots are removed, the text is
    upper-cased and then parsed.

    Args:
        df: DataFrame with an 'Auction Date:' column of strings.

    Returns:
        Nothing. ``df['Auction Date:']`` is replaced by the parsed dates.

    Raises:
        ValueError: If a value matches neither expected layout.
    """
    dates = df['Auction Date:']

    # Consolidating month abbreviations. regex=False keeps the '.' in 'Sept.'
    # literal whatever the pandas default for ``regex`` is.
    month_abbreviations = (
        ('March', 'Mar'),
        ('April', 'Apr'),
        ('June', 'Jun'),
        ('July', 'Jul'),
        ('Sept.', 'Sep'),
    )
    for long_form, short_form in month_abbreviations:
        dates = dates.str.replace(long_form, short_form, regex=False)

    # Deleting commas and points
    for punctuation in (',', '.'):
        dates = dates.str.replace(punctuation, '', regex=False)

    # Converting data to date type
    dates = dates.str.upper()
    df['Auction Date:'] = [_parse_auction_timestamp(text) for text in dates]

In [20]:
def engine_type(car_dict):
    """Shorten the 'Engine Type:' entry of ``car_dict`` in place.

    Takes element 0 of the stored value and removes from it every occurrence
    of its own tail (everything after the third character). The result is
    stored back as a string; no numeric conversion happens here.
    """
    first_entry = car_dict['Engine Type:'][0]
    tail = first_entry[3:]
    car_dict['Engine Type:'] = first_entry.replace(tail, '')

In [21]:
def cilinders(car_dict):
    """Replace the 'Cylinders:' entry of ``car_dict`` with its int value, in place."""
    cylinder_count = int(car_dict['Cylinders:'])
    car_dict['Cylinders:'] = cylinder_count

In [22]:
def lot_number(car_dict, soup):
    """Store the page's lot number in ``car_dict['Lot number:']`` as an int.

    The number is the text of the second element matched by the "td a"
    selector; indexing fails with IndexError when fewer than two match.
    """
    links = soup.select("td a")
    second_link = links[1]
    car_dict['Lot number:'] = int(second_link.text)

In [23]:
def web_adress(car_dict):
    """Store the lot URL built from ``car_dict['Page Number']`` under 'Web adress'."""
    page = car_dict['Page Number']
    car_dict['Web adress'] = f"https://bids-history.com/lot/{page!s}"

In [24]:
def car_year(car_dict, soup):
    """Store the first four characters of the last "ol li" text as 'Production year'."""
    last_item = soup.select("ol li")[-1]
    car_dict['Production year'] = last_item.text[:4]

In [25]:
def car_model(car_dict, soup):
    """Store the last "ol li" text from its sixth character on as 'Car Model'."""
    last_item = soup.select("ol li")[-1]
    car_dict['Car Model'] = last_item.text[5:]

In [26]:
def location(car_dict):
    """Store the first two characters of 'Doc Type:' under 'Location'.

    Works for a single record (dict with a string value) and for a DataFrame
    (column of strings), which is what ``transform`` passes in.

    Args:
        car_dict: Dict or DataFrame with a 'Doc Type:' entry such as
            "TX - SALVAGE VEHICLE TITLE".

    Returns:
        Nothing. ``car_dict['Location']`` is set in place.
    """
    doc = car_dict['Doc Type:']
    # Slicing a Series with [0:2] would select the first two ROWS; .str
    # slices each string value instead.
    car_dict['Location'] = doc[0:2] if isinstance(doc, str) else doc.str[0:2]

In [27]:
def doc_type(car_dict):
    """Strip the five-character location prefix from 'Doc Type:'.

    Works for a single record (dict with a string value) and for a DataFrame
    (column of strings), which is what ``transform`` passes in.

    Args:
        car_dict: Dict or DataFrame with a 'Doc Type:' entry such as
            "TX - SALVAGE VEHICLE TITLE".

    Returns:
        Nothing. ``car_dict['Doc Type:']`` is overwritten in place.
    """
    doc = car_dict['Doc Type:']
    # Slicing a Series with [5:] would drop the first five ROWS; .str slices
    # each string value instead.
    car_dict['Doc Type:'] = doc[5:] if isinstance(doc, str) else doc.str[5:]

import sqlalchemy

# NOTE: this cell expects `df` to exist already in the kernel.
# FIXME(security): the fallback URL still embeds a password in source. Set
# COPART_DB_URL, rotate the credential and drop the literal.
connection_uri = os.environ.get(
    "COPART_DB_URL",
    "postgresql://postgres:Congitos211!!!@localhost:5432/copart",
)
db_engine_copart = sqlalchemy.create_engine(connection_uri)

# Write df to the "cars" table; if_exists="replace" replaces any existing table
df.to_sql("cars", con=db_engine_copart,  if_exists="replace")

# Run the query to fetch the data back
pd.read_sql("SELECT * FROM cars", db_engine_copart)

In [39]:
# Run the scraper. `extract` is wrapped by `timed`, so the elapsed time is
# printed after the record count.
df = extract() 

2021-02-04 21:11:59.530 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:100 - You're using Windows.
Performing system check...

2021-02-04 21:11:59.532 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:120 - NordVPN installation check: OK
2021-02-04 21:11:59.544 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:127 - NordVPN service check: OK
2021-02-04 21:11:59.545 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:130 - Opening NordVPN app and disconnecting if necessary...
2021-02-04 21:11:59.944 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:137 - NordVPN app launched: OK
2021-02-04 21:11:59.945 | INFO     | nordvpn_connect.nordvpn_connect:initialize_vpn:71 - Done!
2021-02-04 21:12:00.433 | INFO     | nordvpn_connect.nordvpn_connect:check_old_ip:187 - Your current ip-address is: 217.138.209.84
2021-02-04 21:12:00.433 | INFO     | nordvpn_connect.nordvpn_connect:rotate_VPN:155 - Connecting you to poland
2021-02-04 2

Number of records stored 12
extract executed in 17.58206605911255s


In [40]:
# Display the scraped DataFrame.
df

Unnamed: 0_level_0,Final bid:,Doc Type:,Odometer:,Highlights:,Primary Damage:,Secondary Damage:,Estimated Repair Cost:,Estimated Retail Value:,VIN:,Auction Date:,Body Style:,Engine Type:,Cylinders:,Transmission:,Drive:,Fuel:,Lot number:,Web adress,Production year,Car Model
Page Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4969538,"$2,450 USD",TX - SALVAGE VEHICLE TITLE,"65,802 mi",Runs And Drives,FRONT END,REAR END,"$12,305 USD","$11,587 USD",5NPE34AFXGH******,"Sept. 1, 2020, 10 a.m.",SEDAN 4D,2.4L 4,4,AUTOMATIC,Front-wheel Drive,GAS,38861040,https://bids-history.com/lot/4969538,2016,HYUNDAI SONATA SPORT
4969539,$275 USD,MN - PARTS VEHICLE BILL OF SALE,"169,335 mi",Runs And Drives,FRONT END,REAR END,$0 USD,"$3,945 USD",1HGES16533L******,"Sept. 1, 2020, 10 a.m.",SEDAN 4D,1.7L 4,4,AUTOMATIC,Front-wheel Drive,GAS,38864880,https://bids-history.com/lot/4969539,2003,HONDA CIVIC LX
4969540,$575 USD,TX - SALVAGE VEHICLE TITLE,"145,227 mi",Engine Start Program,FRONT END,REAR END,"$9,828 USD","$4,303 USD",1HGCM66585A******,"Sept. 1, 2020, 10 a.m.",SEDAN 4D,3.0L 6,6,AUTOMATIC,Front-wheel Drive,GAS,38868300,https://bids-history.com/lot/4969540,2005,HONDA ACCORD EX
4969541,"$3,750 USD",TX - SALVAGE VEHICLE TITLE,"157,087 mi",Runs And Drives,SIDE,TOP/ROOF,"$26,095 USD","$11,800 USD",5TFEV54119X******,"Sept. 1, 2020, 10 a.m.",CREW PIC,5.7L 8,8,AUTOMATIC,Rear-wheel drive,GAS,38868890,https://bids-history.com/lot/4969541,2009,TOYOTA TUNDRA CREWMAX
4969542,"$1,950 USD",MN - CERTIFICATE OF TITLE,"190,704 mi",Runs And Drives,FRONT END,MINOR DENT/SCRATCHES,$0 USD,"$4,454 USD",2CNALBEC6B6******,"Sept. 1, 2020, 10 a.m.",4DR SPOR,2.4L 4,4,AUTOMATIC,Front-wheel Drive,GAS,38870130,https://bids-history.com/lot/4969542,2011,CHEVROLET EQUINOX LS
4969543,"$3,050 USD",MO - SALVAGE CERTIFICATE OF TITLE,"76,770 mi",Engine Start Program,BURN - INTERIOR,MECHANICAL,"$10,765 USD","$11,140 USD",1FMCU0GX1EU******,"Sept. 1, 2020, 10 a.m.",4DR SPOR,1.6L 4,4,AUTOMATIC,Front-wheel Drive,GAS,38873550,https://bids-history.com/lot/4969543,2014,FORD ESCAPE SE
4969544,"$1,650 USD",TX - CERTIFICATE OF TITLE,"139,037 mi",Engine Start Program,UNDERCARRIAGE,MINOR DENT/SCRATCHES,"$3,358 USD","$5,578 USD",1GKFK66U46J******,"Sept. 1, 2020, 10 a.m.",4DR SPOR,6.0L 8,8,AUTOMATIC,All wheel drive,GAS,38882820,https://bids-history.com/lot/4969544,2006,GMC YUKON XL DENALI
4969545,$325 USD,MN - CERTIFICATE OF TITLE,"146,974 mi",Runs And Drives,SIDE,TOP/ROOF,$0 USD,"$4,545 USD",1LNHM82W4XY******,"Sept. 22, 2020, 10 a.m.",SEDAN 4D,4.6L 8,8,AUTOMATIC,Rear-wheel drive,GAS,38909700,https://bids-history.com/lot/4969545,1999,LINCOLN TOWN CAR SIGNATURE
4969546,"$4,650 USD",MN - CERT OF TITLE-SALVAGE,"80,657 mi",Runs And Drives,FRONT END,MINOR DENT/SCRATCHES,"$9,084 USD","$11,244 USD",1C4NJDEBXFD******,"Sept. 1, 2020, 10 a.m.",4DR SPOR,2.4L 4,4,AUTOMATIC,4x4 w/Front Whl Drv,GAS,38916560,https://bids-history.com/lot/4969546,2015,JEEP COMPASS LATITUDE
4969547,$500 USD,MN - CERTIFICATE OF TITLE,"134,418 mi",Runs And Drives,FRONT END,MINOR DENT/SCRATCHES,$0 USD,"$2,017 USD",KMHDN45D52U******,"Sept. 1, 2020, 10 a.m.",SEDAN 4D,2.0L 4,4,AUTOMATIC,Front-wheel Drive,GAS,38919930,https://bids-history.com/lot/4969547,2002,HYUNDAI ELANTRA GLS
