In [1]:
import requests
import bs4
import pandas as pd
import datetime
import sqlalchemy
import time
from contextlib import contextmanager
import random
import urllib3
from nordvpn_connect import initialize_vpn, rotate_VPN, close_vpn_connection
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
def change_ip(country="Poland"):
    """Switch the outgoing IP address by connecting to a NordVPN server.

    Args:
        country: Server location handed to ``initialize_vpn``. Defaults to
            "Poland", the value that used to be hard-coded here.
    Returns:
        Nothing. The connection is changed as a side effect.
    """
    settings = initialize_vpn(country)  # prepare NordVPN for the requested location
    rotate_VPN(settings)  # actually connect to a server

In [3]:
@contextmanager
def database(url):
    """Provide a SQLAlchemy engine for the duration of a ``with`` block.

    Args:
        url: Connection URL handed to ``sqlalchemy.create_engine``.
    Yields:
        The engine created for ``url``.

    The engine is disposed when the block exits, whether it finished
    normally or raised.
    """
    engine = sqlalchemy.create_engine(url)
    try:
        yield engine
    finally:
        # Always tear the engine down, even if the caller's block failed.
        engine.dispose()

In [4]:
def read_last_page_db(db_url=None):
    """Return the highest "Page Number" stored in the ``cars`` table.

    Args:
        db_url: Optional connection URL. When omitted, the ``COPART_DB_URL``
            environment variable is used; if that is unset too, the previous
            local default is used.
    Returns:
        The single value produced by ``SELECT MAX("Page Number") FROM cars``.
    """
    import os

    if db_url is None:
        # SECURITY: the fallback keeps the original hard-coded credentials so
        # existing no-argument calls keep working. Set COPART_DB_URL and
        # delete the literal before this notebook is shared.
        db_url = os.environ.get(
            "COPART_DB_URL",
            "postgresql://postgres:Congitos211!!!@localhost:5432/copart",
        )

    with database(db_url) as db:
        # Engine.execute() with a raw string no longer exists in SQLAlchemy 2.0;
        # an explicit connection plus text() works on both 1.x and 2.x.
        with db.connect() as connection:
            result = connection.execute(
                sqlalchemy.text('SELECT MAX("Page Number") FROM cars')
            )
            row = result.fetchone()  # the aggregate yields exactly one row
            return row[0]

In [5]:
def check_for_null(df):
    """Drop rows with missing values, which may appear when a page was scraped incorrectly.

    Args:
        df: DataFrame to clean. It is modified in place.
    Returns:
        The same DataFrame object, now without any row that held a null.
    """
    # The previous body also built a frame of the rejected rows, but it was
    # never used or returned, so that dead computation is gone.
    df.dropna(inplace=True)
    return df

In [6]:
def timed(func):
    """Decorator that reports how long each call to ``func`` takes.

    Args:
        func: Callable to wrap.
    Returns:
        A wrapper that forwards every positional and keyword argument to
        ``func``, prints the function name followed by ' executed in ' and
        the elapsed seconds, and returns ``func``'s result.
    """
    import functools

    label = getattr(func, '__name__', repr(func))

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        start = time.perf_counter()  # monotonic clock, meant for measuring intervals
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(label + ' executed in ' + str(elapsed) + 's')
        return result
    return wrapper

In [7]:
@timed
def extract():
    """Scrape consecutive lot pages from bids-history.com until one cannot be parsed.

    Scraping starts at a fixed page number and moves forward one page at a
    time. As soon as building a record raises IndexError (treated as being
    blocked by the server), the IP address is changed and everything
    collected so far is returned.

    Args:
        None.
    Returns:
        DataFrame built from the collected records.
    """
    # Fixed starting page; resuming from the database is currently switched off.
    current_page = 4969538
    #page_number = read_last_page_db()
    records = []
    while True:
        try:
            page_soup = extract_data_and_create_soup(current_page)
            record = dictionary_with_data_and_features(page_soup, current_page)
            # Fields that are not part of the key/value grid on the page.
            lot_number(record, page_soup)
            web_adress(record)
            car_year(record, page_soup)
            car_model(record, page_soup)
        except IndexError:
            # Blocked by the server: switch IP, report, and hand back what we have.
            change_ip()
            print('Number of records stored ' + str(len(records)))
            return converting_dictionary_to_dataframe(records)
        records.append(record)
        current_page += 1

In [8]:
def load():
    """Placeholder for the load step.

    Intended to write the data to the SQL database in batches and close the
    connection. Not implemented yet: calling it does nothing.
    """

In [9]:
def etl():
    """Scrape auction details from bids-history.com and clean them.

    Args:
        None.
    Returns:
        The transformed pandas DataFrame. Nothing is written to the database
        here; null filtering via check_for_null is currently disabled.
    """
    # extract() already returns a DataFrame indexed by "Page Number". Running
    # it through converting_dictionary_to_dataframe() a second time fails,
    # because "Page Number" is no longer a column that set_index can find.
    df = extract()
    return transform(df)
    

In [10]:
def converting_dictionary_to_dataframe(dictionary):
    """Build a DataFrame indexed by "Page Number" from the scraped records.

    Args:
        dictionary: Data accepted by ``pd.DataFrame.from_dict`` with
            ``orient='columns'``; ``extract`` passes a list of per-page dicts.
    Returns:
        Pandas DataFrame with "Page Number" as its index. Empty input gives an
        empty frame whose index is already named "Page Number".
    """
    df = pd.DataFrame.from_dict(dictionary, orient='columns')
    if df.empty and "Page Number" not in df.columns:
        # Nothing was scraped (e.g. blocked on the very first page). set_index
        # would raise KeyError here, so return an empty, correctly named frame.
        return pd.DataFrame(index=pd.Index([], name="Page Number"))
    return df.set_index("Page Number")

In [11]:
def extract_data_and_create_soup(page_number, timeout=30):
    """Download one lot page and parse it.

    Args:
        page_number: Number appended to the lot URL.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection blocks forever.
    Returns:
        bs4.BeautifulSoup object parsed with the 'lxml' parser.
    """
    url = "https://bids-history.com/lot/" + str(page_number) + "/"
    # verify=False skips TLS certificate verification.
    # NOTE(review): confirm this is still required for the site.
    res = requests.get(url, verify=False, timeout=timeout)
    # NOTE: per the original author's notes, a 'Not Found' body on a high page
    # number means there is no more data, and page numbers start from 69600.
    # Neither condition is checked here.
    return bs4.BeautifulSoup(res.text, 'lxml')

In [12]:
def dictionary_with_data_and_features(soup, page_number):
    """Collect the label/value pairs found in the page's ``.col-6`` elements.

    Args:
        soup: bs4.BeautifulSoup object of a lot page.
        page_number: Page number stored under the "Page Number" key.
    Returns:
        Dictionary that starts with "Page Number" and then maps every
        even-positioned ``.col-6`` text (label) to the text following it (value).
    """
    # Newlines are removed from every cell text before pairing.
    cells = [node.text.replace("\n", "") for node in soup.select(".col-6")]
    record = {"Page Number": page_number}
    # zip stops at the shorter side, so a trailing label without a value is dropped.
    record.update(zip(cells[0::2], cells[1::2]))
    return record

In [13]:
def convert_to_num(column_name, df):
    """Split a money column such as "$2,450 USD" into an integer amount and a currency.

    Args:
        column_name: Name of the column holding "amount currency" strings.
        df: DataFrame to update.
    Returns:
        Nothing. ``df[column_name]`` becomes the integer amount and
        ``df['Currency']`` receives the text after the first whitespace.
    """
    # Split once on whitespace: column 0 is the amount, column 1 the currency.
    parts = df[column_name].str.split(n=1, expand=True)
    # Assign whole columns back instead of calling replace(inplace=True) on
    # df[column_name]: that chained in-place update is not guaranteed to reach
    # df (it does not under pandas Copy-on-Write). The raw string also avoids
    # the invalid "\$" escape sequence.
    df[column_name] = parts[0].str.replace(r"[,$]", "", regex=True).astype(int)
    df['Currency'] = parts[1]

In [14]:
 def odometer_to_km(df):
     """Convert the 'Odometer:' column from text in miles to whole kilometres.

     Args:
         df: DataFrame with an 'Odometer:' column of strings such as "65,802 mi".
     Returns:
         Nothing. ``df['Odometer:']`` is overwritten with integers; the
         fractional kilometre is truncated, not rounded.
     """
     # Build the cleaned Series and assign it back as a whole column. The old
     # replace(inplace=True) on df['Odometer:'] was a chained in-place update
     # that is not guaranteed to reach df (it does not under Copy-on-Write).
     miles = (
         df['Odometer:']
         .str.replace(r",|mi", "", regex=True)
         .str.strip()  # drop the blank left behind once "mi" is removed
         .astype(int)
     )
     df['Odometer:'] = (miles * 1.60934).astype(int)  # miles to KM

In [15]:
def convert_date_str_to_date(df):
    """Parse the 'Auction Date:' strings into datetimes.

    Args:
        df: DataFrame with an 'Auction Date:' column of strings such as
            "Sept. 1, 2020, 10 a.m.".
    Returns:
        Nothing. ``df['Auction Date:']`` is overwritten with the parsed values.
    Raises:
        ValueError: If a value matches none of the supported layouts.
    """
    # Bring long or unusual month spellings to the 3-letter form %b understands.
    month_fixes = {
        'March': 'Mar',
        'April': 'Apr',
        'June': 'Jun',
        'July': 'Jul',
        'Sept.': 'Sep',
    }
    text = df['Auction Date:']
    # regex=False everywhere: '.' must be a literal dot regardless of the
    # pandas version's default for the regex argument.
    for spelled, abbreviated in month_fixes.items():
        text = text.str.replace(spelled, abbreviated, regex=False)
    text = (
        text.str.replace(',', '', regex=False)
        .str.replace('.', '', regex=False)
        .str.upper()
    )
    df['Auction Date:'] = [_parse_auction_timestamp(value) for value in text]


def _parse_auction_timestamp(text):
    """Parse one cleaned, upper-cased auction date such as 'SEP 1 2020 10 AM'."""
    # %I (12-hour clock) is required for %p to take effect. With %H the AM/PM
    # marker was ignored, so "1 PM" came out as 01:00. The second layout
    # covers times that carry minutes, in case the site lists any.
    for layout in ('%b %d %Y %I %p', '%b %d %Y %I:%M %p'):
        try:
            return datetime.datetime.strptime(text, layout)
        except ValueError:
            continue
    raise ValueError('Unrecognised auction date: ' + repr(text))

In [16]:
def engine_type(car_dict):
    """Reduce 'Engine Type:' text such as "2.4L 4" to the float written before the "L".

    Args:
        car_dict: Mapping or DataFrame with an 'Engine Type:' entry of strings.
    Returns:
        Nothing. 'Engine Type:' is overwritten with a float Series.
    """
    # Only the first five characters of each value are looked at.
    leading = pd.Series(car_dict['Engine Type:']).map(lambda text: text[:5])
    trimmed = leading.str.strip().str.replace('L', '')
    car_dict['Engine Type:'] = trimmed.astype(float)

In [17]:
def cilinders(car_dict):
    """Convert every 'Cylinders:' value to ``int``.

    Args:
        car_dict: Mapping or DataFrame with a 'Cylinders:' entry.
    Returns:
        Nothing. 'Cylinders:' is overwritten with the converted Series.
    """
    raw_counts = pd.Series(car_dict['Cylinders:'])
    car_dict['Cylinders:'] = raw_counts.map(int)

In [18]:
def lot_number(car_dict, soup):
    """Store the lot number, read from the second ``td a`` element, as an int.

    Args:
        car_dict: Record to update.
        soup: Parsed page supporting ``select``.
    Returns:
        Nothing. Sets ``car_dict['Lot number:']``.
    Raises:
        IndexError: If the page has fewer than two ``td a`` elements.
    """
    links = soup.select("td a")
    second_link = links[1]
    car_dict['Lot number:'] = int(second_link.text)

In [19]:
def web_adress(car_dict):
    """Store the lot page URL, built from the record's 'Page Number'.

    Args:
        car_dict: Record holding 'Page Number'.
    Returns:
        Nothing. Sets ``car_dict['Web adress']``.
    """
    page = car_dict['Page Number']
    car_dict['Web adress'] = f"https://bids-history.com/lot/{page!s}"

In [20]:
def car_year(car_dict, soup):
    """Store the first four characters of the last ``ol li`` element's text.

    Args:
        car_dict: Record to update.
        soup: Parsed page supporting ``select``.
    Returns:
        Nothing. Sets ``car_dict['Production year']`` (still a string).
    """
    last_item = soup.select("ol li")[-1]
    car_dict['Production year'] = last_item.text[:4]

In [21]:
def car_model(car_dict, soup):
    """Store the last ``ol li`` element's text from its sixth character onward.

    Args:
        car_dict: Record to update.
        soup: Parsed page supporting ``select``.
    Returns:
        Nothing. Sets ``car_dict['Car Model']``.
    """
    last_item = soup.select("ol li")[-1]
    # The first five characters are skipped; car_year() stores the first four.
    car_dict['Car Model'] = last_item.text[5:]

In [22]:
def location(car_dict):
    """Store the first two characters of each 'Doc Type:' value as 'Location'.

    TODO (from the original author): the parameter is still named after a
    dict although it is used with a DataFrame.

    Args:
        car_dict: Mapping or DataFrame with a 'Doc Type:' entry of strings.
    Returns:
        Nothing. Sets ``car_dict['Location']``.
    """
    documents = pd.Series(car_dict['Doc Type:'])
    car_dict['Location'] = documents.map(lambda text: text[:2])

In [23]:
def doc_type(car_dict):
    """Drop the first five characters of every 'Doc Type:' value.

    TODO (from the original author): the parameter is still named after a
    dict although it is used with a DataFrame.

    Args:
        car_dict: Mapping or DataFrame with a 'Doc Type:' entry of strings.
    Returns:
        Nothing. 'Doc Type:' is overwritten with the shortened text.
    """
    documents = pd.Series(car_dict['Doc Type:'])
    car_dict['Doc Type:'] = documents.map(lambda text: text[5:])

In [24]:
def production_year(car_dict):
    """Cast the 'Production year' column to ``int``.

    Args:
        car_dict: DataFrame (or mapping of Series) with a 'Production year' entry.
    Returns:
        Nothing. 'Production year' is overwritten with the cast values.
    """
    years = car_dict['Production year']
    car_dict['Production year'] = years.astype(int)

# Ad-hoc load step: write the scraped frame to Postgres and read it back.
# NOTE(review): in the visible notebook `df` is only assigned further down
# (`df = extract()`), so this section fails on a fresh top-to-bottom run.
import sqlalchemy

# NOTE(review): the connection URI embeds the database password; move it to an
# environment variable or a secrets store before sharing this notebook.
connection_uri = "postgresql://postgres:Congitos211!!!@localhost:5432/copart"
db_engine_copart = sqlalchemy.create_engine(connection_uri)

# Write df to the "cars" table. if_exists="replace" drops the existing table
# first, so rows stored by earlier runs are discarded.
df.to_sql("cars", con=db_engine_copart,  if_exists="replace")

# Read the whole table back to check what was stored.
pd.read_sql("SELECT * FROM cars", db_engine_copart)

In [25]:
# Run the scraper; the returned frame is indexed by "Page Number".
df = extract() 

2021-02-06 14:28:54.634 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:100 - You're using Windows.
Performing system check...

2021-02-06 14:28:54.638 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:120 - NordVPN installation check: OK
2021-02-06 14:28:54.650 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:127 - NordVPN service check: OK
2021-02-06 14:28:54.651 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:130 - Opening NordVPN app and disconnecting if necessary...
2021-02-06 14:28:54.974 | INFO     | nordvpn_connect.nordvpn_connect:start_vpn_windows:137 - NordVPN app launched: OK
2021-02-06 14:28:54.975 | INFO     | nordvpn_connect.nordvpn_connect:initialize_vpn:71 - Done!
2021-02-06 14:28:55.278 | INFO     | nordvpn_connect.nordvpn_connect:check_old_ip:187 - Your current ip-address is: 5.253.206.156
2021-02-06 14:28:55.279 | INFO     | nordvpn_connect.nordvpn_connect:rotate_VPN:155 - Connecting you to poland
2021-02-06 14

Number of records stored 12
extract executed in 17.08253526687622s


In [34]:
def transform(df):
    """Run every cleaning step on the scraped frame, in the required order.

    Args:
        df: DataFrame as produced by the scraper. The steps write their
            results into this frame.
    Returns:
        The same ``df`` object after all steps have run.
    """
    # Monetary columns first: each becomes a number plus a shared 'Currency' column.
    for money_column in ('Final bid:', 'Estimated Repair Cost:', 'Estimated Retail Value:'):
        convert_to_num(money_column, df)
    # Order matters: location() reads the start of 'Doc Type:' before
    # doc_type() cuts that part off.
    steps = (
        location,
        doc_type,
        odometer_to_km,
        convert_date_str_to_date,
        engine_type,
        cilinders,
        production_year,
    )
    for step in steps:
        step(df)
    return df

In [33]:
# Work on a copy: transform() writes its results into the frame it is given.
df_test = df.copy()

In [28]:
# Preview the first scraped rows.
df.head()

Unnamed: 0_level_0,Final bid:,Doc Type:,Odometer:,Highlights:,Primary Damage:,Secondary Damage:,Estimated Repair Cost:,Estimated Retail Value:,VIN:,Auction Date:,Body Style:,Engine Type:,Cylinders:,Transmission:,Drive:,Fuel:,Lot number:,Web adress,Production year,Car Model
Page Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4969538,"$2,450 USD",TX - SALVAGE VEHICLE TITLE,"65,802 mi",Runs And Drives,FRONT END,REAR END,"$12,305 USD","$11,587 USD",5NPE34AFXGH******,"Sept. 1, 2020, 10 a.m.",SEDAN 4D,2.4L 4,4,AUTOMATIC,Front-wheel Drive,GAS,38861040,https://bids-history.com/lot/4969538,2016,HYUNDAI SONATA SPORT
4969539,$275 USD,MN - PARTS VEHICLE BILL OF SALE,"169,335 mi",Runs And Drives,FRONT END,REAR END,$0 USD,"$3,945 USD",1HGES16533L******,"Sept. 1, 2020, 10 a.m.",SEDAN 4D,1.7L 4,4,AUTOMATIC,Front-wheel Drive,GAS,38864880,https://bids-history.com/lot/4969539,2003,HONDA CIVIC LX
4969540,$575 USD,TX - SALVAGE VEHICLE TITLE,"145,227 mi",Engine Start Program,FRONT END,REAR END,"$9,828 USD","$4,303 USD",1HGCM66585A******,"Sept. 1, 2020, 10 a.m.",SEDAN 4D,3.0L 6,6,AUTOMATIC,Front-wheel Drive,GAS,38868300,https://bids-history.com/lot/4969540,2005,HONDA ACCORD EX
4969541,"$3,750 USD",TX - SALVAGE VEHICLE TITLE,"157,087 mi",Runs And Drives,SIDE,TOP/ROOF,"$26,095 USD","$11,800 USD",5TFEV54119X******,"Sept. 1, 2020, 10 a.m.",CREW PIC,5.7L 8,8,AUTOMATIC,Rear-wheel drive,GAS,38868890,https://bids-history.com/lot/4969541,2009,TOYOTA TUNDRA CREWMAX
4969542,"$1,950 USD",MN - CERTIFICATE OF TITLE,"190,704 mi",Runs And Drives,FRONT END,MINOR DENT/SCRATCHES,$0 USD,"$4,454 USD",2CNALBEC6B6******,"Sept. 1, 2020, 10 a.m.",4DR SPOR,2.4L 4,4,AUTOMATIC,Front-wheel Drive,GAS,38870130,https://bids-history.com/lot/4969542,2011,CHEVROLET EQUINOX LS


In [35]:
# Run the cleaning steps on the copy; transform() returns the frame it was given.
transformed = transform(df_test)

In [36]:
# Preview the first rows after the transform.
transformed.head()

Unnamed: 0_level_0,Final bid:,Doc Type:,Odometer:,Highlights:,Primary Damage:,Secondary Damage:,Estimated Repair Cost:,Estimated Retail Value:,VIN:,Auction Date:,...,Cylinders:,Transmission:,Drive:,Fuel:,Lot number:,Web adress,Production year,Car Model,Currency,Location
Page Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4969538,2450,SALVAGE VEHICLE TITLE,105897,Runs And Drives,FRONT END,REAR END,12305,11587,5NPE34AFXGH******,2020-09-01 10:00:00,...,4,AUTOMATIC,Front-wheel Drive,GAS,38861040,https://bids-history.com/lot/4969538,2016,HYUNDAI SONATA SPORT,USD,TX
4969539,275,PARTS VEHICLE BILL OF SALE,272517,Runs And Drives,FRONT END,REAR END,0,3945,1HGES16533L******,2020-09-01 10:00:00,...,4,AUTOMATIC,Front-wheel Drive,GAS,38864880,https://bids-history.com/lot/4969539,2003,HONDA CIVIC LX,USD,MN
4969540,575,SALVAGE VEHICLE TITLE,233719,Engine Start Program,FRONT END,REAR END,9828,4303,1HGCM66585A******,2020-09-01 10:00:00,...,6,AUTOMATIC,Front-wheel Drive,GAS,38868300,https://bids-history.com/lot/4969540,2005,HONDA ACCORD EX,USD,TX
4969541,3750,SALVAGE VEHICLE TITLE,252806,Runs And Drives,SIDE,TOP/ROOF,26095,11800,5TFEV54119X******,2020-09-01 10:00:00,...,8,AUTOMATIC,Rear-wheel drive,GAS,38868890,https://bids-history.com/lot/4969541,2009,TOYOTA TUNDRA CREWMAX,USD,TX
4969542,1950,CERTIFICATE OF TITLE,306907,Runs And Drives,FRONT END,MINOR DENT/SCRATCHES,0,4454,2CNALBEC6B6******,2020-09-01 10:00:00,...,4,AUTOMATIC,Front-wheel Drive,GAS,38870130,https://bids-history.com/lot/4969542,2011,CHEVROLET EQUINOX LS,USD,MN


In [37]:
# Summary statistics of the transformed frame.
transformed.describe()

Unnamed: 0,Final bid:,Odometer:,Estimated Repair Cost:,Estimated Retail Value:,Engine Type:,Cylinders:,Lot number:,Production year
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,1914.583333,219663.333333,5952.916667,6265.5,3.15,5.333333,38890260.0,2008.083333
std,1504.215982,67882.334713,8053.150016,4085.634467,1.572202,1.775251,25855.71,5.838093
min,275.0,105897.0,0.0,0.0,1.6,4.0,38861040.0,1999.0
25%,556.25,194694.0,0.0,4213.5,1.95,4.0,38868740.0,2003.0
50%,1800.0,228738.0,1679.0,4559.0,2.4,4.0,38878180.0,2007.5
75%,3087.5,257733.75,10062.25,11166.0,4.3,6.5,38917400.0,2014.0
max,4650.0,311367.0,26095.0,11800.0,6.0,8.0,38923660.0,2016.0


In [32]:
# Inspect the 'Engine Type:' column after its conversion to float.
transformed['Engine Type:']

Page Number
4969538    2.4
4969539    1.7
4969540    3.0
4969541    5.7
4969542    2.4
4969543    1.6
4969544    6.0
4969545    4.6
4969546    2.4
4969547    2.0
4969548    4.2
4969549    1.8
Name: Engine Type:, dtype: float64