In [49]:
import datetime
import functools
import os
import random
import time
from contextlib import contextmanager
from time import sleep

import bs4
import pandas as pd
import requests
import sqlalchemy as sa
import urllib3
from nordvpn_connect import initialize_vpn, rotate_VPN, close_vpn_connection
from sqlalchemy import exc
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [68]:
def change_ip():
    """Reconnect through a NordVPN server in Poland.

    Args:
        None.
    Returns:
        None. The connection change happens as a side effect.
    """
    # initialize_vpn prepares the settings for the requested country;
    # rotate_VPN then performs the actual connection with them.
    vpn_settings = initialize_vpn("Poland")
    rotate_VPN(vpn_settings)

In [61]:
@contextmanager
def database(url):
    """Provide a SQLAlchemy engine for the duration of a ``with`` block.

    Args:
        url: Connection URL handed to ``sa.create_engine``.
    Yields:
        The engine created for ``url``.

    The engine is disposed when the block exits, whether it finished
    normally or raised.
    """
    engine = sa.create_engine(url)
    try:
        yield engine
    finally:
        # Tear the engine down regardless of how the block ended.
        engine.dispose()

In [62]:
def read_last_page_db(db_url=None, default_page=70000):
    """Return the highest ``page_number`` stored in the ``cars`` table.

    Args:
        db_url: Optional connection URL. When omitted, the ``COPART_DB_URL``
            environment variable is used, falling back to the notebook's
            original local connection string.
        default_page: Value returned when the query yields NULL, i.e. the
            table holds no page numbers yet. Defaults to 70000.
    Returns:
        The largest stored page number, or ``default_page`` if there is none.
    """
    if db_url is None:
        # SECURITY NOTE(review): the fallback literal embeds a password. It is
        # kept only so existing runs keep working; set COPART_DB_URL and
        # remove the literal before sharing this notebook.
        db_url = os.environ.get(
            "COPART_DB_URL",
            "postgresql://postgres:Congitos211!!!@localhost:5432/copart",
        )

    with database(db_url) as db:
        # Engine.execute() with a raw string no longer exists in SQLAlchemy
        # 2.0; an explicit connection plus text() works on 1.x and 2.x alike.
        with db.connect() as connection:
            last_page = connection.execute(
                sa.text("SELECT MAX(page_number) FROM cars")
            ).scalar()

    # MAX() over an empty table is NULL, which arrives here as None.
    return default_page if last_page is None else last_page

In [63]:
def check_for_null(df):
    """Drop every row that contains at least one null value.

    Args:
        df: DataFrame to clean. It is modified in place.
    Returns:
        The same DataFrame object, now without any row that held a null.
    """
    # The rows with nulls used to be collected into a local that was never
    # used or returned; that dead computation has been removed.
    df.dropna(inplace=True)
    return df

In [64]:
def timed(func):
    """Decorator that reports how long a call to ``func`` took.

    Args:
        func: Callable to wrap. It may take any positional or keyword
            arguments; they are forwarded unchanged.
    Returns:
        A wrapper that calls ``func``, prints the elapsed wall-clock seconds
        and returns ``func``'s result. Nothing is printed if ``func`` raises.
    """
    # wraps() keeps the decorated function's __name__ and docstring intact.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - start
        # Report the real function name instead of a hard-coded 'extract'.
        print(func.__name__ + ' executed in ' + str(elapsed) + 's')
        return result
    return wrapper

In [131]:
from socket import gethostbyname, gaierror
def extract_test(return_every=12, start_page=70347):
    """Scrape consecutive lot pages until ``return_every`` records are collected.

    Args:
        return_every: Number of successfully scraped pages after which the
            collected records are returned as a DataFrame.
        start_page: First page number to request. Defaults to 70347, the
            value that used to be hard-coded.
    Returns:
        DataFrame built by ``converting_dictionary_to_dataframe`` from the
        collected page dictionaries.
    Raises:
        KeyboardInterrupt: Re-raised after printing a message, so the loop
            can actually be stopped.

    A page that raises IndexError is retried after an IP change and skipped
    once it has failed more than three times. Any ``requests`` error triggers
    an IP change and a retry of the same page.
    """
    page_number = start_page
    dict_list = []
    dict_error = {}  # page_number -> number of IndexError failures so far
    while True:
        try:
            soup = extract_data_and_create_soup(page_number)
            soup_dict = dictionary_with_data_and_features(soup, page_number)
            # Add the fields that are not part of the key/value grid.
            lot_number(soup_dict, soup)
            web_adress(soup_dict)
            car_year(soup_dict, soup)
            car_model(soup_dict, soup)
            dict_list.append(soup_dict)
            page_number = page_number + 1
            if len(dict_list) >= return_every:
                return converting_dictionary_to_dataframe(dict_list)
        except IndexError as e:
            print("OOPS!! Index error")
            print(str(e))
            # len(dict_list) is always defined and is an int, unlike the old
            # counter that was concatenated to a str and could be unbound.
            print('Number of records stored ' + str(len(dict_list)))
            print('Current page_number ' + str(page_number))
            dict_error[page_number] = dict_error.get(page_number, 0) + 1
            if dict_error[page_number] > 3:
                # Give up on this page after repeated failures.
                page_number = page_number + 1
                print('Modified page_number ' + str(page_number))
            print(dict_error)
            print('Changing ip adress')
            change_ip()
        except requests.ConnectionError as e:
            print("OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n")
            print(str(e))
            change_ip()
        except requests.Timeout as e:
            print("OOPS!! Timeout Error")
            print(str(e))
            change_ip()
        except requests.RequestException as e:
            print("OOPS!! General Error")
            print(str(e))
            change_ip()
        except KeyboardInterrupt:
            print("Someone closed the program")
            # Propagate: swallowing the interrupt made the loop unstoppable.
            raise

In [132]:
# Smoke test: scrape until 12 pages have been collected. The returned frame is
# not assigned, so it is only displayed as the cell output.
# NOTE(review): the TypeError recorded under this cell compares a str with an
# int using '>='; extract_test as written compares two ints there, so the
# traceback presumably comes from an earlier revision — re-run to confirm.
extract_test(return_every=12)

TypeError: '>=' not supported between instances of 'str' and 'int'

In [114]:
# NOTE(review): df_test is only assigned in a cell that appears further down
# in this notebook, so this line raises NameError on a fresh top-to-bottom run.
# This binds the same object to a second name; no copy is made.
df = df_test

In [None]:
# Collect one batch with the default return_every=12 and keep it as df1.
df1 = extract_test()

In [115]:
# NOTE(review): range(2) lets `total` reach at most 2, so the `total == 3`
# branch never runs and this cell assigns neither df1 nor df2.
total = 0
for x in range(2):
    total += 1
    if total == 3:
        df1 = extract_test()
        # join="inner" keeps only the columns present in both frames.
        df2 = pd.concat([df, df1], join="inner")
    

In [12]:
def extract():
    """Scrape lot pages that follow the last page stored in the database.

    Pages are requested one after another, starting right after the value
    returned by ``read_last_page_db``. Scraping stops at the first page that
    raises IndexError while its fields are being read.

    Args:
        None.
    Returns:
        DataFrame with one row per scraped page, or None (after printing a
        notice) when not a single page could be scraped.
    """
    next_page = read_last_page_db() + 1
    records = []
    try:
        while True:
            page_soup = extract_data_and_create_soup(next_page)
            record = dictionary_with_data_and_features(page_soup, next_page)
            # Fields that are not part of the key/value grid on the page.
            lot_number(record, page_soup)
            web_adress(record)
            car_year(record, page_soup)
            car_model(record, page_soup)
            records.append(record)
            next_page += 1
    except IndexError:
        # An expected element was missing on the page: end of this run.
        pass

    if not records:
        print('No df to return, check if page exist')
        return None

    print('Number of records stored ' + str(len(records)))
    print('Current page_number ' + str(next_page))
    return converting_dictionary_to_dataframe(records)

# Runs a full scrape as soon as this cell defines extract().
# NOTE(review): `test` is not referenced by any other visible cell.
test = extract()

In [13]:
def transform(df):
    """Clean the scraped frame and convert its text columns to typed values.

    Args:
        df: DataFrame produced by the extract step. It is modified in place.
    Returns:
        The same DataFrame after all conversions.
    """
    # Rows with any missing value are discarded first.
    df.dropna(inplace=True)

    # Monetary columns: split off the currency and cast the amount.
    for money_column in ('Final bid:',
                         'Estimated Repair Cost:',
                         'Estimated Retail Value:'):
        convert_to_num(money_column, df)

    # Order matters: location() reads 'Doc Type:' before doc_type() trims it.
    remaining_steps = (
        location,
        doc_type,
        odometer_to_km,
        convert_date_str_to_date,
        engine_type,
        cilinders,
        production_year,
    )
    for step in remaining_steps:
        step(df)
    return df

In [14]:
def load(df):
    """Rename the columns to their database names and append the rows to SQL.

    Args:
        df: DataFrame to store. Its columns and index name are renamed in
            place before loading.
    Returns:
        None.
    """
    rename_colums(df)
    load_to_sql_db(df)

In [15]:
@timed
def etl():
    """Run extract, transform and load in an endless polling loop.

    Each round scrapes the pages after the last stored one, transforms them
    and appends them to the database. When nothing could be scraped, or a
    step fails, the loop waits 12 seconds in total and tries again.

    Args:
        None.
    Returns:
        Never returns normally. The loop has no exit condition and runs until
        it is interrupted, so the ``timed`` report is not printed either.
    """
    while True:
        try:
            df = extract()
            if df is not None:
                load(transform(df))
                continue
            # extract() returned None: no new page yet, fall through and wait.
        except Exception as error:
            # Only ordinary errors are retried; KeyboardInterrupt and
            # SystemExit propagate so the loop can be stopped.
            print('ETL round failed: ' + repr(error))
        print('wait 2s')
        sleep(2)
        print('wait 10s')
        sleep(10)

In [16]:
def converting_dictionary_to_dataframe(dictionary):
    """Build a DataFrame indexed by page number from the scraped records.

    Args:
        dictionary: Records accepted by ``pd.DataFrame.from_dict`` with
            ``orient='columns'``; they must provide a "Page Number" field.
    Returns:
        DataFrame whose index is the "Page Number" column.
    """
    return (
        pd.DataFrame
        .from_dict(dictionary, orient='columns')
        .set_index("Page Number")
    )

In [17]:
def extract_data_and_create_soup(page_number, timeout=30):
    """Download one lot page and parse it.

    Args:
        page_number: Lot page number appended to the site URL.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection blocks forever.
    Returns:
        bs4.BeautifulSoup object built from the response body with 'lxml'.
    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    url = "https://bids-history.com/lot/" + str(page_number) + "/"
    # NOTE(review): verify=False turns off TLS certificate verification;
    # kept as in the original, but confirm it is still required.
    res = requests.get(url, verify=False, allow_redirects=False, timeout=timeout)
    # The status code is not inspected here; callers rely on an IndexError
    # from the later field lookups to detect a page without lot data.
    return bs4.BeautifulSoup(res.text, 'lxml')

In [18]:
def dictionary_with_data_and_features(soup, page_number):
    """Turn the alternating ``.col-6`` cells of a page into a dictionary.

    Args:
        soup: Parsed page; only ``soup.select(".col-6")`` and the ``text`` of
            each returned element are used.
        page_number: Stored under the "Page Number" key.
    Returns:
        Dictionary mapping every even-positioned cell text (feature name) to
        the following cell text (value), plus "Page Number". A trailing name
        without a value is dropped.
    """
    # Newlines are stripped from every cell text before pairing.
    cell_texts = [cell.text.replace("\n", "") for cell in soup.select(".col-6")]
    feature_names = ["Page Number"] + cell_texts[0::2]
    feature_values = [page_number] + cell_texts[1::2]
    return dict(zip(feature_names, feature_values))

In [19]:
def convert_to_num(column_name, df):
    """Split a monetary text column into an integer amount and a currency.

    Args:
        column_name: Name of the column holding values such as
            ``"$4,200 USD"``.
        df: DataFrame to update in place.
    Returns:
        None. ``df[column_name]`` becomes an integer column and
        ``df['Currency']`` receives the text after the first whitespace.
    """
    # First piece is the amount, the remainder is the currency code.
    amount_and_currency = df[column_name].str.split(n=1, expand=True)
    # Assign the cleaned values back explicitly: calling replace(inplace=True)
    # on df[column_name] acts on a temporary object and does not update df
    # when pandas Copy-on-Write is active.
    df[column_name] = (
        amount_and_currency[0]
        .str.replace(r"[,$]", "", regex=True)  # drop separators and '$'
        .astype(int)
    )
    df['Currency'] = amount_and_currency[1]

In [20]:
 def odometer_to_km(df):
     """Convert the 'Odometer:' column from mileage text to whole kilometres.

     Args:
         df: DataFrame with an 'Odometer:' column holding values such as
             ``"78,673 mi"``. It is updated in place.
     Returns:
         None. The column becomes an integer column of kilometres,
         truncated toward zero.
     """
     # Assign back explicitly: replace(inplace=True) on df['Odometer:'] acts
     # on a temporary object and does not update df under Copy-on-Write.
     miles = (
         df['Odometer:']
         .str.replace(r",|mi", "", regex=True)  # thousands separator and unit
         .str.strip()
         .astype(int)
     )
     df['Odometer:'] = (miles * 1.60934).astype(int)  # miles to KM

In [21]:
def convert_date_str_to_date(df):
    """Convert the 'Auction Date:' text column to datetimes.

    Args:
        df: DataFrame with an 'Auction Date:' column holding values such as
            ``"July 11, 2018, 8 p.m."``. It is updated in place.
    Returns:
        None.
    Raises:
        ValueError: If a value matches none of the supported layouts.
    """
    dates = df['Auction Date:']

    # Bring spelled-out months to their three-letter form. regex=False makes
    # the '.' in 'Sept.' a literal dot on every pandas version.
    for spelled_out, abbreviation in (('March', 'Mar'),
                                      ('April', 'Apr'),
                                      ('June', 'Jun'),
                                      ('July', 'Jul'),
                                      ('Sept.', 'Sep')):
        dates = dates.str.replace(spelled_out, abbreviation, regex=False)

    # Remove commas and dots; as a regex, '.' would match every character.
    for punctuation in (',', '.'):
        dates = dates.str.replace(punctuation, '', regex=False)

    dates = dates.str.upper()
    df['Auction Date:'] = [_parse_auction_date(text) for text in dates]


def _parse_auction_date(text):
    """Parse one normalised date string such as ``'JUL 11 2018 8 PM'``."""
    # Presumably the site can also emit 'noon' / 'midnight' instead of an
    # hour; map them to explicit hours. TODO confirm against real pages.
    text = text.replace('NOON', '12 PM').replace('MIDNIGHT', '12 AM')
    # %I (12-hour clock) is required for %p to take effect; with %H the
    # AM/PM marker was parsed but ignored, so "8 PM" became 08:00.
    for layout in ('%b %d %Y %I %p', '%b %d %Y %I:%M %p'):
        try:
            return datetime.datetime.strptime(text, layout)
        except ValueError:
            continue
    raise ValueError('Unrecognised auction date: ' + repr(text))

In [22]:
def engine_type(car_dict):
    """Reduce 'Engine Type:' to the engine displacement as a float.

    Args:
        car_dict: Mapping or DataFrame with an 'Engine Type:' entry holding
            text such as ``"2.4L 4"``. It is updated in place.
    Returns:
        None. Each value is cut to its first four characters, stripped,
        freed of the letter 'L' and cast to float. Values whose prefix is a
        single character or empty become missing.
    """
    def _displacement_prefix(raw):
        # Keep the leading four characters; a prefix of one character or
        # less is treated as missing.
        prefix = raw[:4]
        return prefix if len(prefix) > 1 else None

    displacement = (
        pd.Series(car_dict['Engine Type:'])
        .map(_displacement_prefix)
        .str.strip()
        .str.replace('L', '')
        .astype(float)
    )
    car_dict['Engine Type:'] = displacement

In [23]:
def cilinders(car_dict):
    """Cast every 'Cylinders:' value to int, updating ``car_dict`` in place."""
    cylinder_counts = pd.Series(car_dict['Cylinders:'])
    car_dict['Cylinders:'] = cylinder_counts.map(int)

In [24]:
def lot_number(car_dict, soup):
    """Store the integer text of the second ``td a`` element as 'Lot number:'.

    Raises IndexError when the page has fewer than two such elements.
    """
    second_link = soup.select("td a")[1]
    car_dict['Lot number:'] = int(second_link.text)

In [25]:
def web_adress(car_dict):
    """Store the lot URL built from 'Page Number' under 'Web adress'."""
    page = car_dict['Page Number']
    car_dict['Web adress'] = f"https://bids-history.com/lot/{page!s}"

In [26]:
def car_year(car_dict, soup):
    """Store the first four characters of the last ``ol li`` text as the year.

    The value is kept as text. Raises IndexError when the page has no
    ``ol li`` element.
    """
    last_item = soup.select("ol li")[-1]
    car_dict['Production year'] = last_item.text[:4]

In [27]:
def car_model(car_dict, soup):
    """Store the last ``ol li`` text from its sixth character on as the model.

    Raises IndexError when the page has no ``ol li`` element.
    """
    last_item = soup.select("ol li")[-1]
    car_dict['Car Model'] = last_item.text[5:]

In [28]:
# TODO: the parameter is still named car_dict although transform() passes the DataFrame.
def location(car_dict):
    """Store the first two characters of each 'Doc Type:' value as 'Location'."""
    doc_types = pd.Series(car_dict['Doc Type:'])
    car_dict['Location'] = doc_types.map(lambda text: text[:2])

In [29]:
# TODO: the parameter is still named car_dict although transform() passes the DataFrame.
def doc_type(car_dict):
    """Drop the first five characters of every 'Doc Type:' value in place."""
    doc_types = pd.Series(car_dict['Doc Type:'])
    car_dict['Doc Type:'] = doc_types.map(lambda text: text[5:])

In [30]:
def production_year(car_dict):
    """Cast the 'Production year' column to int, updating ``car_dict`` in place."""
    year_column = car_dict['Production year']
    car_dict['Production year'] = year_column.astype(int)

In [31]:
def rename_colums(df):
    """Rename the index and the scraped columns to their database names.

    Args:
        df: DataFrame to rename in place. Columns that are not listed in the
            mapping keep their name.
    Returns:
        None.
    """
    database_names = {
        'Final bid:': "final_bid",
        'Doc Type:': "doc_type",
        'Odometer:': "odometer",
        'Highlights:': "highlights",
        'Primary Damage:': "primary_damage",
        'Secondary Damage:': "secondary_damage",
        'Estimated Repair Cost:': "estimated_repair_cost",
        'Estimated Retail Value:': "estimated_retail_value",
        'VIN:': "vin",
        'Auction Date:': "auction_date",
        'Body Style:': "body_style",
        'Engine Type:': "engine_type",
        'Cylinders:': "cylinders",
        'Transmission:': "transmission",
        'Drive:': "drive",
        'Fuel:': "fuel",
        'Lot number:': "lot_number",
        'Web adress': "web_adress",
        'Production year': "production_year",
        'Car Model': "car_model",
        "Currency": "currency",
        'Location': "location",
    }
    # The index carries the page number and gets its database name too.
    df.index.rename("page_number", inplace=True)
    df.rename(columns=database_names, inplace=True)

In [32]:
# Load to db

def load_to_sql_db(df, connection_uri=None):
    """Append the rows of ``df`` to the ``cars`` table.

    Args:
        df: DataFrame to store; it is written with ``DataFrame.to_sql``.
        connection_uri: Optional connection URL. When omitted, the
            ``COPART_DB_URL`` environment variable is used, falling back to
            the notebook's original local connection string.
    Returns:
        None. Prints whether the rows were stored or rejected by an
        integrity constraint.
    """
    if connection_uri is None:
        # SECURITY NOTE(review): the fallback literal embeds a password. It is
        # kept only so existing runs keep working; set COPART_DB_URL and
        # remove the literal before sharing this notebook.
        connection_uri = os.environ.get(
            "COPART_DB_URL",
            "postgresql://postgres:Congitos211!!!@localhost:5432/copart",
        )

    # Create the engine before the try block so the cleanup below never
    # refers to a name that was not bound.
    db_engine_copart = sa.create_engine(connection_uri)
    try:
        df.to_sql("cars", con=db_engine_copart, if_exists="append")
        print("Records stored")
    except exc.IntegrityError:
        # Deliberate best effort: rows that violate a constraint are reported
        # and skipped instead of aborting the run.
        print("Exception. Records already stored")
    finally:
        db_engine_copart.dispose()

In [33]:
def read_from_sql_db(connection_uri=None):
    """Read the whole ``cars`` table into a DataFrame.

    Args:
        connection_uri: Optional connection URL. When omitted, the
            ``COPART_DB_URL`` environment variable is used, falling back to
            the notebook's original local connection string.
    Returns:
        DataFrame with every row of ``cars``. The query result used to be
        discarded, so callers that ignore the return value are unaffected.
    """
    if connection_uri is None:
        # SECURITY NOTE(review): the fallback literal embeds a password. It is
        # kept only so existing runs keep working; set COPART_DB_URL and
        # remove the literal before sharing this notebook.
        connection_uri = os.environ.get(
            "COPART_DB_URL",
            "postgresql://postgres:Congitos211!!!@localhost:5432/copart",
        )

    db_engine_copart = sa.create_engine(connection_uri)
    try:
        cars = pd.read_sql("SELECT * FROM cars", db_engine_copart)
    finally:
        # The engine was never disposed before; release it once read.
        db_engine_copart.dispose()
    print("Records loaded")
    return cars

In [34]:
# Scrape the pages that follow the last one stored in the database.
# extract() returns None when nothing could be scraped.
df = extract() 

Number of records stored 12
Current page_number 70374


In [38]:
# Work on a copy: transform() drops rows and rewrites columns in place.
df_test = df.copy()

In [39]:
# Show the raw scraped frame before any transformation.
df_test

Unnamed: 0_level_0,Doc Type:,Odometer:,Highlights:,Primary Damage:,Secondary Damage:,Estimated Repair Cost:,Estimated Retail Value:,VIN:,Auction Date:,Body Style:,Engine Type:,Cylinders:,Transmission:,Drive:,Fuel:,Lot number:,Web adress,Production year,Car Model,Final bid:
Page Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
70362,CA - SALVAGE CERTIFICATE,"78,673 mi",Enhanced Vehicles,FRONT END,,$0 USD,"$12,052 USD",JTLZE4FE7EJ******,"July 11, 2018, 8 p.m.",4DR SPOR,2.4L 4,4,AUTOMATIC,Front-wheel Drive,GAS,51084967,https://bids-history.com/lot/70362,2014,TOYOTA SCION XB,
70363,TX - LIEN PAPERS,"166,996 mi",Runs And Drives,HAIL,MINOR DENT/SCRATCHES,$0 USD,$0 USD,1N4AL11D25C******,"Sept. 14, 2018, 6 p.m.",SEDAN 4D,2.5L 4,4,,Front-wheel Drive,GAS,51083547,https://bids-history.com/lot/70363,2005,NISSAN ALTIMA S,
70364,CA - SALVAGE CERTIFICATE,"75,644 mi",Engine Start Program,FRONT END,,$0 USD,$0 USD,3N1CN7AP1FL******,"July 31, 2018, 8 p.m.",SEDAN 4D,1.6L 4,4,AUTOMATIC,Front-wheel Drive,GAS,51082337,https://bids-history.com/lot/70364,2015,NISSAN VERSA S,
70365,AL - CERT OF TITLE-SALVAGE TITLE,"16,806 mi",Engine Start Program,FRONT END,SIDE,$0 USD,"$10,316 USD",1C4NJPBA4HD******,"July 26, 2018, 6 p.m.",4DR SPOR,2.0L 4,4,AUTOMATIC,Front-wheel Drive,GAS,51081747,https://bids-history.com/lot/70365,2017,JEEP PATRIOT SPORT,"$4,200 USD"
70366,LA - CERT OF TITLE-SALVAGE,"140,894 mi",Engine Start Program,ALL OVER,,$0 USD,"$6,791 USD",1D7HA18P47S******,"Aug. 2, 2018, 6 p.m.",CREW PIC,4.7L 8,8,AUTOMATIC,Rear-wheel drive,FLEXIBLE FUEL,51079947,https://bids-history.com/lot/70366,2007,DODGE RAM 1500 ST,$0 USD
70367,VA - CERT OF TITLE - SALVAGE,"41,827 mi",Runs And Drives,REAR END,SIDE,$0 USD,"$6,648 USD",3N1CN7AP9EL******,"July 17, 2018, 3 p.m.",SEDAN 4D,1.6L 4,4,AUTOMATIC,Front-wheel Drive,GAS,51079187,https://bids-history.com/lot/70367,2014,NISSAN VERSA S,"$1,100 USD"
70368,NC - SALVAGE CERTIFICATE OF TITLE,"83,406 mi",Runs And Drives,FRONT END,MINOR DENT/SCRATCHES,$0 USD,"$10,024 USD",KNDJT2A62D7******,"July 17, 2018, 3 p.m.",4DR SPOR,2.0L 4,4,AUTOMATIC,Front-wheel Drive,GAS,51077817,https://bids-history.com/lot/70368,2013,KIA SOUL +,"$2,150 USD"
70369,WV - NON-REPAIRABLE CERTIFICATE,0 mi,Enhanced Vehicles,FRONT END,SIDE,$0 USD,"$10,264 USD",5NPET4ACXAH******,"July 26, 2018, 3 p.m.",SEDAN 4D,2.4L 4,4,AUTOMATIC,Front-wheel Drive,GAS,51075947,https://bids-history.com/lot/70369,2010,HYUNDAI SONATA GLS,$500 USD
70370,CA - SALVAGE CERTIFICATE,0 mi,Enhanced Vehicles,SIDE,,$0 USD,"$28,426 USD",JF1VA1F66G9******,"July 6, 2018, 8 p.m.",SEDAN 4D,2.0L 4,4,AUTOMATIC,All wheel drive,GAS,51075057,https://bids-history.com/lot/70370,2016,SUBARU WRX PREMIUM,"$1,550 USD"
70371,CO - SALVAGE TITLE,"153,642 mi",Enhanced Vehicles,REAR END,MINOR DENT/SCRATCHES,$0 USD,"$3,733 USD",2CNBE634846******,"July 19, 2018, 7 p.m.",4DR SPOR,2.5L 6,6,AUTOMATIC,Rear-wheel drive,GAS,51074037,https://bids-history.com/lot/70371,2004,CHEVROLET TRACKER LT,


In [37]:
# transform() edits its argument in place and returns that same object, so
# df_trans and df_test refer to one frame after this line.
df_trans = transform(df_test)

In [None]:
# Inspect the frame after the transform step.
df_trans

In [None]:
# Rename the index and columns to the database names; this happens in place.
rename_colums(df_trans)

In [None]:
# Inspect the frame again after the in-place rename.
df_trans

In [None]:
# Append the renamed frame to the "cars" table.
load_to_sql_db(df_trans)

In [None]:
# Start the full pipeline. etl() loops inside `while True` without an exit
# condition, so this cell keeps running until the kernel is stopped.
etl()