In [15]:
import pandas as pd
import numpy as np

In [16]:
# Read the raw auction export; `bat_data` is kept as the unmodified original.
path = r"./src/vehicle_data/BAT_auction_data.csv"
bat_data = pd.read_csv(path)

# Work on an independent copy without the URL and Mileage Notes columns.
data = bat_data.copy().drop(columns=["URL", "Mileage Notes"])

## Parse the comma-separated Details column into separate features

In [17]:
identifiers = ["transmission", "paint", "carfax"]
colors = ["white", "black", "gray", "silver", "blue", "red", "brown", "green", "orange", "beige", "purple", "gold", "yellow"]


def _first_matching_details(details):
    """Pick the first transmission, paint and carfax entry from a Details string.

    The string is split on commas and every piece is stripped and lower-cased.
    A piece counts as
      * transmission if it contains "transmission",
      * paint if it contains "paint" or any name listed in `colors`,
      * carfax if it contains "carfax".
    One piece may satisfy more than one category.

    Parameters
    ----------
    details : str or missing value
        Raw value of the Details column for one listing.

    Returns
    -------
    list
        ``[transmission, paint, carfax]`` holding the first matching piece for
        each category, or None where nothing matched. A non-string input
        (e.g. NaN for a missing cell) yields ``[None, None, None]`` instead of
        failing on ``.split``.
    """
    if not isinstance(details, str):
        return [None, None, None]

    first_transmission = first_paint = first_carfax = None
    for raw_detail in details.split(","):
        detail = raw_detail.strip().lower()
        # Only the first hit per category is kept, so skip categories already filled.
        if first_transmission is None and identifiers[0] in detail:
            first_transmission = detail
        if first_paint is None and (
            identifiers[1] in detail or any(color in detail for color in colors)
        ):
            first_paint = detail
        if first_carfax is None and identifiers[2] in detail:
            first_carfax = detail
    return [first_transmission, first_paint, first_carfax]


# Build the frame once from all rows rather than appending inside the loop:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.x. dtype=object keeps missing entries as None, which the
# truthiness checks in the following cells depend on.
details_df = pd.DataFrame(
    [_first_matching_details(details) for details in data["Details"]],
    columns=identifiers,
    dtype=object,
)
details_df.head()

  details_df = details_df.append(pd.Series(detail_lst), ignore_index=True)


Unnamed: 0,transmission,paint,carfax
0,two-speed transmission w/centrifugal clutch,white fiberglass body,
1,two-speed transmission,white fiberglass body,
2,two-speed transmission w/centrifugal clutch,white fiberglass body,
3,two-speed transmission,white fiberglass body,
4,two-speed transmission w/ centrifugal clutch,white fiberglass body,


### Dealing with transmissions

In [18]:
# Manual vs. anything else, encoded as 1 / 0.
# A listing is flagged 1 only when a transmission detail exists and contains
# the word 'manual'; a missing (None) or empty detail is flagged 0.
manual_lst = [
    1 if (trans and 'manual' in trans) else 0
    for trans in details_df.transmission
]

len(manual_lst)

72150

### Dealing with paint

In [19]:
# The paint detail may mention the upholstery color as well, so only the
# color name that occurs earliest in the text is kept.

def _earliest_color(paint_detail):
    """Return the entry of `colors` that occurs first in `paint_detail`.

    Returns None when `paint_detail` is falsy (None or an empty string) or
    when it contains none of the names in `colors`.
    """
    if not paint_detail:
        return None
    present = [color for color in colors if color in paint_detail]
    if not present:
        return None
    # min() returns the first minimum, so equal positions resolve in `colors` order.
    return min(present, key=paint_detail.find)


color_str_lst = [_earliest_color(paint_detail) for paint_detail in details_df.paint]

len(color_str_lst)

72150

### Dealing with carfax

In [20]:
# 1 when a carfax detail was found for the listing, 0 when the entry is
# missing (None) or empty.
carfax_lst = [1 if carfax_detail else 0 for carfax_detail in details_df.carfax]

len(carfax_lst)

72150

### Adding transmission, paint, and carfax to data dataframe

In [21]:
columns = ['manual_trans_bool', 'paint_str', 'carfax_bool']
data_lst = [manual_lst, color_str_lst, carfax_lst]

# Attach each derived feature list to `data` under its matching column name.
for column, values in zip(columns, data_lst):
    data[column] = values

len(data)

72150

## Encode Sale Status to binary

In [22]:
# Print the labels as loaded, then recode them: 'Sold' -> 1, 'Not Sold' -> 0.
# Any other value is left unchanged by replace().
print(data["Sale_Status"].unique())
data["Sale_Status"] = data["Sale_Status"].replace({'Sold': 1, 'Not Sold': 0})
data["Sale_Status"].unique()

array([1, 0])

## Start Training

In [23]:
# Remove the raw Details column from the working frame.
data = data.drop(columns=["Details"])

In [24]:
# Preview the working frame. Only the last expression of a cell is displayed,
# so the former bare `data.columns` line produced no output and was dropped;
# the column names are visible in the header of the preview table.
data.head()

Unnamed: 0,Auction Date,Make,Model,Year,Kilometers,Miles,Sale_Status,Final Bid Price,VIN,manual_trans_bool,paint_str,carfax_bool
0,3/16/22,ACOMA,Era1970s,1975,1200.0,745.0,1,10100.0,7202,0,white,0
1,12/10/20,ACOMA,Era1970s,1975,1200.0,745.0,1,8000.0,7202ECON,0,white,0
2,3/16/22,ACOMA,Era1970s,1975,1200.0,745.0,1,10100.0,7202,0,white,0
3,12/10/20,ACOMA,Era1970s,1975,1200.0,745.0,1,8000.0,7202ECON,0,white,0
4,7/29/20,Era1970s,OriginFrench,1975,1200.0,745.0,1,9100.0,7202ECON,0,white,0


In [25]:
# Write the working frame to disk with a header row and without the index.
data.to_csv(
    "./src/vehicle_data/bat_intermediate_data.csv",
    index=False,
    header=True,
)

In [10]:
# target_feature = "Final Bid Price"
# ml_pipeline(data, target_feature)