In [24]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import re

import pandas as pd
import numpy as np
np.random.seed(0)

import os
import wget
from pathlib import Path

from matplotlib import pyplot as plt
%matplotlib inline

In [31]:
# Load the source dataset from the CSV file in the current working directory.
data = pd.read_csv(filepath_or_buffer="data.csv")

In [32]:
# Show the first five rows as a quick sanity check of the loaded frame.
data.head(n=5)

Unnamed: 0,RefId,IsBadBuy,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,...,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost
0,1,0,12/7/2009,ADESA,2006,3,MAZDA,MAZDA3,i,4D SEDAN I,...,11597.0,12409.0,,,21973,33619,FL,7100.0,0,1113
1,2,0,12/7/2009,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,...,11374.0,12791.0,,,19638,33619,FL,7600.0,0,1053
2,3,0,12/7/2009,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,...,7146.0,8702.0,,,19638,33619,FL,4900.0,0,1389
3,4,0,12/7/2009,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,...,4375.0,5518.0,,,19638,33619,FL,4100.0,0,630
4,5,0,12/7/2009,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,...,6739.0,7911.0,,,19638,33619,FL,4000.0,0,1020


In [33]:
# Derive calendar features from the purchase date.
# The date strings are parsed once and the resulting DatetimeIndex is reused;
# re-parsing the whole column for every derived feature is redundant work.
purchase_dates = pd.DatetimeIndex(data['PurchDate'])
data["PurchYear"] = purchase_dates.year
data["PurchMonth"] = purchase_dates.month
data["PurchDay"] = purchase_dates.day
# weekday: Monday == 0 ... Sunday == 6 (pandas convention)
data["PurchWeekday"] = purchase_dates.weekday

In [34]:
# Set the label column aside before it is removed from the feature frame.
target = data.loc[:, "IsBadBuy"]

In [35]:
# Remove the row identifier, the label, and the unparsed purchase date
# from the feature frame (mutates `data` in place).
data.drop(columns=["RefId", "IsBadBuy", "PurchDate"], inplace=True)

In [36]:
# Positions (into data.columns) of the columns handled as categorical.
# NOTE(review): these are positional, so they silently shift if the column
# order of the input CSV changes -- verify against data.columns when in doubt.
categorical_features = {
    0, 1,
    *range(3, 11),    # 3 .. 10
    12, 13, 14,
    *range(23, 28),   # 23 .. 27
    29,
    *range(31, 35),   # 31 .. 34
}

In [37]:
# Matches every maximal run of characters outside A-Z, a-z and 0-9.
_NON_ALNUM_RUN = re.compile('[^A-Za-z0-9]+')


def clean_string(s):
    """Return ``str(s)`` with each run of non-alphanumeric characters replaced by one underscore.

    The argument is converted with ``str`` first, so non-string values
    (numbers, NaN, None) are accepted and cleaned via their string form.
    """
    text = str(s)
    return _NON_ALNUM_RUN.sub("_", text)

# Normalise the text of every categorical column, one column at a time.
for position in categorical_features:
    column_name = data.columns[position]
    cleaned = data[column_name].apply(clean_string)
    data[column_name] = cleaned

In [38]:
# Replace each categorical column with integer codes.
#
# The column is assigned by name rather than through `data.iloc[:, i] = ...`:
# recent pandas versions write iloc assignments into the existing column in
# place, which leaves the encoded values in an object-dtype column. Assigning
# by name replaces the column, so it takes the integer dtype of the codes.
# NOTE(review): assumes column names are unique, as the cleaning step that
# indexes `data[data.columns[i]]` already does.
#
# The fitted encoders are kept, keyed by column name, so the code -> category
# mapping (`encoder.classes_`) stays available for decoding or for encoding
# new data consistently.
label_encoders = {}
for i in categorical_features:
    column_name = data.columns[i]
    encoder = LabelEncoder()
    data[column_name] = encoder.fit_transform(data[column_name])
    label_encoders[column_name] = encoder

In [39]:
# Names of the non-categorical columns that contain at least one missing value.
columns_to_impute = [
    name
    for position, name in enumerate(data.columns)
    if position not in categorical_features and data[name].isnull().any()
]

In [40]:
# Zero-fill missing values in the selected columns and record where they were.
for column_name in columns_to_impute:
    missing_mask = data[column_name].isnull()
    # Indicator column: 1.0 where the original value was missing, 0.0 otherwise.
    data[column_name + "_imputed"] = missing_mask.astype(float)
    # Assign the filled column back instead of calling
    # `data[column_name].fillna(0, inplace=True)`: that form mutates a
    # temporary selection, which pandas warns about and which has no effect
    # on `data` under Copy-on-Write, leaving the NaNs in place.
    data[column_name] = data[column_name].fillna(0)

In [41]:
# Cast every column that is not in the categorical set to float.
non_categorical_columns = [
    name
    for position, name in enumerate(data.columns)
    if position not in categorical_features
]
for name in non_categorical_columns:
    data[name] = data[name].astype(float)

In [42]:
# Attach the previously extracted IsBadBuy labels to the processed frame
# under the column name TARGET.
data["TARGET"] = target

In [43]:
# Fractions of the full dataset assigned to the train / validation / test splits.
train_ratio, validation_ratio, test_ratio = 0.70, 0.15, 0.15

In [44]:
# Two-stage split: first carve off everything that is not training data,
# then divide that hold-out portion into validation and test sets.
holdout_fraction = 1 - train_ratio
train, holdout = train_test_split(
    data,
    test_size=holdout_fraction,
    random_state=42,
    shuffle=True,
)

# Share of the hold-out portion that goes to the test set.
test_share_of_holdout = test_ratio / (test_ratio + validation_ratio)
valid, test = train_test_split(
    holdout,
    test_size=test_share_of_holdout,
    random_state=42,
    shuffle=True,
)

In [45]:
# Write the full processed frame and each split to CSV, without the index column.
for frame, filename in (
    (data, "data_clean.csv"),
    (train, "train.csv"),
    (valid, "val.csv"),
    (test, "test.csv"),
):
    frame.to_csv(filename, index=False)