In [4]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
np.random.seed(0)

import os
import wget
from pathlib import Path

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Name of the label column; it is the last entry of feature_columns below.
target = "TARGET"

# Columns the notebook groups as boolean:
# Wilderness_Area1..Wilderness_Area4 followed by Soil_Type1..Soil_Type40.
bool_columns = [f"Wilderness_Area{idx}" for idx in range(1, 5)]
bool_columns += [f"Soil_Type{idx}" for idx in range(1, 41)]

# Columns the notebook groups as integer-valued.
int_columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Full ordered column list: integer columns, boolean columns, then the label.
feature_columns = [*int_columns, *bool_columns, target]

In [3]:
# Read the raw table. The file is parsed without a header row (the first line
# is treated as data) and the columns are labelled from feature_columns.
data = pd.read_csv("covtype.data", header=None, names=feature_columns)

In [7]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,TARGET
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [11]:
# Shift every label down by one (a label of 1 becomes 0).
# Not idempotent: running this cell again shifts the labels a second time.
data["TARGET"] = data["TARGET"].sub(1)

In [13]:
# Fractions of the full table assigned to the train / validation / test splits.
train_ratio, validation_ratio, test_ratio = 0.70, 0.15, 0.15

In [14]:
# Two-stage split: first separate the training rows from the remainder, then
# divide that remainder between validation and test. Both stages shuffle with
# the same fixed random_state.
data_train, data_holdout = train_test_split(
    data,
    test_size=1 - train_ratio,
    random_state=42,
    shuffle=True,
)
data_val, data_test = train_test_split(
    data_holdout,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=42,
    shuffle=True,
)

# Write each partition to disk without the index column.
for split_path, split_frame in (
    ("train.csv", data_train),
    ("val.csv", data_val),
    ("test.csv", data_test),
):
    split_frame.to_csv(split_path, index=False)

In [5]:
# Fractions of the full table used for the differently sized sample exports.
small, medium, high, extra = 0.05, 0.2, 0.5, 1.0

In [8]:
# Draw a random subset holding the `small` fraction of the rows and export it.
# NOTE(review): no random_state is passed to sample(), and the CSV keeps the
# index column (the train/val/test export uses index=False) - confirm both
# are intended.
new_data = data.sample(n=int(len(data) * small))
new_data.to_csv("data_small.csv")

In [9]:
# Draw a random subset holding the `medium` fraction of the rows and export it.
# NOTE(review): no random_state is passed to sample(), and the CSV keeps the
# index column (the train/val/test export uses index=False) - confirm both
# are intended.
new_data = data.sample(n=int(len(data) * medium))
new_data.to_csv("data_medium.csv")

In [10]:
# Draw a random subset holding the `high` fraction of the rows and export it.
# NOTE(review): no random_state is passed to sample(), and the CSV keeps the
# index column (the train/val/test export uses index=False) - confirm both
# are intended.
new_data = data.sample(n=int(len(data) * high))
new_data.to_csv("data_high.csv")

In [11]:
# Sample the `extra` fraction of the rows and export it. With extra = 1.0 this
# is every row of the table in a random order.
# NOTE(review): no random_state is passed to sample(), and the CSV keeps the
# index column (the train/val/test export uses index=False) - confirm both
# are intended.
new_data = data.sample(n=int(len(data) * extra))
new_data.to_csv("data_extra.csv")

In [22]:
# Preview the first rows of the frame again.
data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,TARGET
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [12]:
# Move the label out of the positional column 54 into a column named "TARGET".
# Column 54 only exists when the frame was read without column names. A frame
# read with names=feature_columns already has "TARGET" and no column 54, and
# the unguarded lookup raised KeyError there. The guard keeps the cell safe to
# run from a fresh kernel and to re-run.
if 54 in data.columns:
    data["TARGET"] = data[54]
    data = data.drop([54], axis=1)

In [72]:
# NOTE(review): none of the names below appear in the feature_columns schema
# defined earlier in this notebook; these cells presumably operate on a
# different table loaded into `data` elsewhere - confirm.

# Columns that get label-encoded to integer codes further down.
categorical_columns = [
    "Auction",
    "Make",
    "Model",
    "Trim",
    "SubModel",
    "Color",
    "Transmission",
    "WheelTypeID",
    "Nationality",
    "Size",
    "TopThreeAmericanName",
    "BYRNO",
    "VNZIP1",
    "VNST",
]

# Columns removed from the frame before any further processing.
useless_columns = [
    "WheelType",
    "PRIMEUNIT",
    "AUCGUART",
    "RefId",
]

In [73]:
# Remove the columns listed in useless_columns; raises KeyError if any is absent.
data = data.drop(columns=useless_columns)

In [74]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")

In [75]:
# Replace the label values with integer codes from a freshly fitted encoder.
target_encoder = LabelEncoder()
data["TARGET"] = target_encoder.fit_transform(data["TARGET"])

In [76]:
# Label-encode each categorical column independently: every column gets its
# own encoder, so integer codes are not comparable across columns.
features = list(categorical_columns)
for feat in features:
    encoder = LabelEncoder()
    data[feat] = encoder.fit_transform(data[feat])

In [77]:
# These two imports are no longer used by this cell; they are kept because
# other cells may rely on the names being in the namespace.
import time
import datetime

# Convert "MM/DD/YYYY" strings to whole seconds since 1970-01-01.
# The previous time.mktime() conversion interpreted each date in the machine's
# local timezone. Across a daylight-saving change, two midnights are then not
# a multiple of 86400 seconds apart, so a later floor division into days could
# be off by one and the result depended on where the notebook ran. Parsing as
# naive dates and subtracting a fixed epoch is timezone-independent.
purch_dates = pd.to_datetime(data["PurchDate"], format="%m/%d/%Y")
data["PurchDate"] = (purch_dates - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

In [78]:
# Re-express the timestamp as whole days elapsed since the earliest purchase:
# subtract the smallest value, then floor-divide by the seconds in a day.
# Not idempotent: re-running divides the already converted values again.
earliest_purchase = min(data["PurchDate"])
data["PurchDate"] = (data["PurchDate"] - earliest_purchase) // (60 * 60 * 24)

In [79]:
# Preview the first rows after encoding and date conversion.
data.head()

Unnamed: 0,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,Color,Transmission,...,MMRCurrentAuctionCleanPrice,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost,TARGET
0,336,0,2006,3,16,533,131,213,12,0,...,8552.0,11597.0,12409.0,56,47,5,7100.0,0,1113,0
1,336,0,2004,5,5,0,92,730,14,0,...,9222.0,11374.0,12791.0,46,47,5,7600.0,0,1053,0
2,336,0,2005,4,5,795,97,284,7,0,...,5557.0,7146.0,8702.0,46,47,5,4900.0,0,1389,0
3,336,0,2004,5,5,591,97,147,13,0,...,2646.0,4375.0,5518.0,46,47,5,4100.0,0,630,0
4,336,0,2005,4,6,338,125,52,13,1,...,4384.0,6739.0,7911.0,46,47,5,4000.0,0,1020,0


In [80]:
# Write the processed frame to disk without the index column.
# NOTE(review): "train.csv" is the same path the train/val/test split cell
# writes its training partition to, so whichever cell runs last wins - confirm
# the overwrite is intended or use a distinct file name.
data.to_csv("train.csv", index=False)