In [6]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
import utils.data_preprocessing as dp

# データの読み込み

In [7]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [8]:
train

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price
0,0,nashville,1949,bmw,excellent,6 cylinders,gas,115148,clean,manual,rwd,mid-size,convertible,orange,,27587
1,1,state college,2013,toyota,fair,8 cylinders,gas,172038,clean,automatic,rwd,full-size,sedan,silver,pa,4724
2,2,wichita,1998,ford,good,6 cylinders,gas,152492,clean,automatic,fwd,full-size,SUV,silver,ks,10931
3,3,albany,2014,ford,excellent,4 cylinders,gas,104118,clean,manual,fwd,mid-size,SUV,blue,ny,16553
4,4,redding,2005,ford,excellent,6 cylinders,gas,144554,clean,manual,fwd,mid-size,sedan,red,ca,5158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27527,27527,williamsport,2008,ford,good,6 cylinders,gas,26660,clean,automatic,rwd,compact,truck,black,pa,32212
27528,27528,tulsa,2007,ford,excellent,8 cylinders,gas,108072,clean,automatic,rwd,full-size,pickup,black,,5400
27529,27529,rochester,2019,jeep,like new,6 cylinders,gas,139908,clean,automatic,4wd,mid-size,SUV,white,ny,22227
27530,27530,rochester,2007,jeep,excellent,6 cylinders,gas,112326,clean,automatic,4wd,mid-size,sedan,white,ny,3054


In [9]:
train["manufacturer"].value_counts()

ford             6166
chevrolet        3339
bmw              2736
toyota           1570
honda            1526
                 ... 
SΑTURN              1
ＤＯＤＧＥ               1
ｉｎｆｉｎｉｔｉ            1
ｍｅｒｃｅｄｅｓ－ｂｅｎｚ       1
ᴄhrysler            1
Name: manufacturer, Length: 125, dtype: int64

# 定数

In [10]:
area_mapping = {
    # trainデータ
    'SF bay area': 'California',
    'ashtabula': 'Ohio',
    'brainerd': 'Minnesota',
    'brownsville': 'Texas',
    'columbia': 'South Carolina',
    'columbia / jeff city': 'Missouri',
    'daytona beach': 'Florida',
    'dubuque': 'Iowa',
    'el paso': 'Texas',
    'flagstaff / sedona': 'Arizona',
    'florence': 'South Carolina',
    'florida keys': 'Florida',
    'galveston': 'Texas',
    'grand forks': 'North Dakota',
    'grand rapids': 'Michigan',
    'great falls': 'Montana',
    'heartland florida': 'Florida',
    'imperial county': 'California',
    'joplin': 'Missouri',
    'kalispell': 'Montana',
    'lakeland': 'Florida',
    'las vegas': 'Nevada',
    'lawton': 'Oklahoma',
    'long island': 'New York',
    'manhattan': 'New York',
    'merced': 'California',
    'minneapolis / st paul': 'Minnesota',
    'morgantown': 'West Virginia',
    'moses lake': 'Washington',
    'nashville': 'Tennessee',
    'northeast SD': 'South Dakota',
    'northwest KS': 'Kansas',
    'panama city': 'Florida',
    'pittsburgh': 'Pennsylvania',
    'poconos': 'Pennsylvania',
    'pullman / moscow': 'Washington',
    'raleigh / durham / CH': 'North Carolina',
    'rockford': 'Illinois',
    'salem': 'Oregon',
    'san antonio': 'Texas',
    'san diego': 'California',
    'savannah / hinesville': 'Georgia',
    'southern WV': 'West Virginia',
    'southwest VA': 'Virginia',
    'spokane / coeur d\'alene': 'Washington',
    'tallahassee': 'Florida',
    'tucson': 'Arizona',
    'utica-rome-oneida': 'New York',
    'valdosta': 'Georgia',
    'vermont': 'Vermont',
    'waterloo / cedar falls': 'Iowa',
    'watertown': 'New York',
    'western KY': 'Kentucky',
    'yuba-sutter': 'California',
    'yuma': 'Arizona',
    #  testデータ
    'birmingham': 'Alabama',
    'central michigan': 'Michigan',
    'charleston': 'South Carolina',
    'cleveland': 'Ohio',
    'east oregon': 'Oregon',
    'eastern NC': 'North Carolina',
    'glens falls': 'New York',
    'hanford-corcoran': 'California',
    'huntsville / decatur': 'Alabama',
    'jersey shore': 'New Jersey',
    'lafayette': 'Louisiana',
    'muskegon': 'Michigan',
    'ocala': 'Florida',
    'prescott': 'Arizona',
    'rochester': 'New York',
    'siskiyou county': 'California',
    'texarkana': 'Texas',
    'waco': 'Texas',
    'western slope': 'Colorado',
    'wyoming': 'Wyoming'
} 

# 前処理

In [11]:
# manufacturer
train = dp.preprocess_manufacturer(train)

In [12]:
# year
train = dp.fix_year_column(train)

In [13]:
#odometer（負を正に転換）
train["odometer"] = (train["odometer"]**2)**0.5

In [14]:
# size
train = dp.normalize_size_column(train)

In [15]:
# state
dp.fill_missing_state(train, area_mapping)

In [16]:
# fuel, title_status, type（現状、最頻値で埋めている）
# TODO 最頻値の他に良い埋め方はないか
dp.fillna_with_mode(train, ['fuel', 'title_status', 'type'])

In [17]:
train.isnull().sum()

id              0
region          0
year            0
manufacturer    0
condition       0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
drive           0
size            0
type            0
paint_color     0
state           0
price           0
dtype: int64

In [18]:
train["odometer"].describe()

count    2.753200e+04
mean     1.160295e+05
std      6.449916e+04
min      1.000000e+00
25%      7.837100e+04
50%      1.114530e+05
75%      1.508562e+05
max      2.946000e+06
Name: odometer, dtype: float64

In [19]:
# priceの修正
train["price"]=train["price"].apply(lambda x:math.log10(x))
train

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price
0,0,nashville,1949,bmw,excellent,6 cylinders,gas,115148.0,clean,manual,rwd,mid-size,convertible,orange,Tennessee,4.440704
1,1,state college,2013,toyota,fair,8 cylinders,gas,172038.0,clean,automatic,rwd,full-size,sedan,silver,pa,3.674310
2,2,wichita,1998,ford,good,6 cylinders,gas,152492.0,clean,automatic,fwd,full-size,SUV,silver,ks,4.038660
3,3,albany,2014,ford,excellent,4 cylinders,gas,104118.0,clean,manual,fwd,mid-size,SUV,blue,ny,4.218877
4,4,redding,2005,ford,excellent,6 cylinders,gas,144554.0,clean,manual,fwd,mid-size,sedan,red,ca,3.712481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27527,27527,williamsport,2008,ford,good,6 cylinders,gas,26660.0,clean,automatic,rwd,compact,truck,black,pa,4.508018
27528,27528,tulsa,2007,ford,excellent,8 cylinders,gas,108072.0,clean,automatic,rwd,full-size,pickup,black,ok,3.732394
27529,27529,rochester,2019,jeep,like new,6 cylinders,gas,139908.0,clean,automatic,4wd,mid-size,SUV,white,ny,4.346881
27530,27530,rochester,2007,jeep,excellent,6 cylinders,gas,112326.0,clean,automatic,4wd,mid-size,sedan,white,ny,3.484869


# 特徴量の追加

# データの分割

In [20]:
train_set, test_set = train_test_split(train, test_size=0.2,random_state=0)

In [21]:
len(train_set)

22025

In [22]:
len(test_set)

5507

In [23]:
test_set

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price
13009,13009,eastern CT,2004,toyota,excellent,8 cylinders,gas,222927.0,clean,automatic,4wd,full-size,truck,black,ct,3.998739
3668,3668,lawrence,2015,bmw,like new,6 cylinders,gas,113035.0,clean,automatic,4wd,full-size,SUV,black,ks,4.279963
22256,22256,kennewick-pasco-richland,2012,fiat,like new,4 cylinders,gas,71596.0,clean,manual,fwd,compact,hatchback,black,wa,4.282554
23585,23585,rhode island,2008,chevrolet,excellent,8 cylinders,gas,114423.0,clean,automatic,4wd,full-size,truck,black,ri,4.586182
25012,25012,ft myers / SW florida,2016,bmw,excellent,6 cylinders,gas,59807.0,clean,manual,rwd,mid-size,sedan,black,fl,4.108869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14405,14405,gainesville,2012,ford,like new,4 cylinders,gas,108339.0,clean,automatic,fwd,mid-size,sedan,silver,fl,4.007961
312,312,boulder,2010,jeep,excellent,6 cylinders,gas,67562.0,clean,automatic,4wd,full-size,SUV,black,co,4.726849
25930,25930,tampa bay area,2002,toyota,like new,6 cylinders,gas,118638.0,clean,automatic,rwd,full-size,SUV,grey,fl,4.298329
9825,9825,rochester,2019,ford,excellent,8 cylinders,gas,26815.0,clean,automatic,rwd,full-size,van,white,ny,4.244005
