In [1]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
import utils.data_preprocessing as dp

# データの読み込み

In [None]:
# Load the train and test tables from the shared data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
train.head()

In [None]:
# Frequency of each raw manufacturer label (NaN excluded by default).
manufacturer_counts = train["manufacturer"].value_counts()
manufacturer_counts

# 定数

In [None]:
# Lookup table from a `region` label to the US state name it belongs to.
# It is passed to dp.fill_missing_state further down; presumably it is used to
# fill a missing `state` from the row's `region` -- confirm against
# utils/data_preprocessing.
area_mapping = {
    # Regions taken from the train data
    'SF bay area': 'California',
    'ashtabula': 'Ohio',
    'brainerd': 'Minnesota',
    'brownsville': 'Texas',
    'columbia': 'South Carolina',
    'columbia / jeff city': 'Missouri',
    'daytona beach': 'Florida',
    'dubuque': 'Iowa',
    'el paso': 'Texas',
    'flagstaff / sedona': 'Arizona',
    'florence': 'South Carolina',
    'florida keys': 'Florida',
    'galveston': 'Texas',
    'grand forks': 'North Dakota',
    'grand rapids': 'Michigan',
    'great falls': 'Montana',
    'heartland florida': 'Florida',
    'imperial county': 'California',
    'joplin': 'Missouri',
    'kalispell': 'Montana',
    'lakeland': 'Florida',
    'las vegas': 'Nevada',
    'lawton': 'Oklahoma',
    'long island': 'New York',
    'manhattan': 'New York',
    'merced': 'California',
    'minneapolis / st paul': 'Minnesota',
    'morgantown': 'West Virginia',
    'moses lake': 'Washington',
    'nashville': 'Tennessee',
    'northeast SD': 'South Dakota',
    'northwest KS': 'Kansas',
    'panama city': 'Florida',
    'pittsburgh': 'Pennsylvania',
    'poconos': 'Pennsylvania',
    'pullman / moscow': 'Washington',
    'raleigh / durham / CH': 'North Carolina',
    'rockford': 'Illinois',
    'salem': 'Oregon',
    'san antonio': 'Texas',
    'san diego': 'California',
    'savannah / hinesville': 'Georgia',
    'southern WV': 'West Virginia',
    'southwest VA': 'Virginia',
    'spokane / coeur d\'alene': 'Washington',
    'tallahassee': 'Florida',
    'tucson': 'Arizona',
    'utica-rome-oneida': 'New York',
    'valdosta': 'Georgia',
    'vermont': 'Vermont',
    'waterloo / cedar falls': 'Iowa',
    'watertown': 'New York',
    'western KY': 'Kentucky',
    'yuba-sutter': 'California',
    'yuma': 'Arizona',
    # Regions taken from the test data
    'birmingham': 'Alabama',
    'central michigan': 'Michigan',
    'charleston': 'South Carolina',
    'cleveland': 'Ohio',
    'east oregon': 'Oregon',
    'eastern NC': 'North Carolina',
    'glens falls': 'New York',
    'hanford-corcoran': 'California',
    'huntsville / decatur': 'Alabama',
    'jersey shore': 'New Jersey',
    'lafayette': 'Louisiana',
    'muskegon': 'Michigan',
    'ocala': 'Florida',
    'prescott': 'Arizona',
    'rochester': 'New York',
    'siskiyou county': 'California',
    'texarkana': 'Texas',
    'waco': 'Texas',
    'western slope': 'Colorado',
    'wyoming': 'Wyoming'
}

# 前処理

In [None]:
# manufacturer
# Apply the same cleaning to train and test: both frames are one-hot encoded on
# `manufacturer` later, so test must go through the identical step.
# Assumes dp.preprocess_manufacturer only needs columns that test also has -- confirm.
train = dp.preprocess_manufacturer(train)
test = dp.preprocess_manufacturer(test)

In [None]:
# year
# `year` is a model feature for both frames, so test gets the same correction as train.
# Assumes dp.fix_year_column only needs columns that test also has -- confirm.
train = dp.fix_year_column(train)
test = dp.fix_year_column(test)

In [None]:
# odometer: turn negative readings into positive ones.
# .abs() replaces the former (x**2)**0.5 round trip, which can overflow int64 on
# very large readings and needlessly goes through floating point.
# test is fixed the same way because `odometer` is also a feature at prediction time.
train["odometer"] = train["odometer"].abs()
test["odometer"] = test["odometer"].abs()

In [None]:
# size
# Both frames are one-hot encoded on `size` later, so normalise test as well;
# otherwise its dummy columns will not line up with the training ones.
# Assumes dp.normalize_size_column only needs columns that test also has -- confirm.
train = dp.normalize_size_column(train)
test = dp.normalize_size_column(test)

In [None]:
# state
# area_mapping also lists regions that come from the test data, so run the same
# fill on test.
# NOTE(review): the return value is not assigned, so this relies on
# dp.fill_missing_state modifying the frame in place -- confirm in utils/data_preprocessing.
dp.fill_missing_state(train, area_mapping)
dp.fill_missing_state(test, area_mapping)

In [None]:
# fuel, title_status, type (currently filled with the most frequent value)
# test is filled too so that its `fuel` / `type` dummies are built from complete columns.
# NOTE(review): the return value is not assigned, so this relies on
# dp.fillna_with_mode modifying the frame in place; the mode used for test is
# presumably computed from test itself -- confirm in utils/data_preprocessing.
dp.fillna_with_mode(train, ['fuel', 'title_status', 'type'])
dp.fillna_with_mode(test, ['fuel', 'title_status', 'type'])

In [None]:
# Remaining missing values per column after preprocessing.
train.isna().sum()

In [None]:
# Summary statistics of the odometer column.
odometer_summary = train["odometer"].describe()
odometer_summary

In [None]:
# Number of distinct `state` values, counting NaN as its own category.
train["state"].nunique(dropna=False)

In [None]:
# Number of distinct `region` values, counting NaN as its own category.
train["region"].nunique(dropna=False)

# 特徴量の追加

In [None]:
# Disabled experiment: bucket odometer into bins of 5000 via floor division.
# train["odometer_bin5000"]=train["odometer"]//5000
# train.head()

In [None]:
# Disabled experiment: popularity feature = number of rows per
# (manufacturer, type, size) group. On its own it currently lowers accuracy.
# NOTE(review): the snippet reads `df_train`, which is not defined in the
# visible notebook (the frame is called `train`) -- fix before re-enabling.
# df_mts=df_train[["id","manufacturer","type","size"]].groupby(["manufacturer","type","size"]).count().reset_index()
# df_mts.columns=["manufacturer","type","size","mts_counts"]
# df_mts
# train=pd.merge(train,df_mts,on=["manufacturer","type","size"],how="left")

In [None]:
# Replace price with its base-10 logarithm; the model is trained on this scale.
# Re-running this cell transforms the column again, so run it once per kernel.
train["price"] = train["price"].map(math.log10)
train

In [None]:
# One-hot encode the categorical features.
# train and test are encoded independently, so the resulting dummy columns can
# differ between the two frames.
one_hot_columns = ["condition", "manufacturer", "type", "size", "state", "region", "fuel"]
train = pd.get_dummies(train, columns=one_hot_columns)
test = pd.get_dummies(test, columns=one_hot_columns)

# データの分割

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Disabled: earlier 80/20 hold-out split of the full train frame.
# NOTE: while this line stays commented out, `train_train` and `train_test`
# are not defined anywhere in the visible notebook.
# train_train, train_test = train_test_split(train, test_size=0.2,random_state=0)

In [None]:
# price is the prediction target, so keep it in its own variable.
target = train["price"]

# Columns that are not used as model features.
unused_columns = ['id', 'cylinders', 'title_status', 'transmission', 'drive', 'paint_color']
train_set = train.drop(columns=unused_columns + ["price"])

# train and test were one-hot encoded independently, so their dummy columns can
# differ. Reindex test onto the training feature columns (dummies missing in
# test become 0, dummies unseen in train are dropped, order follows train) so
# the model receives exactly the features it was fitted on.
test_set = test.drop(columns=unused_columns).reindex(columns=train_set.columns, fill_value=0)

In [None]:
# Hold out a validation set (scikit-learn's default test fraction) with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_set,
    target,
    random_state=82,
)
print(*(part.shape for part in (X_train, X_valid, y_train, y_valid)))

In [None]:
# Size of the training split. `train_train` is never defined (its split is
# commented out), so use the split that is actually created: X_train.
len(X_train)

In [None]:
# Size of the validation split. `train_test` is never defined (its split is
# commented out), so use the split that is actually created: X_valid.
len(X_valid)

In [None]:
# Preview the validation features. `train_test` is never defined (its split is
# commented out); show the first rows of X_valid instead of a full frame.
X_valid.head()

# 学習と検証

In [None]:
# Disabled: pycaret is not used anywhere in the visible notebook.
# import pycaret

In [None]:
# Train a random forest regressor on the log10(price) target; the seed is
# fixed so the run is reproducible.
model = RandomForestRegressor(random_state=100)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the validation features (values are on the log10(price) scale).
pred = model.predict(X=X_valid)

In [None]:
# Evaluation
# The target was log10-transformed before training, so y_valid and pred are
# both on the log10 scale. Convert them back to prices before scoring;
# otherwise the percentage error is measured on log10(price), not on price.
score = mean_absolute_percentage_error(10 ** y_valid, 10 ** pred)
print(score*100)

# 提出ファイルの出力

In [None]:
# Predict on the test features. The model was trained on log10(price), so these
# values are on the log10 scale; apply 10 ** predict to get prices.
predict = model.predict(X=test_set)
predict