In [68]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import datetime
import unicodedata
import numpy as np

In [69]:
data = pd.read_csv("train.csv")
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [70]:
#　表記ゆれ修正


# Convert all values to lowercase and replace full-width characters with half-width characters
data['manufacturer'] = data['manufacturer'].str.lower().apply(lambda x: unicodedata.normalize('NFKC', x))

# Manually fix specific character inconsistencies
data['manufacturer'] = data['manufacturer'].replace({
    'niѕsan': 'nissan',
    'nisѕan': 'nissan',
    'subαru': 'subaru',
    'sαturn': 'saturn',
    'аcura': 'acura',
    'vоlkswagen': 'volkswagen',
    'lexuѕ': 'lexus',
    'ᴄhrysler': 'chrysler'
})


In [71]:
# sizeの表記ゆれ

size_corrections = {
    "full−size": "full-size",
    "fullーsize": "full-size",
    "mid−size": "mid-size",
    "midーsize": "mid-size",
    "subーcompact": "sub-compact"
}

data['size'] = data['size'].replace(size_corrections)


In [72]:
# yearの値修正


current_year = datetime.datetime.now().year

# Identify years that are greater than current year + 1
anomalous_years = data[data['year'] > current_year + 1]['year'].unique()

# Correct the anomalous years by subtracting 1000
data['year'] = data['year'].replace({year: year - 1000 for year in anomalous_years})


In [73]:
# 走行距離の分類と車製造年の特徴量追加→精度悪化したので保留

# quantiles = data['odometer'].quantile([0.25, 0.5, 0.75])
# conditions = [
#     data['odometer'] <= quantiles[0.25],
#     (data['odometer'] > quantiles[0.25]) & (data['odometer'] <= quantiles[0.5]),
#     data['odometer'] > quantiles[0.5]
# ]
# choices = ['low', 'medium', 'high']

# data['odometer_usage'] = np.select(conditions, choices, default='unknown')

# # 2. Create a new feature for car age
# data['car_age'] = current_year - data['year']

# # Check the first few rows of the dataframe to confirm the changes
# data[['odometer', 'odometer_usage', 'year', 'car_age']].head()

Unnamed: 0,odometer,odometer_usage,year,car_age
0,115148,high,1949,74
1,172038,high,2013,10
2,152492,high,1998,25
3,104118,medium,2014,9
4,144554,high,2005,18


In [74]:
from sklearn.model_selection import train_test_split

# Filling missing values
data['fuel'].fillna(data['fuel'].mode()[0], inplace=True)
data['title_status'].fillna(data['title_status'].mode()[0], inplace=True)
data['type'].fillna(data['type'].mode()[0], inplace=True)
data['state'].fillna(data['state'].mode()[0], inplace=True)

# Encoding categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = data[col].astype('category').cat.codes

# Splitting the data into train and validation sets (80:20)
train_data, valid_data = train_test_split(data, test_size=0.2, random_state=42)

# Separating the target variable
X_train = train_data.drop(columns=['price', 'id'])
y_train = train_data['price']
X_valid = valid_data.drop(columns=['price', 'id'])
y_valid = valid_data['price']

X_train.shape, X_valid.shape


((22025, 16), (5507, 16))

In [75]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

# Create LightGBM dataset
train_dataset = lgb.Dataset(X_train, label=y_train)
valid_dataset = lgb.Dataset(X_valid, label=y_valid, reference=train_dataset)

# Set parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'mape',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model
num_round = 1000
bst = lgb.train(params, train_dataset, num_round, valid_sets=[valid_dataset], early_stopping_rounds=10, verbose_eval=False)

# Predict on validation set
y_pred = bst.predict(X_valid, num_iteration=bst.best_iteration)

# Calculate MAPE
mape = mean_absolute_error(y_valid, y_pred) / (y_valid.sum() / len(y_valid)) * 100

mape


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 804
[LightGBM] [Info] Number of data points in the train set: 22025, number of used features: 16
[LightGBM] [Info] Start training from score 13477.637321


43.530333650987465

In [29]:
df_train.info()
# df_train["region"].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27532 entries, 0 to 27531
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            27532 non-null  int64 
 1   region        27532 non-null  object
 2   year          27532 non-null  int64 
 3   manufacturer  27532 non-null  object
 4   condition     27532 non-null  object
 5   cylinders     27532 non-null  object
 6   fuel          26293 non-null  object
 7   odometer      27532 non-null  int64 
 8   title_status  27076 non-null  object
 9   transmission  27532 non-null  object
 10  drive         27532 non-null  object
 11  size          27532 non-null  object
 12  type          27076 non-null  object
 13  paint_color   27532 non-null  object
 14  state         24228 non-null  object
 15  price         27532 non-null  int64 
dtypes: int64(4), object(12)
memory usage: 3.4+ MB


In [51]:
data.sort_values(by="year", ascending=False)

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price
6576,6576,"washington, DC",2022,fiat,like new,4 cylinders,gas,90464,rebuilt,automatic,fwd,compact,hatchback,grey,dc,14732
25053,25053,central NJ,2022,nissan,like new,4 cylinders,gas,16945,clean,automatic,fwd,mid-size,SUV,black,nj,29134
3452,3452,greensboro,2022,chevrolet,excellent,4 cylinders,gas,102290,clean,automatic,4wd,full-size,truck,blue,,49995
10280,10280,south bend / michiana,2022,ford,excellent,4 cylinders,gas,68142,rebuilt,manual,fwd,mid-size,sedan,blue,in,37533
24052,24052,greenville / upstate,2022,audi,like new,4 cylinders,gas,165074,clean,manual,fwd,mid-size,wagon,blue,sc,20767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14338,14338,beaumont / port arthur,1925,bmw,excellent,8 cylinders,gas,85,clean,manual,rwd,full-size,coupe,blue,,17358
21233,21233,central NJ,1925,ford,excellent,8 cylinders,gas,67467,clean,automatic,rwd,full-size,truck,black,nj,17590
19319,19319,palm springs,1922,mercedes-benz,excellent,6 cylinders,gas,123190,clean,automatic,rwd,full-size,coupe,custom,ca,31349
5097,5097,san antonio,1922,bmw,excellent,6 cylinders,gas,33865,salvage,automatic,fwd,mid-size,sedan,silver,tx,2826


In [61]:
data["state"].unique()

array([nan, 'pa', 'ks', 'ny', 'ca', 'al', 'or', 'va', 'mt', 'nj', 'ma',
       'wi', 'sc', 'wa', 'dc', 'oh', 'in', 'de', 'fl', 'nm', 'az', 'ok',
       'mn', 'co', 'nv', 'wv', 'tn', 'mi', 'ri', 'il', 'tx', 'ut', 'ia',
       'ga', 'md', 'mo', 'ky', 'nc', 'ak', 'id', 'ct', 'wy', 'nd', 'me',
       'ar', 'hi', 'sd', 'ne', 'nh', 'vt', 'la', 'ms'], dtype=object)