In [1]:
import pandas as pd
from supplemental_english import REGION_CODES,GOVERNMENT_CODES
# pd.set_option('display.max_rows',None)

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['price'] = 0

In [84]:
def parse(df_train, df_test):
    
    df = pd.concat([df_train, df_test], axis=0)

    def plates(plate):
        letter = plate[0] + plate[4] + plate[5]
        number = plate[1:4]
        region = plate[6:]
        return pd.Series([letter,number,region])
    
    def region_info(region_code):
        for region, codes in REGION_CODES.items():
            if region_code in codes:
                return region
        return 'U'
    
    def goverment(letter,number, region_code):
        for (letter_code,num_range, reg_cod), value in GOVERNMENT_CODES.items():
            if (letter == letter_code 
                and num_range[0]<=number<=num_range[1] 
                and region_code==reg_cod
            ):
                 return pd.Series([value[1], value[2], value[3]], 
                             index=['forbidden', 'advanteg', 'significance'])
        return pd.Series([-1, -1, 0], 
                     index=['forbidden', 'advanteg', 'significance'])

    df[['letter','number','region']] = df['plate'].apply(plates)
    df['number'] = pd.to_numeric(df['number'])
    df.drop(columns=['plate'], axis=1, inplace=True)

    df['region_name'] = df['region'].apply(region_info)
    df[['forbidden','advanteg','significance']] = df.apply(
    lambda row: goverment(row['letter'], row['number'], row['region']), axis=1)

    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.day_of_week
    df['is_weekend'] = df['dayofweek'].isin([5,6]).astype(int)
    df['day_since'] = (df['date'].max()-df['date']).dt.days
    df.drop(columns=['date','id'], axis=1, inplace=True)

    df['let_1'] = df['letter'].str[0]
    df['let_2'] = df['letter'].str[1]
    df['let_3'] = df['letter'].str[2]
    df.drop(columns='letter', axis=1, inplace=True)

    df['region'] = pd.to_numeric(df['region'])

    letter = {'A':0,'M':1,'O':2,'B':3,'C':4,'X':5,'E':6,'K':7,'P':8,'H':9,'T':10,'Y':11}

    for i in ['let_1','let_2','let_3']:
        df[i] = df[i].map(letter)

    avg_price = df.groupby(['region_name','significance'])['price'].mean().reset_index()
    avg_price = avg_price.rename(columns={'price':'avg_price'})
    df = df.merge(avg_price, on=['region_name','significance'])

    df = pd.get_dummies(df, columns=['region_name'])

    df['combine'] = df['advanteg'] + df['significance'] + df['forbidden']

    df['price_diff_avg'] = df['price'] - df['avg_price']
    
    return df

In [85]:
df = parse(df_train,df_test)
df.head()

Unnamed: 0,price,number,region,forbidden,advanteg,significance,year,month,day,dayofweek,...,region_name_Ulyanovsk Oblast,region_name_Vladimir Oblast,region_name_Volgograd Oblast,region_name_Vologda Oblast,region_name_Voronezh Oblast,region_name_Yamalo-Nenets Autonomous Okrug,region_name_Yaroslavl Oblast,region_name_Zabaykalsky Krai,combine,price_diff_avg
0,65000,59,797,-1,-1,0,2024,12,26,3,...,False,False,False,False,False,False,False,False,-2,-344898.608485
1,100000,800,790,-1,-1,0,2024,7,12,4,...,False,False,False,False,False,False,False,False,-2,-176465.256246
2,290000,212,77,-1,-1,0,2024,4,18,3,...,False,False,False,False,False,False,False,False,-2,-119898.608485
3,680000,1,199,-1,-1,0,2025,1,3,4,...,False,False,False,False,False,False,False,False,-2,270101.391515
4,750000,1,199,-1,-1,0,2025,1,10,4,...,False,False,False,False,False,False,False,False,-2,340101.391515


In [86]:
df.corr()['price'].sort_values(ascending=False)

price                        1.000000
price_diff_avg               0.976677
avg_price                    0.214715
significance                 0.147784
combine                      0.140166
                               ...   
region_name_Moscow Oblast   -0.036905
day_since                   -0.040945
let_2                       -0.042148
let_1                       -0.048211
region                      -0.108162
Name: price, Length: 106, dtype: float64

In [89]:
train,test = df[:len(df_train)], df[len(df_train):]

In [88]:
train.corr()['price'].sort_values(ascending=False)

price                        1.000000
price_diff_avg               0.979687
avg_price                    0.231938
significance                 0.160626
combine                      0.152402
                               ...   
region_name_Moscow Oblast   -0.041576
let_2                       -0.044976
day_since                   -0.047711
let_1                       -0.052223
region                      -0.115608
Name: price, Length: 106, dtype: float64

In [90]:
X = train.drop(columns='price')
y = train['price']

In [124]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from catboost import CatBoostRegressor

In [92]:
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=0.2)

In [123]:
scaled = MinMaxScaler()
X_train_scaled = scaled.fit_transform(X_train)
X_test_scaled = scaled.transform(X_test)

In [94]:
def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0  # Hindari pembagian nol
    return np.mean(diff) * 100

In [125]:
model = RandomForestRegressor()
model.fit(X_train_scaled,Y_train)
accuracy = model.predict(X_test_scaled)
accuracy =  smape(Y_test,accuracy)
accuracy

np.float64(0.41617140186070223)

In [96]:
model_1 = XGBRegressor()
model_1.fit(X_train, Y_train)
accuracy = model_1.predict(X_test)
accuracy = smape(Y_test,accuracy)
accuracy

np.float64(3.8378485205110677)

In [None]:
model_2 = LGBMRegressor()
model_2.fit(X_train, Y_train)
accuracy = model_2.predict(X_test)
accuracy = smape(Y_test,accuracy)
accuracy

In [128]:
accuracy = model_2.predict(test)
final = pd.DataFrame()
final['id'] = df_test['id']
final['price'] = accuracy
final.to_csv("Submission.csv", index=False)

In [98]:
model_3 = DecisionTreeRegressor()
model_3.fit(X_train, Y_train)
accuracy = model_3.predict(X_test)
accuracy = smape(Y_test,accuracy)
accuracy

np.float64(0.4174754912863935)

In [99]:
model_4 = CatBoostRegressor()
model_4.fit(X_train_scaled,Y_train)
accuracy = model_4.predict(X_test_scaled)
accuracy = smape(Y_test, accuracy)
accuracy

Learning rate set to 0.073708
0:	learn: 1788348.2128534	total: 286ms	remaining: 4m 45s
1:	learn: 1712880.7974643	total: 363ms	remaining: 3m 1s
2:	learn: 1635560.5586664	total: 492ms	remaining: 2m 43s
3:	learn: 1567716.5043241	total: 580ms	remaining: 2m 24s
4:	learn: 1499969.4969906	total: 666ms	remaining: 2m 12s
5:	learn: 1443114.6476010	total: 732ms	remaining: 2m 1s
6:	learn: 1388330.1592045	total: 788ms	remaining: 1m 51s
7:	learn: 1338479.1525354	total: 880ms	remaining: 1m 49s
8:	learn: 1293077.8394101	total: 955ms	remaining: 1m 45s
9:	learn: 1247775.4990629	total: 1.07s	remaining: 1m 46s
10:	learn: 1206589.9379724	total: 1.19s	remaining: 1m 46s
11:	learn: 1170330.5918110	total: 1.26s	remaining: 1m 43s
12:	learn: 1134393.1074976	total: 1.64s	remaining: 2m 4s
13:	learn: 1103389.5502225	total: 2.25s	remaining: 2m 38s
14:	learn: 1073534.0286638	total: 2.79s	remaining: 3m 2s
15:	learn: 1045330.2362389	total: 3.17s	remaining: 3m 14s
16:	learn: 1020254.3321914	total: 3.32s	remaining: 3m 12

np.float64(5.915474423123523)