In [4]:
import numpy as np, pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from google.colab import drive

In [5]:
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
train = pd.read_csv("/content/drive/MyDrive/dsn/train.csv")
test =  pd.read_csv("/content/drive/MyDrive/dsn/test.csv")


In [7]:
train.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [8]:
test.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [10]:
assert 'price' in train.columns and 'id' in test.columns, "train must contain 'price' and test must contain 'id'"

y = train["price"].copy()
train_ids = train["id"]
test_ids = test["id"]

X = train.drop(["id", "price"], axis=1)
X_test = test.drop(["id"], axis=1)

In [11]:
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

for c in num_cols:
    med = X[c].median()
    X[c] = X[c].fillna(med)
    X_test[c] = X_test[c].fillna(med)

for c in cat_cols:
    X[c] = X[c].astype(str).fillna("NA")
    X_test[c] = X_test[c].astype(str).fillna("NA")

enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
all_cat = pd.concat([X[cat_cols], X_test[cat_cols]], axis=0)
enc.fit(all_cat)

X_cat = enc.transform(X[cat_cols]) if cat_cols else np.empty((len(X),0))
X_test_cat = enc.transform(X_test[cat_cols]) if cat_cols else np.empty((len(X_test),0))

X_enc = np.hstack([X[num_cols].values, X_cat])
X_test_enc = np.hstack([X_test[num_cols].values, X_test_cat])

In [12]:

y_log = np.log1p(y.values)

In [13]:

kf = KFold(n_splits=5, shuffle=True, random_state=42)
preds = np.zeros(X_test_enc.shape[0])
cv_scores = []

params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.03,   # lower lr
    "num_leaves": 127,       # larger leaves
    "max_depth": -1,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
    "min_child_samples": 20,
    "lambda_l1": 1.0,
    "lambda_l2": 1.0,
    "seed": 42,
    "verbosity": -1,
}

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_enc), 1):
    X_tr, X_val = X_enc[tr_idx], X_enc[val_idx]
    y_tr, y_val = y_log[tr_idx], y_log[val_idx]

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=10000,
        valid_sets=[dtrain, dval],
        callbacks=[early_stopping(200), log_evaluation(500)]
    )

    val_pred_log = model.predict(X_val, num_iteration=model.best_iteration)
    val_rmse = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(val_pred_log)))
    cv_scores.append(val_rmse)
    print(f"Fold {fold} RMSE: {val_rmse:.4f}")

    preds += np.expm1(model.predict(X_test_enc, num_iteration=model.best_iteration)) / kf.n_splits

print("CV RMSE per fold:", cv_scores)
print("Mean CV RMSE:", np.mean(cv_scores))

Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 0.452077	valid_1's rmse: 0.4903
Early stopping, best iteration is:
[615]	training's rmse: 0.446186	valid_1's rmse: 0.490247
Fold 1 RMSE: 68531.5017
Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 0.452682	valid_1's rmse: 0.48708
Early stopping, best iteration is:
[531]	training's rmse: 0.450916	valid_1's rmse: 0.486973
Fold 2 RMSE: 68991.1029
Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 0.452439	valid_1's rmse: 0.489979
Early stopping, best iteration is:
[576]	training's rmse: 0.448414	valid_1's rmse: 0.489821
Fold 3 RMSE: 74413.7709
Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 0.452676	valid_1's rmse: 0.487716
Early stopping, best iteration is:
[572]	training's rmse: 0.448745	valid_1's rmse: 0.487569
Fold 4 RMSE: 77140.4095
Training until validation scores don't improve for 200 rounds
[

In [16]:
submission = pd.DataFrame({"id": test_ids, "price": preds})
submission.to_csv('/content/drive/MyDrive/dsn/submission.csv', index=False)

print("Saved submission.csv")

Saved submission.csv
