In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
import numpy as np
import re
import time


In [3]:
# Load datasets
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')


In [4]:
# Features and target variable
X_train = train_data.drop(columns=['price', 'id'])
y_train = train_data['price']
X_test = test_data.drop(columns=['id'])

In [5]:
# Function to extract numeric horsepower
def extract_hp(engine_str):
    match = re.search(r'(\d+\.\d+|\d+)HP', engine_str)
    if match:
        return float(match.group(1))
    else:
        return np.nan

# Apply function to extract horsepower
if 'engine' in X_train.columns:
    X_train['engine_hp'] = X_train['engine'].apply(extract_hp)
if 'engine' in X_test.columns:
    X_test['engine_hp'] = X_test['engine'].apply(extract_hp)

In [6]:
# Drop the original 'engine' feature
X_train.drop(columns=['engine'], inplace=True, errors='ignore')
X_test.drop(columns=['engine'], inplace=True, errors='ignore')

# Drop rows with NaN values in 'engine_hp' (if any)
X_train = X_train.dropna(subset=['engine_hp'])
y_train = y_train.loc[X_train.index]

In [14]:
# Define numerical and categorical features
numeric_features = ['model_year', 'milage', 'engine_hp']
categorical_features = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']

# Create transformers for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline that combines preprocessing and the LightGBM model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(random_state=42))
])

# Define hyperparameter grid for tuning
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__num_leaves': [31, 50, 100]
}

# Set up GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model using GridSearchCV
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"GridSearchCV training time: {elapsed_time:.2f} seconds")

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_test_pred = best_model.predict(X_test)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1983
[LightGBM] [Info] Number of data points in the train set: 50216, number of used features: 733
[LightGBM] [Info] Start training from score 37478.310419
GridSearchCV training time: 217.72 seconds


In [15]:
# Since we don't have true price labels for the test set, we'll calculate RMSE on a portion of the training data held out as validation set
# Split the training data into a smaller training set and a validation set
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the best model on the split training data
best_model.fit(X_train_split, y_train_split)

# Predict on the validation set
y_val_pred = best_model.predict(X_val)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE: {rmse:.2f}")

# Save the predictions to a CSV file in the format [id, price]
submission = pd.DataFrame({
    'id': test_data['id'],
    'price': y_test_pred
})
submission.to_csv('/content/predicted_prices.csv', index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.127616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1750
[LightGBM] [Info] Number of data points in the train set: 40172, number of used features: 617
[LightGBM] [Info] Start training from score 37318.183635
Validation RMSE: 66131.66


In [30]:
print("Length of ID column in test_data:", len(test_data['id']))
print("Length of predicted prices array:", len(y_pred))


Length of ID column in test_data: 36183
Length of predicted prices array: 33577


In [16]:
# Feature importance
importances = best_model.named_steps['regressor'].feature_importances_
feature_names = numeric_features + list(best_model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_features))

# Create a DataFrame for visualization
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print(feature_importances)

                              feature  importance
1                              milage        1992
0                          model_year        1482
2                           engine_hp        1421
43                      brand_Porsche         296
1557                   model_Tahoe LT         252
...                               ...         ...
639         model_Expedition Platinum           0
638          model_Expedition Max XLT           0
637      model_Expedition Max Limited           0
636   model_Expedition Max King Ranch           0
1867                  clean_title_Yes           0

[1868 rows x 2 columns]
