In [6]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder
from catboost import CatBoostRegressor
import lightgbm as lgb

# Load training and test data
train_data = pd.read_csv('/Users/vijeethvj8/Documents/Kaggle projects/playground-series-s4e9/train.csv')
test_data = pd.read_csv('/Users/vijeethvj8/Documents/Kaggle projects/playground-series-s4e9/test.csv')

# Check if the data loaded correctly
print("Training Data:")
print(train_data.head())
print("\nTest Data:")
print(test_data.head())


Training Data:
   id          brand              model  model_year  milage      fuel_type  \
0   0           MINI      Cooper S Base        2007  213000       Gasoline   
1   1        Lincoln              LS V8        2002  143250       Gasoline   
2   2      Chevrolet  Silverado 2500 LT        2002  136731  E85 Flex Fuel   
3   3        Genesis   G90 5.0 Ultimate        2017   19500       Gasoline   
4   4  Mercedes-Benz        Metris Base        2021    7388       Gasoline   

                                              engine  \
0       172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel   
1       252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel   
2  320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...   
3       420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel   
4       208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col  \
0                             A/T  Yellow    Gray   
1                             A/T  Silver   Beige   
2                   

In [7]:
# Define a function to extract horsepower from the 'engine' column
def extract_horsepower(engine_str):
    match = re.search(r"(\d+\.\d+|\d+)HP", engine_str)
    return float(match.group(1)) if match else np.nan

# Apply horsepower extraction to both train and test data
train_data['horsepower'] = train_data['engine'].apply(lambda x: extract_horsepower(x) if pd.notnull(x) else np.nan)
test_data['horsepower'] = test_data['engine'].apply(lambda x: extract_horsepower(x) if pd.notnull(x) else np.nan)

# Drop the original 'engine' column as we now have horsepower
train_data = train_data.drop('engine', axis=1)
test_data = test_data.drop('engine', axis=1)

# Add a placeholder 'price' column to the test data to match the train data structure
test_data['price'] = 0  # Placeholder value for consistency with the training set

print("\nTraining Data Columns after horsepower extraction:")
print(train_data.columns)

print("\nTest Data Columns after horsepower extraction:")
print(test_data.columns)



Training Data Columns after horsepower extraction:
Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price', 'horsepower'],
      dtype='object')

Test Data Columns after horsepower extraction:
Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'horsepower', 'price'],
      dtype='object')


In [8]:
# Encode categorical variables with TargetEncoder
encoder = TargetEncoder(cols=['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title'])

# Fit the encoder on the training data and transform both training and test data
train_data_encoded = encoder.fit_transform(train_data, train_data['price'])
test_data_encoded = encoder.transform(test_data)

# Drop the placeholder 'price' column from the test set
test_data_encoded = test_data_encoded.drop(columns=['price'])

print("\nTraining Data Columns after encoding:")
print(train_data_encoded.columns)

print("\nTest Data Columns after encoding:")
print(test_data_encoded.columns)



Training Data Columns after encoding:
Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price', 'horsepower'],
      dtype='object')

Test Data Columns after encoding:
Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'horsepower'],
      dtype='object')


In [9]:
# Prepare features and target variable
X = train_data_encoded.drop(columns=['price'])
y = train_data_encoded['price']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirm that the split was successful
print("Shapes of X_train and X_val:")
print(X_train.shape, X_val.shape)


Shapes of X_train and X_val:
(150826, 12) (37707, 12)


In [10]:
# Train CatBoost Model
catboost_model = CatBoostRegressor(verbose=100, random_seed=42)

# Train CatBoost model on training data
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)

# Evaluate CatBoost Model
catboost_predictions = catboost_model.predict(X_val)
catboost_rmse = np.sqrt(mean_squared_error(y_val, catboost_predictions))
print(f"CatBoost Validation RMSE: {catboost_rmse}")


Learning rate set to 0.111922
0:	learn: 78727.8439478	test: 73437.2812691	best: 73437.2812691 (0)	total: 65.8ms	remaining: 1m 5s
100:	learn: 70923.2382318	test: 67882.8564584	best: 67838.6431071 (63)	total: 582ms	remaining: 5.18s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 67838.64311
bestIteration = 63

Shrink model to first 64 iterations.
CatBoost Validation RMSE: 67838.64310712501


In [11]:
# Train LightGBM Model
lgb_train_data = lgb.Dataset(X_train, label=y_train)
lgb_eval_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train_data)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9
}

# Train LightGBM model
lightgbm_model = lgb.train(params,
                           lgb_train_data,
                           num_boost_round=1000,
                           valid_sets=[lgb_eval_data],
                           callbacks=[
                               lgb.early_stopping(stopping_rounds=100),
                               lgb.log_evaluation(period=100)
                           ])

# Evaluate LightGBM Model
lightgbm_predictions = lightgbm_model.predict(X_val, num_iteration=lightgbm_model.best_iteration)
lightgbm_rmse = np.sqrt(mean_squared_error(y_val, lightgbm_predictions))
print(f"LightGBM Validation RMSE: {lightgbm_rmse}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1480
[LightGBM] [Info] Number of data points in the train set: 150826, number of used features: 12
[LightGBM] [Info] Start training from score 43890.785316
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 67563.6
Early stopping, best iteration is:
[88]	valid_0's rmse: 67549.4
LightGBM Validation RMSE: 67549.40574413467


In [15]:
# Check if the 'price' column exists in test_data_encoded, and add it if necessary
if 'price' not in test_data_encoded.columns:
    test_data_encoded['price'] = 0  # Add a placeholder column for consistency

# Use the best model to predict on the test set
try:
    test_features = test_data_encoded.drop(columns=['price'])
except KeyError:
    # The 'price' column might not exist, continue without dropping
    test_features = test_data_encoded

# Generate predictions
test_predictions = best_model.predict(test_features)

# Step 11: Save predictions to a CSV file
test_data_encoded['price'] = test_predictions
test_data_encoded[['id', 'price']].to_csv('test_predictions.csv', index=False)

print("Predictions saved to 'test_predictions.csv'")


Predictions saved to 'test_predictions.csv'
