In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
import numpy as np
import re
import time


In [3]:
# Load datasets
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')


In [4]:
# Features and target variable
X_train = train_data.drop(columns=['price', 'id'])
y_train = train_data['price']
X_test = test_data.drop(columns=['id'])

In [5]:
# Function to extract numeric horsepower
def extract_hp(engine_str):
    match = re.search(r'(\d+\.\d+|\d+)HP', engine_str)
    if match:
        return float(match.group(1))
    else:
        return np.nan

# Apply function to extract horsepower
if 'engine' in X_train.columns:
    X_train['engine_hp'] = X_train['engine'].apply(extract_hp)
if 'engine' in X_test.columns:
    X_test['engine_hp'] = X_test['engine'].apply(extract_hp)

In [6]:
# Drop the original 'engine' feature
X_train.drop(columns=['engine'], inplace=True, errors='ignore')
X_test.drop(columns=['engine'], inplace=True, errors='ignore')

# Drop rows with NaN values in 'engine_hp' (if any)
X_train = X_train.dropna(subset=['engine_hp'])
y_train = y_train.loc[X_train.index]

In [20]:
# Fill missing values in 'engine_hp' with the mean of the column
X_train['engine_hp'].fillna(X_train['engine_hp'].mean(), inplace=True)
X_test['engine_hp'].fillna(X_test['engine_hp'].mean(), inplace=True)

# Define numerical and categorical features
numeric_features = ['model_year', 'milage', 'engine_hp']
categorical_features = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']

# Create transformers for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline that combines preprocessing and the ElasticNet model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet(random_state=42))
])

# Define hyperparameter grid for tuning
param_grid = {
    'regressor__alpha': [0.1, 1.0, 10.0],
    'regressor__l1_ratio': [0.1, 0.5, 0.9]
}

# Set up GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model using GridSearchCV
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"GridSearchCV training time: {elapsed_time:.2f} seconds")

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_test_pred = best_model.predict(X_test)

GridSearchCV training time: 169.02 seconds


In [21]:
# Since we don't have true price labels for the test set, we'll calculate RMSE on a portion of the training data held out as validation set
# Split the training data into a smaller training set and a validation set
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the best model on the split training data
best_model.fit(X_train_split, y_train_split)

# Predict on the validation set
y_val_pred = best_model.predict(X_val)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE: {rmse:.2f}")

# Save the predictions to a CSV file in the format [id, price]
submission = pd.DataFrame({
    'id': test_data['id'],
    'price': y_test_pred
})
submission.to_csv('/content/predicted_prices_Elasticnet.csv', index=False)

Validation RMSE: 66195.88


In [30]:
print("Length of ID column in test_data:", len(test_data['id']))
print("Length of predicted prices array:", len(y_pred))


Length of ID column in test_data: 36183
Length of predicted prices array: 33577


In [19]:
# Feature importance
importances = best_model.named_steps['regressor'].feature_importances_
feature_names = numeric_features + list(best_model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_features))

# Create a DataFrame for visualization
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print(feature_importances)

AttributeError: 'ElasticNet' object has no attribute 'feature_importances_'