<a href="https://colab.research.google.com/github/Sumit-21107003/used_car_price_pred/blob/main/used_car_price_176.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
data = pd.read_csv('train1.csv')

In [None]:
def process_engine_column(engine_col):
    horsepower = engine_col.str.extract(r'(\d+\.?\d*)HP')[0].astype(float)  # Extract horsepower
    engine_size = engine_col.str.extract(r'(\d+\.?\d*)L')[0].astype(float)  # Extract engine size
    return horsepower, engine_size

data['horsepower'], data['engine_size'] = process_engine_column(data['engine'])
data = data.drop(columns=['engine'])  # Drop the original 'engine' column

In [None]:
X = data.drop(columns=['price'])  # Replace 'price' with your target variable
y = data['price']

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [None]:
X_train_imputed = X_train.copy()
X_val_imputed = X_val.copy()

In [None]:
num_imputer = SimpleImputer(strategy='mean')
X_train_imputed[numerical_cols] = num_imputer.fit_transform(X_train[numerical_cols])
X_val_imputed[numerical_cols] = num_imputer.transform(X_val[numerical_cols])

In [None]:
# Outlier Detection with Aligned Indices
isolation_forest = IsolationForest(contamination=0.05, random_state=42)
outliers_train = isolation_forest.fit_predict(X_train_imputed[numerical_cols])
non_outlier_indices = np.where(outliers_train == 1)[0]

X_train_no_outliers = X_train_imputed.iloc[non_outlier_indices]
y_train_no_outliers = y_train.iloc[non_outlier_indices]

In [None]:
# Preprocessing pipelines for numerical and categorical features
numerical_preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_cols),
        ('cat', categorical_preprocessor, categorical_cols)
    ]
)

In [None]:
# Create a pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [None]:
# Log-transform the target variable
y_train_transformed = np.log1p(y_train_no_outliers)
y_val_transformed = np.log1p(y_val)

In [None]:
# Subsample the data (50% for faster training)
X_train_sampled = X_train_no_outliers.sample(frac=0.5, random_state=42)
y_train_sampled = y_train_no_outliers.loc[X_train_sampled.index]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=5,
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

randomized_search.fit(X_train_sampled, y_train_sampled)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Expand parameter grid for GridSearchCV
param_grid = {
    'regressor__n_estimators': [100, 200, 300, 500],
    'regressor__max_depth': [10, 20, 30, 50],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 5],
    'regressor__max_features': ['auto', 'sqrt', 'log2'],
}

In [None]:
# Grid search for hyperparameter optimization
grid_search = GridSearchCV(pipeline, param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train_no_outliers, y_train_no_outliers)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


KeyboardInterrupt: 

In [None]:
# Subsample the training data to speed up training
X_train_sampled = X_train_no_outliers.sample(frac=0.3, random_state=42)  # 30% of data
y_train_sampled = y_train_no_outliers.loc[X_train_sampled.index]

In [None]:
# Randomized hyperparameter grid
param_dist = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [5, 10, 20],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2],
    'regressor__max_features': ['sqrt', 'log2']
}

In [None]:
# RandomizedSearchCV for faster hyperparameter tuning
randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,  # Test only 10 combinations
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

In [None]:
# Train the model on the subsampled dataset
randomized_search.fit(X_train_sampled, y_train_sampled)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
# Best model
best_model = randomized_search.best_estimator_

In [None]:
# Check and align lengths of X_val_imputed and y_val
if len(X_val_imputed) != len(y_val):
    print("Mismatch in validation data lengths!")
    print("Length of X_val_imputed:", len(X_val_imputed))
    print("Length of y_val:", len(y_val))

    # Align X_val_imputed to y_val
    X_val_imputed = X_val_imputed.iloc[:len(y_val)]
    print("Lengths after alignment:")
    print("Length of X_val_imputed:", len(X_val_imputed))
    print("Length of y_val:", len(y_val))

Mismatch in validation data lengths!
Length of X_val_imputed: 19840
Length of y_val: 19839
Lengths after alignment:
Length of X_val_imputed: 19839
Length of y_val: 19839


In [None]:
# Predict on validation set
y_pred = best_model.predict(X_val_imputed)

In [None]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse_percent = (rmse / np.mean(y_val)) * 100

In [None]:
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"RMSE Percentage: {rmse_percent:.2f}%")

Root Mean Squared Error (RMSE): 76170.26642058314
RMSE Percentage: 176.16%


In [None]:
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import numpy as np

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
pip install dask[dataframe]

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16


In [None]:
# Log-transform target variable to stabilize variance
y_train_no_outliers = np.log1p(y_train_no_outliers)
y_val_transformed = np.log1p(y_val)

In [None]:
# Update pipeline to use LightGBM
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor(random_state=42))
])

In [None]:
# Hyperparameter tuning with RandomizedSearchCV
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [10, 20, 30],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__num_leaves': [31, 50, 70],
}

In [None]:
randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [None]:
randomized_search.fit(X_train_no_outliers, y_train_no_outliers)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009805 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3136
[LightGBM] [Info] Number of data points in the train set: 75388, number of used features: 1147
[LightGBM] [Info] Start training from score 10.326790


In [None]:
best_model = randomized_search.best_estimator_
y_pred_transformed = best_model.predict(X_val_imputed)

In [None]:
y_pred = np.expm1(y_pred_transformed)

In [None]:
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse_percent = (rmse / np.mean(y_val)) * 100

In [None]:
print(f"Best Parameters: {randomized_search.best_params_}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"RMSE Percentage: {rmse_percent:.2f}%")

Best Parameters: {'regressor__num_leaves': 50, 'regressor__n_estimators': 200, 'regressor__max_depth': 10, 'regressor__learning_rate': 0.1}
Root Mean Squared Error (RMSE): 75263.83451541924
RMSE Percentage: 174.07%


In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
pipeline.fit(X_train_no_outliers, y_train_no_outliers)
y_pred = pipeline.predict(X_val_imputed)

In [None]:
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse_percent = (rmse / np.mean(y_val)) * 100
print(f"RMSE Percentage: {rmse_percent}")

RMSE Percentage: 207.6043927531001


In [1]:
!git config --global user.name "Sumit-21107003"
!git config --global user.email "asamanta99678@gmail.com"

In [2]:
!git clone https://github.com/Sumit-21107003/used_car_price_pred.git

Cloning into 'used_car_price_pred'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [3]:
!cp /content/used_car_price_176.ipynb /content/used_car_price_pred/

cp: cannot stat '/content/used_car_price_176.ipynb': No such file or directory
