<a href="https://www.kaggle.com/code/pratishthachaturvedi/used-car-price-prediction-part2?scriptVersionId=197322547" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb

# Loading and checking data

In [2]:
df_train=pd.read_csv("/kaggle/input/playground-series-s4e9/train.csv", index_col='id')

In [3]:
df_test=pd.read_csv("/kaggle/input/playground-series-s4e9/test.csv", index_col='id')

In [None]:
df_train.columns

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
df_train.info()

In [None]:
pd.set_option('display.float_format', '{:.2f}'.format)         ##using this to change scientic format to general
df_train.describe()

## Null percentage

In [81]:
pd.set_option('display.float_format', '{:.6f}'.format)
df_train.isnull().sum() + (df_train == '').sum()

brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [None]:
pd.set_option('display.float_format', '{:.6f}'.format)
((df_train.isnull().sum() + (df_train == '').sum()) / len(df_train) * 100).round(10)

In [None]:
df_train.duplicated().sum()

## Unique values count

In [None]:
df_train.select_dtypes(include='object').nunique().sort_values()

# Data Exploration

## Correlation

In [None]:
!pip install dython

In [None]:
from dython.nominal import associations
associations_df = associations(df_train, nominal_columns='all', plot=False)
corr_matrix = associations_df['corr']
plt.figure(figsize=(20, 8))
plt.gcf().set_facecolor('#FFFDD0') 
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix including Categorical Features')
plt.show()

In [None]:
# Group by 'brand' and calculate the mean price, then sort by price
sorted_data = df_train.groupby('brand')['price'].mean().reset_index().sort_values(by='price')

# Plot the sorted data
plt.figure(figsize=(12, 6))
sns.barplot(x='brand', y='price', data=sorted_data, errorbar=None)
plt.title('Average Price by Car Brand')
plt.xlabel('Brand')
plt.ylabel('Average Price')
plt.xticks(rotation=90)                                          # Rotate brand names for better readability
plt.show()

In [None]:
# Group by 'brand' and calculate the mean price, then sort by price
sorted_data = df_train.groupby('model')['price'].mean().reset_index().sort_values(by='price')

# Plot the sorted data
plt.figure(figsize=(12, 6))
sns.barplot(x='model', y='price', data=sorted_data, errorbar=None)
plt.title('Average Price by Car Model')
plt.xlabel('Model')
plt.ylabel('Average Price')
plt.xticks(rotation=90)                                          # Rotate brand names for better readability
plt.show()

## Price col nature

In [None]:
df_train['price'].drop_duplicates().nlargest(10)

In [None]:
plt.figure(figsize=(10, 6))
plt.boxplot(df_train['price'])
plt.title('Boxplot of Car Prices')
plt.ylabel('Price')
plt.show()

## Other graphs

In [None]:
sorted_data = df_train.groupby('transmission')['price'].mean().reset_index().sort_values(by='price')

# Plot the sorted data
plt.figure(figsize=(12, 6))
sns.barplot(x='transmission', y='price', data=sorted_data, errorbar=None)
plt.title('Average Price by Car transmission')
plt.xlabel('transmission')
plt.ylabel('Average Price')
plt.xticks(rotation=90)                                          
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(x='accident', y='price', data=df_train, errorbar=None)
plt.title('Average Price by Accident History')
plt.xlabel('Accident History')
plt.ylabel('Average Price')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count frequency of each value in int_col
frequency = df_train['int_col'].value_counts().reset_index()
frequency.columns = ['int_col', 'count']

# Sort values in descending order
frequency_sorted = frequency.sort_values(by='count', ascending=False)

# Select the top 20 values
top_20_frequency = frequency_sorted.head(20)

# Plot
plt.figure(figsize=(12, 8))
sns.barplot(x='int_col', y='count', data=top_20_frequency, errorbar=None)
plt.title('Top 20 Most Frequent Accident Histories')
plt.xlabel('Accident History')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
avg_price = df_train.groupby('int_col')['price'].mean().reset_index()

# Sort values in descending order
avg_price_sorted = avg_price.sort_values(by='price', ascending=False)

# Select the top 20 values
top_30_avg_price = avg_price_sorted.head(30)

# Plot
plt.figure(figsize=(12, 8))
sns.barplot(x='int_col', y='price', data=top_30_avg_price, errorbar=None)
plt.title('Top 20 Average Prices by Accident History')
plt.xlabel('Accident History')
plt.ylabel('Average Price')
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='fuel_type', data=df_train)
plt.title('Count of Cars by Fuel Type')
plt.xlabel('Fuel Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='transmission', data=df_train)
plt.title('Count of Cars by Transmission Type')
plt.xlabel('Transmission')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

# Data Preprocessing

## Replacing â€“ with blank

#### â€“ is coming as - in python

In [None]:
df_train['fuel_type'].unique()

In [4]:
df_train = df_train.replace('–', '', regex=True)

## clean_title column variance

In [None]:
df_train['clean_title'] = df_train['clean_title'].map({'Yes': 1, 'No': 0}).fillna(0)
df_train['clean_title'].var()      ##as it has 0 variance, we can drop this

##  Transmission Column Reclassification

In [None]:
# Get unique values in the 'transmission' column, sort them, and convert to DataFrame

df_train['transmission'].value_counts()

In [5]:
# Create a dictionary with the mappings, using regular expressions
transmission_map = {
    r'A/T': 'Automatic',
    r'M/T': 'Manual',
    r'At/Mt': 'Automatic',
    r' Mt': ' Manual',
    r' AT': ' Automatic'
}

# Use the replace() function with regex=True to replace values in the 'transmission' column
df_train['transmission'] = df_train['transmission'].replace(transmission_map, regex=True)

In [None]:
df_train['transmission'].nunique()

In [6]:
# List of values to be replaced by 'Other'
values_to_replace = ['2', '–', 'Variable', 'F']

# Replace these values with 'Other' in the 'transmission' column
df_train['transmission'] = df_train['transmission'].replace(values_to_replace, 'Other')
df_train['transmission'] = df_train['transmission'].replace('', 'Unknown')

## Filling Fuel_type blanks using Engine

In [8]:
df_train = df_train.replace('not supported', '', regex=True)
df_train['fuel_type'].replace('', np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['fuel_type'].replace('', np.nan, inplace=True)


In [None]:
df_train.isnull().sum() + (df_train == '').sum()

In [9]:
def fill_fuel_type(row):
    if pd.isnull(row['fuel_type']):  # Only process rows where fuel_type is missing
        engine_description = row['engine'].lower() if isinstance(row['engine'], str) else ''
        
        if "cylinder engine flex fuel capability" in engine_description:
            return "E85 Flex Fuel"
        elif "hybrid" in engine_description:
            return "Hybrid"
        elif "cylinder engine gasoline fuel" in engine_description:
            return "Gasoline"
        elif "electric motor electric fuel system" in engine_description:
            return "Gasoline"
        elif "diesel" in engine_description:
            return "Diesel"
        elif "cylinder engine plug-in electric/gas" in engine_description:
            return "Plug-In Hybrid"
        else:
            return "Unknown"
    return row['fuel_type']  # Return original value if no change needed

# Apply the function to fill missing fuel_type values
df_train['fuel_type'] = df_train.apply(fill_fuel_type, axis=1)

## Calculating Car Age using model_year

In [10]:
from datetime import datetime
current_year = datetime.now().year
df_train['car_age'] = current_year - df_train['model_year']

## Accident col Reclassification

In [71]:
df_train['accident'].unique()

array(['No', 'Yes', 'Unknown'], dtype=object)

In [11]:
# Replace blank strings and nulls with 'Unknown' in the 'accident' column
df_train['accident'].replace('', 'Unknown', inplace=True)
df_train['accident'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['accident'].replace('', 'Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['accident'].fillna('Unknown', inplace=True)


In [12]:
accident_mapping = {
    'None reported': 'No',
    'At least 1 accident or damage reported': 'Yes'
}

# Apply the mapping to the 'accident' column
df_train['accident'] = df_train['accident'].replace(accident_mapping)

## Color

In [13]:
df_train[['ext_col', 'int_col']] = df_train[['ext_col', 'int_col']].apply(lambda col: col.str.title())

In [14]:
df_train['ext_col'].replace({'': 'Unknown', None: 'Unknown'}, inplace=True)
df_train['int_col'].replace({'': 'Unknown', None: 'Unknown'}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['ext_col'].replace({'': 'Unknown', None: 'Unknown'}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['int_col'].replace({'': 'Unknown', None: 'Unknown'}, inplace=True)


## Dropping Cols

In [15]:
df_train = df_train.drop(['model_year', 'engine', 'clean_title', 'model'], axis=1)  
df_train.head(5)

Unnamed: 0_level_0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,price,car_age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,MINI,213000,Gasoline,Automatic,Yellow,Gray,No,4200,17
1,Lincoln,143250,Gasoline,Automatic,Silver,Beige,Yes,4999,22
2,Chevrolet,136731,E85 Flex Fuel,Automatic,Blue,Gray,No,13900,22
3,Genesis,19500,Gasoline,Transmission w/Dual Shift Mode,Black,Black,No,45000,7
4,Mercedes-Benz,7388,Gasoline,7-Speed Automatic,Black,Beige,No,97500,3


# Applying same changes as train to test data

In [118]:
df_test.head(2)

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes


In [16]:
df_test = df_test.replace('–', '', regex=True)

df_test['transmission'] = df_test['transmission'].replace(transmission_map, regex=True)
df_test['transmission'] = df_test['transmission'].replace(values_to_replace, 'Other')
df_test['transmission'] = df_test['transmission'].replace('', 'Unknown')

df_test = df_test.replace('not supported', '', regex=True)
df_test['fuel_type'].replace('', np.nan, inplace=True)
df_test['fuel_type'] = df_test.apply(fill_fuel_type, axis=1)

df_test['accident'].replace('', 'Unknown', inplace=True)
df_test['accident'].fillna('Unknown', inplace=True)
df_test['accident'] = df_test['accident'].replace(accident_mapping)

df_test['car_age'] = current_year - df_test['model_year']

df_test[['ext_col', 'int_col']] = df_test[['ext_col', 'int_col']].apply(lambda col: col.str.title())

df_test['ext_col'].replace({'': 'Unknown', None: 'Unknown'}, inplace=True)
df_test['int_col'].replace({'': 'Unknown', None: 'Unknown'}, inplace=True)

df_test = df_test.drop(['model_year', 'engine', 'clean_title', 'model'], axis=1)  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['fuel_type'].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['accident'].replace('', 'Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [17]:
pd.set_option('display.float_format', '{:.6f}'.format)
df_test.isnull().sum() + (df_test == '').sum()

brand           0
milage          0
fuel_type       0
transmission    0
ext_col         0
int_col         0
accident        0
car_age         0
dtype: int64

# Modeling

## Train test split

In [None]:
df_train.columns

In [18]:
X = df_train[['brand', 'milage', 'fuel_type', 'transmission','accident', 'car_age', 
              'ext_col', 'int_col']]
y = df_train['price']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## CatBoostRegressor

In [20]:
# Identify the categorical columns (these are object or categorical type columns)
categorical_cols = ['brand', 'fuel_type', 'transmission', 'ext_col', 'int_col', 
                    'accident']

# Initialize CatBoostRegressor with some hyperparameters
model = CatBoostRegressor(
    iterations=600,               # Number of boosting iterations
    learning_rate=0.1,            # Learning rate
    depth=6,                      # Depth of each tree
    eval_metric='RMSE',           # Evaluation metric
    random_seed=42,               # For reproducibility
    cat_features=categorical_cols # Specify which columns are categorical
)

# Train the model
model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=100)

# Evaluate the model
y_pred = model.predict(X_test)

# Optionally, print model performance (e.g., RMSE)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Test RMSE: {rmse}")

0:	learn: 79024.2569903	test: 73690.9932325	best: 73690.9932325 (0)	total: 273ms	remaining: 2m 43s
100:	learn: 73834.9701673	test: 68581.9240376	best: 68581.4352783 (99)	total: 10.5s	remaining: 51.7s
200:	learn: 73227.2223340	test: 68548.0295513	best: 68535.6086160 (163)	total: 20s	remaining: 39.6s
300:	learn: 72786.5572909	test: 68535.0514116	best: 68535.0514116 (300)	total: 30.3s	remaining: 30.1s
400:	learn: 72317.5786445	test: 68518.2422354	best: 68515.0850524 (397)	total: 41.6s	remaining: 20.6s
500:	learn: 71833.9769973	test: 68505.0277488	best: 68495.0259855 (430)	total: 52.8s	remaining: 10.4s
599:	learn: 71463.4052026	test: 68530.0513801	best: 68495.0259855 (430)	total: 1m 3s	remaining: 0us

bestTest = 68495.02599
bestIteration = 430

Shrink model to first 431 iterations.
Test RMSE: 68495.02598553944


In [125]:
df_train['price'].mean()

43878.01617753921

# Hyperparameter tuning

## Method 1 - Random Search CV

#### Don't run this, it will take too much time

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [22]:
param_dist = {
    'iterations': [400, 600, 800, 1000],
    'learning_rate': [0.01, 0.1, 0.3],
    'depth': [4, 6, 8, 10]
}

In [23]:
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings to sample
    scoring='neg_root_mean_squared_error',
    cv=3,  # Number of folds in cross-validation
    verbose=100,
    n_jobs=-1,  # Use all available CPUs
    random_state=42
)

In [None]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 2/3; 1/10] START depth=8, iterations=600, learning_rate=0.01................
0:	learn: 78622.9121249	total: 447ms	remaining: 4m 27s
1:	learn: 78536.1061687	total: 884ms	remaining: 4m 24s
2:	learn: 78451.2147036	total: 1.19s	remaining: 3m 56s
3:	learn: 78368.7704048	total: 1.58s	remaining: 3m 55s
4:	learn: 78287.8621169	total: 1.96s	remaining: 3m 52s
5:	learn: 78208.8797967	total: 2.32s	remaining: 3m 49s
6:	learn: 78128.5423933	total: 2.76s	remaining: 3m 54s
7:	learn: 78050.7413849	total: 3.14s	remaining: 3m 52s
8:	learn: 77976.7221378	total: 3.58s	remaining: 3m 55s
9:	learn: 77902.5840128	total: 4.04s	remaining: 3m 58s
10:	learn: 77824.8443664	total: 4.43s	remaining: 3m 57s
11:	learn: 77755.6906001	total: 4.94s	remaining: 4m 2s
12:	learn: 77681.2990187	total: 5.45s	remaining: 4m 6s
13:	learn: 77609.9094511	total: 5.9s	remaining: 4m 7s
14:	learn: 77540.8059862	total: 6.27s	remaining: 4m 4s
15:	learn: 77475.8433170	total: 6

In [None]:
best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

## LightGBM

In [None]:
# List of categorical columns
categorical_cols = ['brand', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident']

# Convert categorical columns to 'category' dtype
for col in categorical_cols:
    if col in X.columns:
        X[col] = X[col].astype('category')


# Initialize LightGBM model
model = lgb.LGBMRegressor(
    n_estimators=500,            # Number of boosting iterations
    learning_rate=0.1,           # Learning rate
    max_depth=6,                 # Maximum depth of a tree
    random_state=42              # For reproducibility
)

# Train the model
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    categorical_feature=categorical_cols  # Pass the names of categorical columns directly
)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Test RMSE: {rmse}")

## XGBRegressor

In [None]:
# Convert categorical columns to 'category' dtype for XGBoost
categorical_cols = ['brand', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident']
for col in categorical_cols:
    X[col] = X[col].astype('category')

# Define the XGBRegressor model
model = xgb.XGBRegressor(
    objective='reg:squarederror',  # Objective for regression
    max_depth=6,                   # Maximum depth of a tree
    learning_rate=0.1,             # Learning rate
    n_estimators=500,              # Number of boosting rounds
    random_state=42,               # Random seed
    tree_method='hist',            # Faster histogram-based optimization
    enable_categorical=True        # Enable native handling of categorical data
)

# Train the model
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],   # Validation set for evaluation
    early_stopping_rounds=10,      # Stop early if no improvement
    eval_metric='rmse',            # Metric to evaluate
    verbose=True                   # Verbose output
)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Test RMSE: {rmse}")