In [36]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [45]:
# Read the training and test CSV files from the working directory
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [46]:
# Preview the first five rows of the training data
train_df.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [47]:
# Preview the first five rows of the test data
test_df.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,54273,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,None reported,Yes
1,54274,Lexus,RX 350 Base,2015,128032,Gasoline,275.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Silver,Black,None reported,Yes
2,54275,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,241.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Blue,White,None reported,Yes
3,54276,Land,Rover Range Rover 5.0L Supercharged Autobiogra...,2018,29500,Gasoline,518.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,White,At least 1 accident or damage reported,Yes
4,54277,BMW,X6 xDrive40i,2020,90000,Gasoline,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes


In [48]:
# Summarise dtypes and non-null counts for both frames.
# DataFrame.info() prints its report and returns None, so each call goes on its
# own line; wrapping them in a tuple makes the cell display a stray (None, None).
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            54273 non-null  int64 
 1   brand         54273 non-null  object
 2   model         54273 non-null  object
 3   model_year    54273 non-null  int64 
 4   milage        54273 non-null  int64 
 5   fuel_type     54273 non-null  object
 6   engine        54273 non-null  object
 7   transmission  54273 non-null  object
 8   ext_col       54273 non-null  object
 9   int_col       54273 non-null  object
 10  accident      54273 non-null  object
 11  clean_title   54273 non-null  object
 12  price         54273 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 5.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36183 entries, 0 to 36182
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id           

(None, None)

In [18]:
# Extract numeric features from the engine column
def extract_engine_features(df):
    """Derive numeric engine features and drop the raw ``engine`` text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``engine``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``engine`` and with two float columns added:
        ``engine_hp`` (the number immediately before ``HP``) and
        ``engine_cylinders`` (the integer immediately before ``Cylinder`` /
        ``cylinder``). Rows where a pattern is not found get NaN.

    Notes
    -----
    The input frame is left unmodified, so the function can be called on the
    same raw frame repeatedly and yield the same result.
    """
    engine_text = df['engine']

    # The fractional part is optional so that "300HP" is parsed as well as "300.0HP".
    # expand=False returns a Series rather than a one-column DataFrame.
    horsepower = engine_text.str.extract(r'(\d+(?:\.\d*)?)HP', expand=False).astype('float')
    cylinders = engine_text.str.extract(r'(\d+)\s?[Cc]ylinder', expand=False).astype('float')

    # drop() and assign() both return new frames, so the caller's df is untouched.
    return df.drop(columns=['engine']).assign(
        engine_hp=horsepower,
        engine_cylinders=cylinders,
    )

# Swap the raw engine text for the derived numeric features in both frames
train_df = train_df.pipe(extract_engine_features)
test_df = test_df.pipe(extract_engine_features)

In [49]:
# Target is the sale price; features are every remaining column except the row id
y = train_df['price']
X = train_df.drop(columns=['id', 'price'])

In [50]:
# Partition the feature columns by dtype for the preprocessing step
num_cols = list(X.select_dtypes(include='number').columns)
cat_cols = list(X.select_dtypes(include='object').columns)

In [51]:
# Standardise the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' keeps transform from raising on categories that were
# not present when the encoder was fitted.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

In [52]:
# Hold out 20% of the rows for validation; the seed is fixed for reproducibility
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [53]:
# Chain preprocessing and a random-forest regressor so both are fitted together
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Fit on the training split; as the last expression this also displays the pipeline
model.fit(X_train, y_train)

In [24]:
# Fill NaN values with the median of their respective columns
#train_df['engine_hp'].fillna(train_df['engine_hp'].median(), inplace=True)
#train_df['engine_cylinders'].fillna(train_df['engine_cylinders'].median(), inplace=True)

In [25]:
#X = train_df.drop(columns=['id', 'price'])
#y = train_df['price']

In [26]:
# Splitting the data into training and validation sets
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
#model.fit(X_train, y_train)

In [54]:
# Score the fitted pipeline on the held-out validation split
y_val_pred = model.predict(X_val)
rmse = root_mean_squared_error(y_true=y_val, y_pred=y_val_pred)
rmse

55548.81394282367

In [55]:
# Preprocess test data
#test_df['engine_hp'].fillna(test_df['engine_hp'].median(), inplace=True)
#test_df['engine_cylinders'].fillna(test_df['engine_cylinders'].median(), inplace=True)

# The row id is not a model feature, so drop it before predicting
X_test = test_df.drop(columns='id')

# Predict prices for the test rows with the fitted pipeline
y_test_pred = model.predict(X_test)

# Pair each test id with its predicted price (assign returns a new frame)
submission_df2 = test_df[['id']].assign(price=y_test_pred)

# Write the submission file without the DataFrame index
submission_df2.to_csv('car_price_predictions2.csv', index=False)

In [56]:
# Sanity-check the submission: one row per test record, columns id and price.
# The frame built in this notebook is submission_df2; the previous name
# submission_df is never defined here and raises NameError on a fresh kernel.
submission_df2.shape

(36183, 2)