In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
df = pd.read_csv('../data/ready_to_transform_car_data.csv')

In [3]:
df.head()

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,car_age,horsepower,engine_size,cylinder_count
0,Ford,51000,E85 Flex Fuel,Automatic,Black,Black,Accident reported,Yes,10300,12,300.0,3.7,6.0
1,Hyundai,34742,Gasoline,Automatic,Other,Grey,Accident reported,Yes,38005,4,,3.8,
2,Lexus,22372,Gasoline,Automatic,Blue,Black,None reported,Unknown,54598,3,,,
3,INFINITI,88900,Hybrid,Automatic,Black,Black,None reported,Yes,15500,10,354.0,3.5,6.0
4,Audi,9835,Gasoline,Automatic,White,Black,None reported,Unknown,34999,4,,2.0,


In [4]:
X = df.drop('price',axis=1)
y = df['price']

## Train | Test Split

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [104]:
# residuals = y_test - y_pred
# plt.scatter(y_pred, residuals, alpha=0.5)
# plt.axhline(0, color='red', linestyle='--')
# plt.xlabel("Predicted Values")
# plt.ylabel("Residuals")
# plt.title("Residuals vs. Predicted Values")
# plt.show()

In [106]:
# plt.hist(residuals, bins=30, edgecolor='k')
# plt.title("Histogram of Residuals")
# plt.show()

In [105]:
baseline_pred = np.full_like(y_test, y_test.mean())  # Predict the mean
baseline_rmse = mean_squared_error(y_test, baseline_pred)**0.5

print(f"Baseline RMSE: {baseline_rmse:.4f}")

Baseline RMSE: 47357.5266


## Linear regression

In [86]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = LinearRegression()

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.4351247342421606
MAE 18069.869473647304
RMSE 35593.057071832074


## Ridge regression

In [87]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = Ridge(alpha=10)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.4351687956053596
MAE 18034.447582117806
RMSE 35591.668881116835


## lasso regression

In [109]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = Lasso(alpha=0.001)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

# pipe.fit(X_train,y_train)

param = {'step3__alpha':[0.0001,0.001,0.01,0.1,1,10,100,1000]}

grid_model = GridSearchCV(pipe,param_grid=param,scoring='neg_mean_squared_error',cv=5)

grid_model.fit(X_train,y_train)

y_pred = grid_model.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.4354770801877782
MAE 17845.1031966173
RMSE 35581.95459808906


## knn 

In [111]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = KNeighborsRegressor(n_neighbors=3)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

# pipe.fit(X_train,y_train)

param = {'step3__n_neighbors':[1,2,3,4,5,7,10,15,20,25]}

grid_model = GridSearchCV(pipe,param_grid=param,scoring='neg_mean_squared_error',cv=5)

grid_model.fit(X_train,y_train)

y_pred = grid_model.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.40921654098960347
MAE 17590.82040306759
RMSE 36400.15016296732


## Decision tree

In [112]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = DecisionTreeRegressor(max_depth=8)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

param = {'step3__max_depth': [5, 10, 20],
        'step3__max_leaf_nodes': [10, 20, 50]}

grid_model = GridSearchCV(pipe,param_grid=param,scoring='neg_mean_squared_error',cv=5)

grid_model.fit(X_train,y_train)

y_pred = grid_model.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.6205769621343817
MAE 11735.703764141657
RMSE 29170.969292400085


## SVM

In [113]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = SVR(kernel='rbf',C=10000,epsilon=0.1)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

# pipe.fit(X_train,y_train)

param = {'step3__C': [1, 10, 100, 1000],
        'step3__epsilon': [0.1, 0.2, 0.5]}

grid_model = GridSearchCV(pipe,param_grid=param,scoring='neg_mean_squared_error',cv=5)

grid_model.fit(X_train,y_train)

y_pred = grid_model.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.3248001281085853
MAE 14147.19318875562
RMSE 38913.93841431248


## random forest

In [116]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = RandomForestRegressor(n_estimators=100,
                              random_state=3,
                              max_samples=0.5,
                              max_features=0.75,
                              max_depth=15)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

# param = {'step3__n_estimators': [50, 100, 200],
#         'step3__max_depth': [10, 20, 30],
#         'step3__min_samples_split': [2, 5, 10],
#         'step3__min_samples_leaf': [1, 2, 5]}

# grid_model = GridSearchCV(pipe,param_grid=param,scoring='neg_mean_squared_error',cv=5)

# grid_model.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.7433221874512855
MAE 10061.054439299693
RMSE 23992.9238919611


## extra tree

In [94]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = ExtraTreesRegressor(n_estimators=100,
                              random_state=3,
                              max_samples=0.5,
                              max_features=0.75,
                              max_depth=15,
                              bootstrap=True)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.6930281052995197
MAE 10239.612063729723
RMSE 26238.458633754926


## adaboost

In [97]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = AdaBoostRegressor(n_estimators=15,learning_rate=1.0)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.5250585684724851
MAE 19531.151499637348
RMSE 32636.91046865923


## gradient boost

In [98]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = GradientBoostingRegressor(n_estimators=500)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.7745331446801628
MAE 9769.633412182504
RMSE 22486.940223989164


## xg boost

In [132]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

step2 = StandardScaler()

step3 = XGBRegressor(n_estimators=45,max_depth=5,learning_rate=0.5)
# step3 = XGBRegressor(n_estimators=45,max_depth=5,learning_rate=0.5)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.8024368384412627
MAE 9566.210063348548
RMSE 21049.510231627923


## voting regressor

In [134]:
from sklearn.ensemble import VotingRegressor,StackingRegressor

step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')

rf = RandomForestRegressor(n_estimators=350,random_state=3,max_samples=0.5,max_features=0.75,max_depth=15)
gbdt = GradientBoostingRegressor(n_estimators=100,max_features=0.5)
xgb = XGBRegressor(n_estimators=25,learning_rate=0.3,max_depth=5)
et = ExtraTreesRegressor(n_estimators=100,random_state=3,max_samples=0.5,max_features=0.75,max_depth=10,bootstrap=True)

step2 = StandardScaler()

step3 = VotingRegressor([('rf', rf), ('gbdt', gbdt), ('xgb',xgb), ('et',et)],weights=[5,1,1,1])

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.7480440912335946
MAE 9986.406062452412
RMSE 23771.209823651978


## stacking

In [103]:
step1 = ColumnTransformer([
    ('imputer', SimpleImputer(), ['horsepower','engine_size','cylinder_count']),
    ('encoder', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand','fuel_type','transmission','ext_col','int_col','accident','clean_title'])
],remainder='passthrough')


estimators = [
    ('rf', RandomForestRegressor(n_estimators=350,random_state=3,max_samples=0.5,max_features=0.75,max_depth=15)),
    ('gbdt',GradientBoostingRegressor(n_estimators=100,max_features=0.5)),
    ('xgb', XGBRegressor(n_estimators=25,learning_rate=0.3,max_depth=5))
]

step2 = StandardScaler()

step3 = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=100))

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2),
    ('step3',step3)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
print('RMSE',mean_squared_error(y_test,y_pred)**0.5)

R2 score 0.771067572931609
MAE 9898.322545103862
RMSE 22659.100617915916
