# US Cars Dataset Price Prediction

The US Cars data was scraped from AUCTION EXPORT.com. The dataset includes information about 28 brands of clean and used vehicles for sale in the US. Twelve initial features were assembled for each car in the dataset.

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        file_path = os.path.join(folder, file_name)
        print(file_path)

In [None]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import lightgbm as lgb
# Never truncate rows when a frame or series is displayed.
pd.options.display.max_rows = None
# Default figure size in inches: 16 wide by 8.27 tall (8.27 in is the short edge of an A4 sheet).
plt.rcParams.update({'figure.figsize': (16, 8.27)})
sns.set_style(style='darkgrid')

In [None]:
# Location of the raw CSV inside the Kaggle input directory.
DATASET_CSV_PATH = '/kaggle/input/usa-cers-dataset/USA_cars_datasets.csv'
dataset = pd.read_csv(DATASET_CSV_PATH)

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
dataset.info()

In [None]:
# Preview the first 10 rows of the raw data.
dataset.head(10)

### Drop unnecessary columns 

In [None]:
# Discard the columns listed below; they are not used in the rest of the notebook.
columns_to_discard = ['Unnamed: 0', 'vin', 'lot']
dataset = dataset.drop(columns=columns_to_discard)

### Create and transform features 

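We are going to extract the number of minutes left from the `condition` column. Rows whose condition is not expressed in days, hours or minutes receive the placeholder value -200.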

In [None]:
# Split each condition string on single spaces once, then keep the first two
# tokens: the leading token goes to 'value', the second one to 'days'.
condition_tokens = dataset['condition'].str.split(' ')
dataset['value'] = condition_tokens.str[0]
dataset['days'] = condition_tokens.str[1]

def days_to_min_converter(time):
    """Return the number of minutes in `time` days.

    `time` is coerced with int(), so integer-like strings such as '3' are accepted.
    """
    whole_days = int(time)
    minutes_per_day = 24 * 60
    return whole_days * minutes_per_day

def hours_to_min_converter(time):
    """Return the number of minutes in `time` hours.

    `time` is coerced with int(), so integer-like strings such as '6' are accepted.
    """
    whole_hours = int(time)
    minutes_per_hour = 60
    return whole_hours * minutes_per_hour


# Convert every parsed "<value> <unit>" pair to minutes, one unit at a time.
# Rows whose unit token is not 'days', 'hours' or 'minutes' are absent from
# this Series, so they become NaN once it is aligned with `dataset` below.
time_unit = dataset['days']
temp_data = pd.concat([
    dataset.loc[time_unit == 'days', 'value'].apply(days_to_min_converter),
    dataset.loc[time_unit == 'hours', 'value'].apply(hours_to_min_converter),
    dataset.loc[time_unit == 'minutes', 'value'].astype(int),
]).rename('Minutes_Left')  # no inplace=True: the renamed Series must be the returned value

# Align on the row index and append the new column.
dataset = pd.concat([dataset, temp_data], axis=1)

# -200 is the placeholder for rows without a days/hours/minutes value.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which does not update `dataset` under pandas Copy-on-Write.
dataset['Minutes_Left'] = dataset['Minutes_Left'].fillna(-200)

# The raw text column and the two parsing helpers are no longer needed.
dataset = dataset.drop(columns=['condition', 'value', 'days'])

Convert the `year` column so that it represents which year of the car's registration is in progress (e.g. first, second, third, ...), counted relative to 2021.

In [None]:
def year_transform(year, reference_year=2021):
    """Return how many years lie between `year` and `reference_year`.

    Parameters
    ----------
    year : int
        Registration year of the car.
    reference_year : int, default 2021
        Year the age is measured against. The default reproduces the value
        that was previously hard-coded.

    Returns
    -------
    int
        `reference_year - year`.
    """
    return reference_year - year

# Replace the registration year with the value returned by year_transform, element by element.
# NOTE: this overwrites 'year' in place, so running the cell twice transforms it twice.
dataset['year'] = dataset['year'].map(year_transform)

Create a new feature that represents the mileage per year for each car.

In [None]:
# Average mileage per (transformed) year.
# NOTE: a row whose transformed 'year' is 0 yields inf (or NaN when mileage is also 0).
dataset['miles/year'] = dataset['mileage'].div(dataset['year'])

Identify the numerical and the categorical variables.

In [None]:
# Partition the columns by dtype: object-dtype columns are treated as
# categorical, every other column as numerical.
categorical_features = []
numerical_features = []
for column_name in dataset.columns:
    if dataset[column_name].dtype == 'O':
        categorical_features.append(column_name)
    else:
        numerical_features.append(column_name)

# EDA

Create some visualizations and print some information about the feature distributions.

In [None]:
# One frequency table per categorical feature: the category values in a column
# named after the feature, and their occurrence counts in 'counts'.
dataframes = [
    dataset[feature].value_counts().rename_axis(feature).reset_index(name='counts')
    for feature in categorical_features
]

# Print each table followed by a blank line.
for frequency_table in dataframes:
    print(frequency_table, '\n')

In [None]:
# Plot the distribution of every numerical feature, one figure per feature.
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement for the same kind of plot.
for feature in numerical_features:
    sns.histplot(dataset[feature], kde=True, stat='density')
    plt.show()

### SPLITTING THE DATASET INTO TRAIN AND TEST SETS AND APPLYING SOME FEATURE ENGINEERING TECHNIQUES

In [None]:
# Separate the target column from the predictors.
y = dataset['price']
X = dataset.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Re-attach the target to each split so that later steps can work on
# features and price together.
train_set = pd.concat([X_train, y_train], axis='columns')
test_set = pd.concat([X_test, y_test], axis='columns')

### APPLYING TARGET-MEAN-ORDERED ORDINAL ENCODING TO THE CATEGORICAL FEATURES AND SCALING THE TRAIN AND TEST SETS

In [None]:
# Encode each categorical column as an integer rank: categories are ordered by
# their mean training-set price (0 = lowest mean). The mapping is learned from
# the training rows only and then reused for the test rows.
for feature in categorical_features:
    mean_price_by_category = train_set.groupby(feature)['price'].mean()
    ordered_categories = mean_price_by_category.sort_values().index
    category_rank = dict(zip(ordered_categories, range(len(ordered_categories))))
    train_set[feature] = train_set[feature].map(category_rank)
    test_set[feature] = test_set[feature].map(category_rank)

# Test rows holding a category that never occurs in the training set were
# mapped to NaN above; discard those rows.
test_set = test_set.dropna()

scaler = StandardScaler()

# Fit the scaler on the training features only, then apply the same
# transformation to the test features. The original row index is kept so the
# scaled features can be re-joined with the price column.
train_features = train_set.drop(columns='price')
test_features = test_set.drop(columns='price')
scaled_X_train = pd.DataFrame(scaler.fit_transform(train_features),
                              columns=X_train.columns, index=train_set.index)
scaled_X_test = pd.DataFrame(scaler.transform(test_features),
                             columns=X_test.columns, index=test_set.index)

scaled_train = pd.concat([scaled_X_train, train_set['price']], axis=1)
scaled_test = pd.concat([scaled_X_test, test_set['price']], axis=1)

# FEATURE SELECTION

### Now we are going to check feature importances using Random Forest 

In [None]:
# Fit a random forest on the scaled training data to rank the features.
# random_state is fixed so the ranking (and the plot) is the same on every run.
importance_inputs = scaled_train.drop('price', axis=1)
reg = RandomForestRegressor(random_state=0)
reg.fit(importance_inputs, scaled_train['price'])

# Importance of every feature, plotted as horizontal bars in descending order of importance.
feat_importances = pd.Series(reg.feature_importances_, index=importance_inputs.columns)
feat_importances.nlargest(len(feat_importances)).plot(kind='barh')
plt.show()

As we can see, `country` is not a useful feature, so we drop it from the train and test sets.

In [None]:
# Remove the 'country' column from both the training and the test frame.
scaled_train = scaled_train.drop(columns='country')
scaled_test = scaled_test.drop(columns='country')

# MODEL SELECTION

In [None]:
# Final feature matrices / targets used by every model below.
X_train = scaled_train.drop('price', axis=1)
y_train = scaled_train['price']

X_test = scaled_test.drop('price', axis=1)
y_test = scaled_test['price']

# Candidate regressors, all with default hyperparameters. The random forest is
# seeded so that its cross-validation score and later fits are reproducible.
lm = LinearRegression()
svr = SVR()
rf = RandomForestRegressor(random_state=0)
xgb_reg = xgb.XGBRegressor()
lgb_reg = lgb.LGBMRegressor()


def neg_mse_cv(model, features, target):
    """Return the 10-fold cross-validated negative MSE scores of `model`."""
    return cross_val_score(model, features, target, cv=10,
                           scoring='neg_mean_squared_error')


score_lm = neg_mse_cv(lm, X_train, y_train)
score_svr = neg_mse_cv(svr, X_train, y_train)
score_rf = neg_mse_cv(rf, X_train, y_train)
score_xgb_reg = neg_mse_cv(xgb_reg, X_train, y_train)
score_lgb_reg = neg_mse_cv(lgb_reg, X_train, y_train)

# scikit-learn reports the *negative* MSE, so flip the sign to get the
# mean squared error averaged over the folds.
scores = pd.DataFrame({'Model': ['Linear Regression', 'SVR', 'Random Forest', 'XGBoost', 'LightGBM'],
                       'Mean Squared Error': [-score_lm.mean(), -score_svr.mean(), -score_rf.mean(),
                                              -score_xgb_reg.mean(), -score_lgb_reg.mean()]})

scores

We are going to evaluate and apply hyperparameter tuning to the Random Forest model only.

#### Evaluating the model's performance on the test set

In [None]:
# Fit the untuned random forest on the full training set and score it on the
# held-out test set.
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# The MSE is computed once and reused for the RMSE.
rf_test_mse = mean_squared_error(y_test, y_pred_rf)
print('MSE: ', rf_test_mse)
print('R2: ', r2_score(y_test, y_pred_rf))
print('MAE: ', mean_absolute_error(y_test, y_pred_rf))
print('RMSE: ', np.sqrt(rf_test_mse))

#### Hyperparameter tuning for Random Forest

In [None]:
# Number of trees in the forest: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split. 1.0 means "all features",
# which is what 'auto' meant for a regressor; 'auto' itself is no longer
# accepted by current scikit-learn releases.
max_features = [1.0, 'sqrt', 'log2']
# Maximum number of levels in a tree (None = unlimited depth).
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4, 8, 16, 32]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Search space sampled by RandomizedSearchCV.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# 10-fold randomized search scored by negative MSE. random_state fixes which
# parameter combinations are sampled so the search can be reproduced.
random_rf = RandomizedSearchCV(rf, cv=10, param_distributions=random_grid,
                               scoring='neg_mean_squared_error', n_jobs=-1,
                               verbose=1, random_state=0)
random_rf.fit(X_train, y_train)


#### Evaluating the best model's performance after hyperparameter tuning

In [None]:
# With the default refit=True, RandomizedSearchCV has already refitted the
# best parameter combination on the whole training set, so best_estimator_
# can be used for prediction directly; fitting it again would only repeat work.
best_rf = random_rf.best_estimator_
y_pred_bestrf = best_rf.predict(X_test)

# The MSE is computed once and reused for the RMSE.
best_rf_test_mse = mean_squared_error(y_test, y_pred_bestrf)
print('MSE: ', best_rf_test_mse)
print('R2: ', r2_score(y_test, y_pred_bestrf))
print('MAE: ', mean_absolute_error(y_test, y_pred_bestrf))
print('RMSE: ', np.sqrt(best_rf_test_mse))

In [None]:
# Distribution of the test-set residuals (actual minus predicted price).
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement for the same kind of plot.
residuals = y_test - y_pred_bestrf
sns.histplot(residuals, kde=True, stat='density')
plt.show()

In [None]:
# Actual vs. predicted prices on the test set. x and y are passed by keyword:
# recent seaborn releases no longer accept them positionally.
ax = sns.scatterplot(x=y_test, y=y_pred_bestrf)
ax.set(xlabel='Actual price', ylabel='Predicted price')
plt.show()

From the scatter plot above we see that the relationship between the actual and the predicted values tends to be linear, which suggests that the model fits the test data reasonably well.