In [441]:
import pandas as pd
import numpy as np
import functions as f 

#import the data
df_train = pd.read_csv('cars_train.csv')

In [442]:
zero_price = df_train.loc[df_train['price'] == 0]
df_train.drop(zero_price.index, inplace = True)

In [443]:
# get the threshold for the outliers for every numerical variable
lower_price, upper_price = f.detect_outliers(df_train['price'])
lower_id, upper_id = f.detect_outliers(df_train['id'])
lower_year, upper_year = f.detect_outliers(df_train['year'])
lower_odometer, upper_odometer = f.detect_outliers(df_train['odometer'])
lower_lat, upper_lat = f.detect_outliers(df_train['lat'])
lower_long, upper_long = f.detect_outliers(df_train['long'])

In [444]:
#gathering all the records that contain outliers to drop them
outliers = df_train.loc[(df_train['price'] < lower_price) | (df_train['price'] > upper_price) |
                        (df_train['id'] < lower_id) | (df_train['id'] > upper_id) |
                        (df_train['year'] < lower_year) | (df_train['year'] > upper_year) |
                        (df_train['odometer'] < lower_odometer) | (df_train['odometer'] > upper_odometer) |
                        (df_train['lat'] < lower_lat) | (df_train['lat'] > upper_lat) |
                        (df_train['long'] < lower_long) | (df_train['long'] > upper_long)]

df_train.drop(outliers.index, inplace=True)

In [445]:
#seperate the price from the features for the train
train_price = df_train['price']
train_features = df_train.drop(['price','id'], axis=1)

In [446]:
# fill the empty rows for the categorical data for the training set with the most frequent value
train_features['paint_color'] = f.fill_cat(train_features['paint_color'])
train_features['type'] = f.fill_cat(train_features['type'])
train_features['drive']=f.fill_cat(train_features['drive'])
train_features['transmission']= f.fill_cat(train_features['transmission'])
train_features['fuel']=f.fill_cat(train_features['fuel'])
train_features['cylinders']=f.fill_cat(train_features['cylinders'])
train_features['condition']= f.fill_cat(train_features['condition'])
train_features['model']=f.fill_cat(train_features['model'])
train_features['manufacturer']=f.fill_cat(train_features['manufacturer'])
train_features['posting_date']=f.fill_cat(train_features['posting_date'])

In [447]:
#create a dict that contains every term and its replacment in the model column to replace the terms
model_dic=f.replace(train_features['model'])
train_features['model'] = train_features['model'].replace(model_dic)

#replace all the values that their count is less than 1000 with other
train_list = f.features_list(train_features['model'], 1000)
train_features['model']=train_features['model'].replace(train_list,'other')

In [448]:
#create a list of all the categories of the column that their count is less than 1000 and group them into one group 'other'
train_list1 = f.features_list(train_features['manufacturer'],1000)
train_features['manufacturer'] = train_features['manufacturer'].replace(train_list1,'other')

In [449]:
#create a list of all the categories of the column that their count is less than 1000 and group them into one group 'other'
train_list2 = f.features_list(train_features['state'],1000)
train_features['state'] = train_features['state'].replace(train_list2,'other')

In [450]:
#put condition into 3 categories
mapping = {'like new':'new', 'excellent':'good','salvage':'fair'}
train_features['condition']= train_features['condition'].replace(mapping)
train_features['condition'].value_counts()

good    267363
new      16399
fair      5771
Name: condition, dtype: int64

In [451]:
#clean the cylinders column from the word cylinders
train_features['cylinders']=train_features['cylinders'].str.replace('cylinders', '')
train_features['cylinders']=train_features['cylinders'].astype('object')

#put cylinders into 3 categories 
mapping1 = {'4 ':'average','6 ':'average','8 ':'average','5 ':'average','10 ':'more_power','12 ':'more_power','3 ':'lower_power'}
train_features['cylinders'] = train_features['cylinders'].replace(mapping1)
train_features['cylinders'].value_counts()

average        287510
more_power       1066
other             611
lower_power       346
Name: cylinders, dtype: int64

### The Baseline Model: Linear Regression

In [452]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

In [453]:
#Isoliate the columns that I will be using
train_features_baseline = train_features.drop(['region'], axis=1)

# seperating the numerical and categorical values for the baseline model
train_features_num_base = train_features_baseline.select_dtypes(include=[np.number])
train_features_cat_base = train_features_baseline.select_dtypes(exclude=[np.number])
print('The numerical variables:', train_features_num_base.shape)
print('The categorical variables:', train_features_cat_base.shape) 

The numerical variables: (289533, 4)
The categorical variables: (289533, 11)


In [454]:
train_prepared = f.pipeline(train_features_num_base,train_features_cat_base, train_features_baseline)

In [455]:
#Train the model and cross validate the model
lr = LinearRegression()
lr.fit(train_prepared, train_price)
pred = lr.predict(train_prepared)
train_rmse = mean_squared_error(train_price, pred, squared=False)
print(f"mse train baseline: {train_rmse:.1f}")

base_scores = cross_val_score(lr, train_prepared, train_price,
                             scoring="neg_mean_squared_error", cv=5)
base_rmse = np.sqrt(-base_scores)
print('RMSE Scores for the Baseline Model:')
f.display_scores(base_rmse)

mse train baseline: 1707.8
RMSE Scores for the Baseline Model:
Scores: [12465.  8803.  9709.  8726.  8753.]
Mean:  9691.0
Stdev: 1435.0


### Feature Engineering 

In [456]:
#bin the years in the year column into three bins 
train_features['year_bins']= f.bin_years(train_features['year'])

#fill the missing category of the years that we binned
train_features['year_bins'] = f.fill_cat(train_features['year_bins'])

In [457]:
# bin the paint color
color_mapping={'white':'netural','black':'dark','silver':'netural','red':'light','blue':'dark','grey':'light','green':'light',
              'brown':'dark','orange':'light','yellow':'light','purple':'dark'}

train_features['paint_color']= train_features['paint_color'].replace(color_mapping)
train_features['paint_color'].value_counts()

netural    168687
dark        65988
light       49006
custom       5852
Name: paint_color, dtype: int64

In [458]:
train_features.drop(['odometer','posting_date','lat','long','region'], axis=1, inplace=True) 

In [459]:
# seperating the numerical and categorical values for the baseline model
train_features_num_m = train_features.select_dtypes(include=[np.number])
train_features_cat_m = train_features.select_dtypes(exclude=[np.number])
print('The numerical variables:', train_features_num_m.shape)
print('The categorical variables:', train_features_cat_m.shape) 

The numerical variables: (289533, 1)
The categorical variables: (289533, 11)


In [460]:
train_prepared1 = f.pipeline(train_features_num_m,train_features_cat_m, train_features)

In [461]:
#Train and cross validate the model
lr1 = LinearRegression()
lr1.fit(train_prepared1, train_price)
pred1 = lr1.predict(train_prepared1)
train_rmse1 = mean_squared_error(train_price, pred1, squared=False)
print(f"mse train after feature engineering: {train_rmse1:.1f}")

scores = cross_val_score(lr1, train_prepared1, train_price,
                             scoring="neg_mean_squared_error", cv=5)
rmse = np.sqrt(-scores)
print('RMSE Scores for the Baseline Model:')
f.display_scores(rmse)

mse train after feature engineering: 7606.7
RMSE Scores for the Baseline Model:
Scores: [7598. 7634. 7637. 7587. 7604.]
Mean:  7612.0
Stdev: 20.0


## Feature Selection for Linear Regression

In [462]:
# extracting the features coefficients 
coef = list(lr1.coef_)

# get the encoded feature names from one hot encoding 
encoder = OneHotEncoder()
encode = encoder.fit_transform(train_features_cat_m)
columns = encoder.get_feature_names()

In [463]:
#combine the numeric and categorical feature names 
attributes = list(train_features_num_m) + list(columns)

#combine the feature names with their coefficients 
variables = pd.DataFrame((zip(coef)), columns= ['coefficient'], index= attributes).sort_values(by='coefficient',ascending=False)

In [464]:
## I will inspect first the variables that affect the price positivly by 600 and less and the ones that affect the price negatively by -600 and above
features_to_drop = variables.loc[(variables['coefficient']<600) & (variables['coefficient']>-600)].T

features_to_drop

Unnamed: 0,x9_sd,x9_in,x0_lincoln,x9_mo,x1_odyssey,x9_id,x9_other,x4_electric,x9_nv,x0_ram,...,x9_pa,x8_netural,x1_sienna,x9_tx,x9_va,x5_manual,x0_gmc,x9_ga,x9_ms,x9_ny
coefficient,557.896491,549.014902,546.425267,541.148534,475.447784,474.840548,419.633611,395.975221,392.074944,386.390704,...,-340.922448,-357.916365,-400.814339,-406.320981,-421.556553,-483.727044,-489.449747,-509.547531,-518.327331,-545.531424


In [465]:
X_train = pd.DataFrame(train_prepared1.todense(), columns=attributes)

In [466]:
X_train.drop(features_to_drop, axis=1, inplace=True)

In [467]:
lr3 = LinearRegression()
lr3.fit(X_train, train_price)
pred2 = lr3.predict(X_train)
train_rmse2 = mean_squared_error(train_price, pred2, squared=False)
print(f"mse train after feature selection: {train_rmse2:.1f}")

scores2 = cross_val_score(lr3, X_train, train_price,
                             scoring="neg_mean_squared_error", cv=5)
rmse2 = np.sqrt(-scores2)
print('RMSE Scores for the Baseline Model:')
f.display_scores(rmse2)

mse train after feature selection: 7617.7
RMSE Scores for the Baseline Model:
Scores: [7608. 7642. 7645. 7597. 7615.]
Mean:  7621.0
Stdev: 19.0


## Model Optimization and Selection

### SVM

In [468]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV

svr= LinearSVR()
param_grid_svr = [
    {'C': [0.01, 1, 5], 'max_iter': [100, 400, 1000]}]

svr_search = GridSearchCV(svr, param_grid_svr, cv=5,
                           scoring='neg_mean_squared_error')
svr_search.fit(X_train, train_price)

GridSearchCV(cv=5, estimator=LinearSVR(),
             param_grid=[{'C': [0.01, 1, 5], 'max_iter': [100, 400, 1000]}],
             scoring='neg_mean_squared_error')

In [469]:
svr_best_score = svr_search.best_score_
print(('SVR Mean RMSE >>', np.round(np.sqrt(-svr_best_score))))

svr_std = svr_search.cv_results_['std_test_score'][svr_search.best_index_]
print(('SVR Std RMSE >>', np.round(np.sqrt(svr_std))))

('SVR Mean RMSE >>', 8176.0)
('SVR Std RMSE >>', 932.0)


### Decision Tree

In [470]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
param_grid_dtr = [
    {'splitter': ['random','best'], 'max_depth': [100, 400, 1000]}]

dtr_search = GridSearchCV(dtr, param_grid_dtr, cv=5, scoring='neg_mean_squared_error')
dtr_search.fit(X_train, train_price)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid=[{'max_depth': [100, 400, 1000],
                          'splitter': ['random', 'best']}],
             scoring='neg_mean_squared_error')

In [471]:
dtr_best_score = dtr_search.best_score_
print(('Decision Tree Mean RMSE >>', np.round(np.sqrt(-dtr_best_score))))

dtr_std = dtr_search.cv_results_['std_test_score'][dtr_search.best_index_]
print(('Decision Tree Std RMSE >>', np.round(np.sqrt(dtr_std))))

('Decision Tree Mean RMSE >>', 5702.0)
('Decision Tree Std RMSE >>', 471.0)


### Random Forest

In [474]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()
param_grid_rfr = [
    {'max_depth': [100,200], 'n_estimators': [100,200]}]

rfr_search = GridSearchCV(rfr, param_grid_rfr, cv=5, scoring='neg_mean_squared_error')
rfr_search.fit(X_train, train_price)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_depth': [100], 'n_estimators': [100]}],
             scoring='neg_mean_squared_error')

In [475]:
rfr_best_score = rfr_search.best_score_
print(('Random Forest Mean RMSE >>',np.round(np.sqrt(-rfr_best_score))))

rfr_std = rfr_search.cv_results_['std_test_score'][rfr_search.best_index_]
print(('Random Forest Std RMSE >>', np.round(np.sqrt(rfr_std))))

('Random Forest Mean RMSE >>', 5268.0)
('Random Forest Std RMSE >>', 432.0)


### GradientBoostingRegressor

In [476]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()
param_grid_gbr = [
    {'n_estimators': [100, 200]}]

    
gbr_search = GridSearchCV(gbr, param_grid_gbr, cv=5, scoring='neg_mean_squared_error')
gbr_search.fit(X_train, train_price)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(),
             param_grid=[{'n_estimators': [100, 200]}],
             scoring='neg_mean_squared_error')

In [477]:
gbr_best_score = gbr_search.best_score_
print(('Gradient Boosting Mean RMSE >>', np.round(np.sqrt(-gbr_best_score))))

gbr_std = gbr_search.cv_results_['std_test_score'][gbr_search.best_index_]
print(('Gradient Boosting Mean RMSE >>', np.round(np.sqrt(gbr_std))))

('Gradient Boosting Mean RMSE >>', 6469.0)
('Gradient Boosting Mean RMSE >>', 374.0)


## Feature Selection For Random Forest

In [479]:
# get the most important features used in the random forest classifier 
feature_importance = rfr_search.best_estimator_.feature_importances_

In [480]:
# create a dataframe of the feature names and their importance for the classifier 
variables1 = pd.DataFrame((zip(feature_importance)), columns= ['importance'], index= X_train.columns).sort_values(by='importance',ascending=False)

In [482]:
# I will inspect the variables that are less than 0.001 important for price
features_to_drop1 = variables1.loc[(variables1['importance']<0.001)].T

features_to_drop1

Unnamed: 0,x0_mazda,x1_200,x0_buick,x1_highlander,x1_fusion,x9_ri,x7_mini-van,x9_ne,x1_malibu,x1_civic,...,x1_cr-v,x10_Old,x0_mercury,x1_passat,x1_optima,x1_santa,x3_lower_power,x1_sonata,x1_liberty,x10_New
importance,0.000994,0.000982,0.000916,0.000899,0.000891,0.000857,0.000848,0.000847,0.000827,0.000824,...,0.000252,0.00025,0.000205,0.00018,0.000172,0.000163,0.000161,0.000161,9.5e-05,1.3e-05


In [484]:
X_train.drop(features_to_drop1, axis=1, inplace=True)

In [486]:
forest_model = rfr_search.best_estimator_
forest_scores = cross_val_score(forest_model, X_train, train_price,
                                scoring="neg_mean_squared_error", cv=5)

forest_rmse_scores = np.sqrt(-forest_scores)
print(('Random Forest RMSE:'))
display_scores(forest_rmse_scores)

Random Forest RMSE:
Scores: [5383. 5419. 5418. 5382. 5421.]
Mean:  5405.0
Stdev: 18.0


## Pipeline, Interpretable Model, and Final Testing 

### Optimizing the interpretable model: Decision Tree

In [489]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
param_grid_dtr = [
    {'splitter': ['random','best'], 'max_depth': [1000, 3000]}]

dtr_search = GridSearchCV(dtr, param_grid_dtr, cv=5, scoring='neg_mean_squared_error')
dtr_search.fit(X_train, train_price)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid=[{'max_depth': [1000, 3000],
                          'splitter': ['random', 'best']}],
             scoring='neg_mean_squared_error')

In [490]:
dtr_best_score = dtr_search.best_score_
print(('Decision Tree Mean RMSE >>', np.round(np.sqrt(-dtr_best_score))))

dtr_std = dtr_search.cv_results_['std_test_score'][dtr_search.best_index_]
print(('Decision Tree Std RMSE >>', np.round(np.sqrt(dtr_std))))

('Decision Tree Mean RMSE >>', 5797.0)
('Decision Tree Std RMSE >>', 475.0)


In [537]:
final_model = dtr_search.best_estimator_

### Evaluate on test

In [491]:
#import the test
df_test = pd.read_csv('cars_test.csv')

#remove zero prices for test
zero_price_test = df_test.loc[df_test['price'] == 0]
df_test.drop(zero_price_test.index, inplace = True)

In [492]:
# get the threshold for the outliers for every numerical variable
lower_price1, upper_price1 = f.detect_outliers(df_test['price'])
lower_id1, upper_id1 = f.detect_outliers(df_test['id'])
lower_year1, upper_year1 = f.detect_outliers(df_test['year'])
lower_odometer1, upper_odometer1 = f.detect_outliers(df_test['odometer'])
lower_lat1, upper_lat1 = f.detect_outliers(df_test['lat'])
lower_long1, upper_long1 = f.detect_outliers(df_test['long'])

In [493]:
#gathering all the records that contain outliers to drop them
outliers1 = df_test.loc[(df_test['price'] < lower_price1) | (df_test['price'] > upper_price1) |
                        (df_test['id'] < lower_id1) | (df_test['id'] > upper_id1) |
                        (df_test['year'] < lower_year1) | (df_test['year'] > upper_year1) |
                        (df_test['odometer'] < lower_odometer1) | (df_test['odometer'] > upper_odometer1) |
                        (df_test['lat'] < lower_lat1) | (df_test['lat'] > upper_lat1) |
                        (df_test['long'] < lower_long1) | (df_test['long'] > upper_long1)]

df_test.drop(outliers1.index, inplace=True)

In [495]:
#seprate the test data and drop all the columns that I decided to drop during training 
test_price = df_test['price']
test_features = df_test.drop(['price','id','odometer','posting_date','lat','long','region'], axis=1)

In [496]:
# fill the empty rows for the categorical data for the testing set with the most frequent value
test_features['type'] = f.fill_cat(test_features['type'])
test_features['drive'] = f.fill_cat(test_features['drive'])
test_features['transmission'] = f.fill_cat(test_features['transmission'])
test_features['fuel'] = f.fill_cat(test_features['fuel'])
test_features['cylinders'] = f.fill_cat(test_features['cylinders'])
test_features['condition'] = f.fill_cat(test_features['condition'])
test_features['model'] = f.fill_cat(test_features['model'])
test_features['manufacturer'] = f.fill_cat(test_features['manufacturer'])
test_features['paint_color'] = f.fill_cat(test_features['paint_color'])

In [497]:
#create a list of all the categories of the column that their count is less than 1000 and group them into one group 'other'
test_list1 = f.features_list(test_features['manufacturer'],1000)
test_features['manufacturer'] = test_features['manufacturer'].replace(test_list1,'other')

In [498]:
#put condition into 3 categories
mapping_test1 = {'like new':'new', 'excellent':'good','salvage':'fair'}
test_features['condition']= test_features['condition'].replace(mapping_test1)
test_features['condition'].value_counts()

good    114487
new       6903
fair      2552
Name: condition, dtype: int64

In [499]:
#clean the cylinders column from the word cylinders
test_features['cylinders']=test_features['cylinders'].str.replace('cylinders', '')
test_features['cylinders']=test_features['cylinders'].astype('object')

#put cylinders into 3 categories 
mapping_test = {'4 ':'average','6 ':'average','8 ':'average','5 ':'average','10 ':'more_power','12 ':'more_power','3 ':'lower_power'}
test_features['cylinders'] = test_features['cylinders'].replace(mapping_test)
test_features['cylinders'].value_counts()

average        123033
more_power        454
other             297
lower_power       158
Name: cylinders, dtype: int64

In [503]:
#replace all the values that their count is less than 1000 with other
test_list = f.features_list(test_features['model'], 1000)
test_features['model']=test_features['model'].replace(test_list,'other')

#create a dict that contains every term and its replacment in the model column to replace the terms
model_dic_test=f.replace(test_features['model'])
test_features['model'] = test_features['model'].replace(model_dic_test)

#group the categories with the least effect as others
mapping_test = {'odyssey':'other','expedition':'other','colorado':'other','grand caravan':'other','acadia':'other','murano':'other',
           '300':'other','sienna':'other'}
test_features['model'] = test_features['model'].replace(mapping_test)

In [504]:
# bin the paint color
color_mapping_test={'white':'netural','black':'dark','silver':'netural','red':'light','blue':'dark','grey':'light','green':'light',
              'brown':'dark','orange':'light','yellow':'light','purple':'dark'}

test_features['paint_color']= test_features['paint_color'].replace(color_mapping_test)
test_features['paint_color'].value_counts()

netural    72378
dark       28379
light      20634
custom      2551
Name: paint_color, dtype: int64

In [527]:
test_features['year_bins'] = f.bin_years(test_features['year'])

test_features['year_bins'] = f.fill_cat(test_features['year_bins'])

In [528]:
# seperating the numerical and categorical values for the test 
test_features_num = test_features.select_dtypes(include=[np.number])
test_features_cat = test_features.select_dtypes(exclude=[np.number])
print('The numerical variables:', test_features_num.shape)
print('The categorical variables:', test_features_cat.shape)

The numerical variables: (123942, 1)
The categorical variables: (123942, 11)


In [552]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif 

select = SelectKBest(mutual_info_classif, k=89)

numeric_pipeline = Pipeline([('imputer', imputer),('scaler', scaler)])
    
num_attribs = list(train_features_num_m)
cat_attribs = list(train_features_cat_m)
    
pipeline = ColumnTransformer([
        ("num", numeric_pipeline, num_attribs),
        ("cat", cat_encoder, cat_attribs)])

train_prepared= pipeline.fit_transform(train_features)
test_prepared = pipeline.transform(test_features) 

In [554]:

full_pipeline = Pipeline([('preperation', pipeline), ('select', select), ('final_model', final_model)])

train_pred= full_pipeline.fit(train_features, train_price)
test_pred = full_pipeline.predict(test_features)  

In [556]:
final_mse = mean_squared_error(test_price, test_pred)
final_rmse = np.sqrt(final_mse)
final_rmse

8040.17359177304