In [50]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np

In [51]:
# Load the training set, leaving out the 'id' and 'clean_title' columns.
df = pd.read_csv('train.csv').drop(columns=['id', 'clean_title'])

In [52]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,63500
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,7850


In [53]:
# (rows, columns) of the training frame after 'id' and 'clean_title' were dropped.
df.shape

(54273, 11)

##  Checking for missing values

In [54]:
# Count NaN entries per column.
# NOTE: placeholder strings such as '–' (present in the 'engine' column) are
# ordinary strings, so they are not counted as missing here.
df.isna().sum()

brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
price           0
dtype: int64

## Data analysis

In [55]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,model_year,milage,price
count,54273.0,54273.0,54273.0
mean,2015.091979,72746.175667,39218.44
std,5.588909,50469.490448,72826.34
min,1974.0,100.0,2000.0
25%,2012.0,32268.0,15500.0
50%,2016.0,66107.0,28000.0
75%,2019.0,102000.0,45000.0
max,2024.0,405000.0,2954083.0


In [56]:
# List the column labels of the training frame.
print(df.columns)

Index(['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'price'],
      dtype='object')


In [57]:
# Partition the column labels by dtype: int/float columns vs. object (string) columns.
numeric_columns = df.select_dtypes(include=[int, float]).columns
categorical_columns = df.select_dtypes(include=[object]).columns

for column_group in (numeric_columns, categorical_columns):
    print(column_group)

Index(['model_year', 'milage', 'price'], dtype='object')
Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
       'int_col', 'accident'],
      dtype='object')


In [58]:
# fig, axs = plt.subplots(3,3, figsize=(18,18))

# for ax,col in zip(axs.flat, categorical_columns):
#     count = df[col].value_counts()
#     ax.pie(count, labels = count.index, autopct='%1.1f%%' )
#     ax.set_title(col)
    

# plt.tight_layout()
# plt.show()


# preprocess data

In [59]:
# Separate the regression target ('price') from the predictor columns.
y = df['price']
X = df.drop(columns='price')

X.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported


In [60]:
# Report how many distinct values each non-numeric (object-dtype) column holds.
for col, unique_count in X.select_dtypes(include=[object]).nunique().items():
    print(f'{col} has {unique_count} unique values')

brand has 53 unique values
model has 1827 unique values
fuel_type has 7 unique values
engine has 1061 unique values
transmission has 46 unique values
ext_col has 260 unique values
int_col has 124 unique values
accident has 2 unique values


# Most frequent Preprocessor

## Columns affected:

1. int_col: Interior color of the car.
2. ext_col: Exterior color of the car.
3. fuel_type: Fuel type of the car.

In [61]:
from sklearn.base import BaseEstimator, TransformerMixin


class most_frequent_value_preprocessor(BaseEstimator, TransformerMixin):
    '''
    Transformer that collapses a high-cardinality categorical column.

    Values listed in `selected_values` are kept as they are, placeholder
    values listed in `na_values` (for example '–') are replaced by the first
    remaining selected value, and every other value is encoded as 'Other'.
    '''

    def __init__(self, column, selected_values, na_values=()):
        '''
        column: column name to be preprocessed
        selected_values: list of values to keep; its first entry that is not
            a placeholder is used to fill the placeholder values
        na_values: values to be treated as missing Ex: ['–']
        '''
        # Parameters are stored untouched (and never modified later) so that
        # get_params()/clone() keep working and the caller's lists stay intact.
        self.column = column
        self.selected_values = selected_values
        self.na_values = na_values

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns self.'''
        return self

    def transform(self, X):
        '''
        Encode `X[self.column]`.

        Returns a one-column DataFrame named 'processed' that shares X's index.
        '''
        na_values = list(self.na_values)

        # Filter into a new list instead of calling .remove() on
        # self.selected_values: the hyperparameter must not change during
        # transform, and a placeholder must never be used as the fill value.
        kept_values = [v for v in self.selected_values if v not in na_values]

        # object dtype so that the 'Other' label can always be written.
        values = X[self.column].astype(object)

        # Keep the selected values, everything else becomes 'Other'.
        processed = values.where(values.isin(kept_values), 'Other')

        # Placeholders take the first kept value; when nothing is kept they
        # simply stay 'Other' instead of raising an IndexError.
        if na_values and kept_values:
            processed = processed.mask(values.isin(na_values), kept_values[0])

        return processed.to_frame(name='processed')



In [62]:
def most_frequent_value(column, threshold):
    '''
    Return the distinct values of `column` that occur strictly more than
    `threshold` times, in the order produced by `value_counts()`.
    '''
    counts = column.value_counts()
    frequent = counts.loc[counts.gt(threshold)]
    return frequent.index.tolist()

In [63]:
import re
class engine_features(BaseEstimator, TransformerMixin):
    '''
    Transformer that separates the free-text `engine` column into 3 columns
    1. Horsepower: the number written in front of "HP"
    2. Engine_displacement (L): total volume swept by all the pistons during one cycle
    3. Cylinder_configuration: the arrangement of the cylinders in the engine

    Missing Horsepower / Engine_displacement values are filled with the
    medians learned in `fit`. The `engine` column itself is kept.
    '''

    # Each numeric pattern captures only the number, so no unit suffix has to
    # be stripped afterwards (which also makes a lowercase 'hp' / 'l' safe).
    _HP_PATTERN = re.compile(r'(\d+\.\d+)HP', re.IGNORECASE)
    _VOLUME_PATTERN = re.compile(r'(\d+\.\d+)L', re.IGNORECASE)
    # A single capture group (the inner alternation is non-capturing), so
    # str.extract(..., expand=False) returns a Series rather than a DataFrame.
    _CYLINDER_PATTERN = re.compile(
        r'((?:v\d+|flat\s\d+|straight\s\d+|\d+)\s*cylinder|v\d+|H\d+|I\d+)',
        re.IGNORECASE,
    )

    @staticmethod
    def _extract_float(engine, pattern):
        '''Return the first number matched by `pattern` as floats (NaN when absent).'''
        return engine.str.extract(pattern, expand=False).astype(float)

    def fit(self, X, y=None):
        '''Learn the fill values (medians) for horsepower and displacement from X['engine'].'''
        engine = X['engine']
        self.horsepower_median_ = self._extract_float(engine, self._HP_PATTERN).median()
        self.displacement_median_ = self._extract_float(engine, self._VOLUME_PATTERN).median()
        return self

    def transform(self, X):
        '''
        Return a copy of X with the columns 'Horsepower',
        'Engine_displacement' and 'Cylinder_configuration' added (or replaced).
        '''
        df = X.copy()
        engine = df['engine']

        horsepower = self._extract_float(engine, self._HP_PATTERN)
        displacement = self._extract_float(engine, self._VOLUME_PATTERN)

        # Use the medians learned in fit(); when transform() is called on an
        # unfitted instance fall back to the medians of the data at hand.
        hp_fill = getattr(self, 'horsepower_median_', None)
        if hp_fill is None:
            hp_fill = horsepower.median()
        displacement_fill = getattr(self, 'displacement_median_', None)
        if displacement_fill is None:
            displacement_fill = displacement.median()

        # Assign the filled Series back instead of calling
        # df[col].fillna(..., inplace=True), which acts on a temporary object
        # and does nothing when pandas copy-on-write is enabled.
        df['Horsepower'] = horsepower.fillna(hp_fill)
        df['Engine_displacement'] = displacement.fillna(displacement_fill)

        # Normalise e.g. 'V6 Cylinder' -> 'v6', 'Straight 6 Cylinder' -> 'straight6'.
        cylinders = (
            engine.str.extract(self._CYLINDER_PATTERN, expand=False)
            .str.lower()
            .str.replace(' ', '', regex=False)
            .str.replace('cylinder', '', regex=False)
        )

        # Every engine description mentioning 'electric' is labelled
        # 'electric', even when a cylinder layout was parsed from it.
        # na=False keeps rows with a missing engine description out of the mask.
        is_electric = engine.str.contains('Electric', case=False, na=False)
        df['Cylinder_configuration'] = cylinders.mask(is_electric, 'electric')

        return df
        

In [70]:
# Add the engine-derived columns to the feature frame.
engine_feature_transformer = engine_features()
X = engine_feature_transformer.fit_transform(X)

# Engine descriptions for which no cylinder configuration could be parsed
# (the '–' placeholder is included in this listing).
no_cylinder_info = X['Cylinder_configuration'].isna()
X.loc[no_cylinder_info, 'engine'].value_counts()

–                                           335
6.0L W12 48V PDI DOHC Twin Turbo             32
3.0 Liter Turbo                              13
212.0HP 1.3L Rotary engine Gasoline Fuel     13
2.0 Liter Turbo                              12
232.0HP 1.3L Rotary engine Gasoline Fuel     11
2.0 Liter TFSI                               10
3.0 Liter                                     8
3.5 Liter                                     5
2.0 Liter                                     5
Standard Range Battery                        3
Intercooled Turbo Diesel V-8 6.7 L/406        3
5.6 Liter                                     3
2.5L                                          3
4.4 Liter DOHC Twin Turbo                     2
4.6 Liter                                     2
4.4 Liter Twin Turbo                          2
3.0 Liter Twin Turbo                          1
4.4 Liter GDI DOHC Twin Turbo                 1
3.0L                                          1
8.0L W16 64V GDI DOHC Twin Turbo        

In [29]:
# temp = []
# for col in X['engine'][:25]:
#     curr_str = ''
#     hp, cylinder, volume = '', '', ''

#     for char in col:
#         if char == ' ': continue
#         if 'HP' in curr_str.upper():
#             hp = curr_str
#             curr_str = ''
#         elif char == 'L':
#             volume = curr_str
#             curr_str = ''
#             continue
#         elif 'cylinder' in curr_str.lower():
#             cylinder = curr_str
#             curr_str = ''
            
#         curr_str += char
#     print(f'hp: {hp}, cylinder: {cylinder}, volume: {volume}')
#     temp.append((hp, cylinder, volume))





In [30]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported


In [31]:
# get the selected values
# Values frequent enough to keep their own category; each threshold is the
# minimum number of occurrences (exclusive) for the given column.
interior_vals = most_frequent_value(column=X['int_col'], threshold=1000)
exterior_vals = most_frequent_value(column=X['ext_col'], threshold=500)
fuel_vals = most_frequent_value(column=X['fuel_type'], threshold=0)

transmission_vals = most_frequent_value(column=X['transmission'], threshold=1000)
brand_vals = most_frequent_value(column=X['brand'], threshold=500)
model_vals = most_frequent_value(column=X['model'], threshold=200)

# One encoder per column; '–' (and 'not supported' for the fuel type) are
# treated as missing values.
int_color_encoder = most_frequent_value_preprocessor(
    column='int_col', selected_values=interior_vals, na_values=['–'])
ext_color_encoder = most_frequent_value_preprocessor(
    column='ext_col', selected_values=exterior_vals, na_values=['–'])
fuel_encoder = most_frequent_value_preprocessor(
    column='fuel_type', selected_values=fuel_vals, na_values=['–', 'not supported'])
transmission_encoder = most_frequent_value_preprocessor(
    column='transmission', selected_values=transmission_vals, na_values=['–'])
brand_encoder = most_frequent_value_preprocessor(
    column='brand', selected_values=brand_vals, na_values=['–'])
model_encoder = most_frequent_value_preprocessor(
    column='model', selected_values=model_vals, na_values=['–'])

# Numerical columns of X as it is when this cell runs: any int/float column
# already present in X at this point is included.
num_cols = X.select_dtypes(include=[int, float]).columns



In [32]:
# Build a fresh engine_features transformer, fit it on X and add the
# Horsepower / Engine_displacement / Cylinder_configuration columns.
# X is overwritten, so re-running this cell feeds the already transformed
# frame back into the transformer.
# NOTE(review): the saved output below shows no 'engine' column although
# engine_features does not drop it - the output looks stale; re-run to confirm.
engine_feature_transformer = engine_features()
X = engine_feature_transformer.fit_transform(X)

X.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,Horsepower,Engine_displacement,Cylinder_configuration
0,Ford,F-150 Lariat,2018,74349,Gasoline,10-Speed A/T,Blue,Gray,None reported,375.0,3.5,v6
1,BMW,335 i,2007,80000,Gasoline,6-Speed M/T,Black,Black,None reported,300.0,3.0,straight6
2,Jaguar,XF Luxury,2009,91491,Gasoline,6-Speed A/T,Purple,Beige,None reported,300.0,4.2,8
3,BMW,X7 xDrive40i,2022,2437,Hybrid,Transmission w/Dual Shift Mode,Gray,Brown,None reported,335.0,3.0,electric
4,Pontiac,Firebird Base,2001,111000,Gasoline,A/T,White,Black,None reported,200.0,3.8,v6


In [33]:
# Frequency of each parsed cylinder configuration (value_counts leaves out NaN by default).
X['Cylinder_configuration'].value_counts()

v6           17136
8            16184
4             9900
straight6     4390
electric      2160
flat6         1437
i4            1022
v8             934
12             334
5              220
10             199
h6              81
i3              73
v12             64
3               49
i6              48
h4              21
v10             21
Name: Cylinder_configuration, dtype: int64

In [42]:
# import data pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import category_encoders as ce

# make_column_selector lets the raw 'engine' column be dropped whether or not
# it is still present in X.
from sklearn.compose import make_column_selector

label = LabelEncoder()
scaler = StandardScaler()
target_encoder = ce.TargetEncoder()

# Every OneHotEncoder uses handle_unknown='ignore': a category that shows up
# only at transform time (e.g. in test.csv) is encoded as an all-zero row
# instead of raising a ValueError.
interior_color_pipeline = Pipeline([
    ('int_col', int_color_encoder),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])
exterior_color_pipeline = Pipeline([
    ('ext_col', ext_color_encoder),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])
fuel_pipeline = Pipeline([
    ('fuel_type', fuel_encoder),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

transmission_pipeline = Pipeline([
    ('transmission', transmission_encoder),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# High-cardinality columns are target encoded rather than one-hot encoded
# (the interior/exterior colour pipelines above are currently not wired in).
target_encoder_cols = ['brand', 'model', 'Cylinder_configuration', 'int_col', 'ext_col']

accident_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

engine_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

column_transformer = ColumnTransformer([
    ('target_encoder', target_encoder, target_encoder_cols),
    ('scaler', scaler, num_cols),
    ('fuel_type', fuel_pipeline, ['fuel_type']),
    ('transmission_type', transmission_pipeline, ['transmission']),
    ('accident', accident_pipeline, ['accident']),
    # The raw engine text must not reach the models through
    # remainder='passthrough'; its information lives in the derived columns.
    ('raw_engine', 'drop', make_column_selector(pattern='^engine$')),
], remainder='passthrough')

# NOTE: the transformer (target encoder and scaler included) is fitted on all
# rows, i.e. before any train/validation split is made.
X = column_transformer.fit_transform(X, y)

# Plain positional column labels (0..n-1); the original names are not kept.
X = pd.DataFrame(X)

In [43]:
# Preview the transformed feature matrix; the columns are numbered because
# pd.DataFrame(X) was built without column names.
X.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,38154.063227,43169.955645,32657.080649,26723.43746,37614.687018,60508.125743,0.520325,0.031759,0.0,0.0,1.0,0.0,0.0,0.0,1.0,375.0,3.5
1,40276.029448,15520.375566,35269.609112,41290.661314,38413.911328,33931.317579,-1.447877,0.143728,0.0,0.0,1.0,0.0,0.0,0.0,1.0,300.0,3.0
2,34840.403933,26187.727179,47139.973554,27862.756473,26120.155301,24939.541386,-1.090022,0.371412,0.0,0.0,1.0,0.0,0.0,0.0,1.0,300.0,4.2
3,40276.029448,68657.674465,51656.560185,49465.203662,45155.86509,45993.777037,1.236035,-1.393115,0.0,0.0,0.0,1.0,0.0,0.0,1.0,335.0,3.0
4,17526.114591,27810.962157,32657.080649,41290.661314,37774.187602,29971.027033,-2.521442,0.757966,0.0,0.0,1.0,0.0,0.0,0.0,1.0,200.0,3.8


#### Make (brand) and model

In [44]:
# average_price = df.groupby('brand')['price'].mean().reset_index()
# plt.figure(figsize=(30, 15))

# plt.bar(average_price['brand'], average_price['price'])
# plt.xlabel('Car Make')
# plt.ylabel('Average Price')
# plt.title('Average Price by Car Make')
# _ = plt.xticks(rotation=45)

In [45]:
# # expensive cars
# exotic_brands = list(average_price.sort_values(by='price', ascending=False).head(10)['brand'])
# brand_values = X['brand'].value_counts()
# selected_values = brand_values[brand_values > 500].index.tolist()

# def brand_encoder(row):

#     if row['brand'] in exotic_brands:
#         return 'Exotic'
    
#     if row['brand'] in selected_values:
#         return row['brand']
    
#     return 'Other'

# df['brand'] = df.apply(brand_encoder, axis=1)
# print(df['brand'].value_counts())



In [46]:
# average_price = df.groupby('model')['price'].mean().reset_index()
# plt.figure(figsize=(30, 15))

# plt.bar(average_price['model'], average_price['price'])
# plt.xlabel('Car Make')
# plt.ylabel('Average Price')
# plt.title('Average Price by Car Make')
# _ = plt.xticks(rotation=45)

In [47]:
# from sklearn.preprocessing import LabelEncoder

# q75 = average_price['price'].quantile(0.75)
# q50 = average_price['price'].quantile(0.50)


# expensive_price = average_price[average_price['price'] > q75]
# high_price = average_price[average_price['price'] > q50]
# low_price = average_price[average_price['price'] <= q50]

# def model_price_encoder(row):
    
#     if row['model'] in expensive_price['model'].values:
#         return 'Expensive'
    
#     if row['model'] in high_price['model'].values:
#         return 'High'
    
#     if row['model'] in low_price['model'].values:
#         return 'Low'
    
#     return 'Other'

# df['model_tier'] = df.apply(model_price_encoder, axis=1)
# print(df['model_tier'].value_counts())

# # target encode the model column
# lbEncoder = LabelEncoder()
# df['model'] = lbEncoder.fit_transform(df['model'])

# df['model'].value_counts()


#### Transmission

In [48]:
# print(df['transmission'].value_counts())
# df['transmission'] = lbEncoder.fit_transform(df['transmission'])

# df['engine'] = lbEncoder.fit_transform(df['engine'])

#### engine

In [49]:
# Re-display the training frame; it still contains 'price', the preprocessing
# steps were applied to X and not to df.
df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,63500
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,7850


In [50]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
# NOTE: X was produced by column_transformer.fit_transform(X, y) on all rows,
# so the held-out rows (and their prices, via the target encoder) have already
# influenced the encoded features; scores on X_test are optimistic.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2)


In [51]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from xgboost import XGBRegressor


# Candidate regressors, all with default hyperparameters. The estimators that
# draw random numbers get a fixed seed so the comparison is repeatable.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}

In [52]:
from sklearn.metrics import mean_squared_error, r2_score

# Fit every candidate on the training split and score it on the held-out split.
model_list, r2_list = [], []

for model_name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    report_lines = (
        f'Model: {model_name}',
        "- Mean Squared Error: {:.4f}".format(mse),
        "- R2 score: {:.4f}".format(r2),
        '===============================================================\n\n',
    )
    for line in report_lines:
        print(line)

    model_list.append(model_name)
    r2_list.append(r2)

Model: Linear Regression
- Mean Squared Error: 2247614929.3427
- R2 score: 0.2351


Model: Lasso
- Mean Squared Error: 2247619220.6104
- R2 score: 0.2351


Model: Ridge
- Mean Squared Error: 2247616580.3852
- R2 score: 0.2351


Model: K-Neighbors Regressor
- Mean Squared Error: 3345442344.0051
- R2 score: -0.1385


Model: Decision Tree
- Mean Squared Error: 7554711965.8018
- R2 score: -1.5709


Model: Random Forest Regressor
- Mean Squared Error: 2717042786.6516
- R2 score: 0.0754


Model: XGBRegressor
- Mean Squared Error: 2809276316.9281
- R2 score: 0.0440


Model: AdaBoost Regressor
- Mean Squared Error: 14652539790.5980
- R2 score: -3.9864




In [53]:
# Rank the candidates by their R2 score on the held-out split (best first).
pd.DataFrame({'Model Name': model_list, 'R2_Score': r2_list}).sort_values(by='R2_Score', ascending=False)

Unnamed: 0,Model Name,R2_Score
0,Linear Regression,0.235115
2,Ridge,0.235114
1,Lasso,0.235114
5,Random Forest Regressor,0.075364
6,XGBRegressor,0.043976
3,K-Neighbors Regressor,-0.138486
4,Decision Tree,-1.570941
7,AdaBoost Regressor,-3.9864


In [54]:
test = pd.read_csv('test.csv')

# Row identifiers, kept for the submission files.
# NOTE: this name shadows the builtin id(); it is left unchanged because the
# submission cells refer to it.
id = test['id']
test = test.drop(columns=['id', 'clean_title'])

# Apply the preprocessing that was fitted on the training data; nothing is
# re-fitted on the test set.
test = engine_feature_transformer.transform(test)

test = column_transformer.transform(test)



In [55]:
# Train a Lasso model on all training rows and write its test predictions.
final_model = Lasso()
final_model.fit(X, y)

price = final_model.predict(test)

output = pd.DataFrame({'id': id, 'price': price})
output.to_csv('pred.csv', index=False)


In [56]:
# do grid search with XGBoost

from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search: 2 * 3 * 3 * 3 * 3 * 3 = 486 parameter combinations.
# NOTE(review): the tree-specific settings (max_depth, subsample,
# colsample_bytree) are presumably ignored by the 'gblinear' booster, so many
# of its grid points likely duplicate each other - confirm in the XGBoost docs.
param_grid = dict(
    booster=['gbtree', 'gblinear'],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9],
)

xgb = XGBRegressor()

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X, y)

print(grid_search.best_params_)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
{'booster': 'gbtree', 'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}


In [57]:
# GridSearchCV (refit=True by default) has already re-trained the best
# parameter combination on all of X, y, so best_estimator_ is used directly
# instead of being fitted a second time.
final_model = grid_search.best_estimator_

price = final_model.predict(test)

output = pd.DataFrame({'id': id, 'price': price})
output.to_csv('xg.csv', index=False)

In [58]:
# fine tune the model with random forest

# Exhaustive 5-fold search over 3 * 3 * 3 * 3 = 81 random-forest settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Fixed seed: without it every candidate is trained on different bootstrap
# samples from run to run, so the selected parameters are not reproducible.
rf = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X,y)

print(grid_search.best_params_)

# refit=True (the default) already re-trained the best candidate on all of
# X, y, so no additional fit is needed before predicting.
final_model = grid_search.best_estimator_

price = final_model.predict(test)

output = pd.DataFrame({'id': id, 'price': price})
output.to_csv('rf.csv', index=False)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}
