# Training Car Prediction Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
# Load the dataset
df = pd.read_csv('encoded_data.csv')

In [3]:
# Checking the data head
df.head()

Unnamed: 0.1,Unnamed: 0,car_price,manufacturing_year,mileage_km,engine_size,car_age,region_akwa ibom,region_anambra,region_delta,region_edo,...,car_city_ojodu,car_city_onitsha,car_city_oshimili south,car_city_owerri,car_city_port-harcourt,car_city_surulere,car_city_uyo,car_city_victoria island,car_city_wuse,car_city_yaba
0,0,12937500,2013,272474,3500,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6750000,2012,102281,5000,13,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,50625000,2018,127390,5700,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3600000,2007,139680,1800,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,3262500,2005,220615,3500,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Checking more information on the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2892 entries, 0 to 2891
Columns: 472 entries, Unnamed: 0 to car_city_yaba
dtypes: int64(472)
memory usage: 10.4 MB


In [5]:
# Checking for null values
df.isnull().sum()

Unnamed: 0                  0
car_price                   0
manufacturing_year          0
mileage_km                  0
engine_size                 0
                           ..
car_city_surulere           0
car_city_uyo                0
car_city_victoria island    0
car_city_wuse               0
car_city_yaba               0
Length: 472, dtype: int64

In [6]:
df.columns

Index(['Unnamed: 0', 'car_price', 'manufacturing_year', 'mileage_km',
       'engine_size', 'car_age', 'region_akwa ibom', 'region_anambra',
       'region_delta', 'region_edo',
       ...
       'car_city_ojodu', 'car_city_onitsha', 'car_city_oshimili south',
       'car_city_owerri', 'car_city_port-harcourt', 'car_city_surulere',
       'car_city_uyo', 'car_city_victoria island', 'car_city_wuse',
       'car_city_yaba'],
      dtype='object', length=472)

In [7]:
# Checking for duplicate values
df.duplicated().sum()

0

In [8]:
# Checking statitical summary of the data
df.describe()

Unnamed: 0.1,Unnamed: 0,car_price,manufacturing_year,mileage_km,engine_size,car_age,region_akwa ibom,region_anambra,region_delta,region_edo,...,car_city_ojodu,car_city_onitsha,car_city_oshimili south,car_city_owerri,car_city_port-harcourt,car_city_surulere,car_city_uyo,car_city_victoria island,car_city_wuse,car_city_yaba
count,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0,...,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0,2892.0
mean,1445.782158,4972894.0,2007.775588,243487.5,3078.259682,17.224412,0.017289,0.007607,0.009682,0.020401,...,0.036307,0.007607,0.009682,0.010373,0.029046,0.011757,0.017289,0.011757,0.034578,0.022822
std,835.394663,6151255.0,4.559054,1401190.0,3723.386871,4.559054,0.130369,0.086902,0.097936,0.141392,...,0.187085,0.086902,0.097936,0.101338,0.167964,0.107807,0.130369,0.107807,0.18274,0.14936
min,0.0,577500.0,1988.0,0.0,25.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,722.75,2158125.0,2005.0,131143.2,2300.0,14.75,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1445.5,3196000.0,2007.0,194633.0,3000.0,18.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2168.25,5250000.0,2010.25,263294.2,3500.0,20.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2893.0,98700000.0,2023.0,74026750.0,158713.0,37.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# all column names
df.columns

Index(['Unnamed: 0', 'car_price', 'manufacturing_year', 'mileage_km',
       'engine_size', 'car_age', 'region_akwa ibom', 'region_anambra',
       'region_delta', 'region_edo',
       ...
       'car_city_ojodu', 'car_city_onitsha', 'car_city_oshimili south',
       'car_city_owerri', 'car_city_port-harcourt', 'car_city_surulere',
       'car_city_uyo', 'car_city_victoria island', 'car_city_wuse',
       'car_city_yaba'],
      dtype='object', length=472)

In [10]:
# Exploring columns to drop
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0.1,Unnamed: 0,car_price,manufacturing_year,mileage_km,engine_size,car_age,region_akwa ibom,region_anambra,region_delta,region_edo,region_imo,region_kaduna,region_kwara,region_lagos,region_ogun,region_ondo,region_oyo,region_rivers,manufacturer_aston martin,manufacturer_audi,manufacturer_bmw,manufacturer_cadillac,manufacturer_changan,manufacturer_chevrolet,manufacturer_chrysler,manufacturer_citroen,manufacturer_daihatsu,manufacturer_dodge,manufacturer_dongfeng,manufacturer_ford,manufacturer_gac,manufacturer_genesis,manufacturer_haima,manufacturer_honda,manufacturer_hummer,manufacturer_hyundai,manufacturer_infiniti,manufacturer_isuzu,manufacturer_jaguar,manufacturer_jeep,manufacturer_kia,manufacturer_land rover,manufacturer_lexus,manufacturer_lincoln,manufacturer_maserati,manufacturer_mazda,manufacturer_mercedes-benz,manufacturer_mercury,manufacturer_mini,manufacturer_mitsubishi,manufacturer_nissan,manufacturer_opel,manufacturer_peugeot,manufacturer_pontiac,manufacturer_porsche,manufacturer_renault,manufacturer_rover,manufacturer_scion,manufacturer_subaru,manufacturer_suzuki,manufacturer_toyota,manufacturer_vauxhall,manufacturer_volkswagen,manufacturer_volvo,car_model_190e,car_model_200,car_model_206,car_model_207,car_model_230e,car_model_3,car_model_3 series,car_model_300,car_model_300c,car_model_301,car_model_307,car_model_308,car_model_318i,car_model_320i,car_model_323,car_model_323i,car_model_325i,car_model_328i,car_model_335i,car_model_4-runner,car_model_406,car_model_407,car_model_408,car_model_5,car_model_5 series,car_model_520i,car_model_525i,car_model_528i,car_model_530i,car_model_535i,car_model_550i,car_model_6,car_model_607,car_model_626,car_model_7 series,car_model_a-class,car_model_a3,car_model_a4,car_model_a6,car_model_accent,car_model_accord,car_model_accord crosstour,car_model_almera,car_model_alsvin,car_model_altima,car_model_alto,car_model_armada,car_model_astra,car_model_asx,car_model_avalon,car_model_avanza,car_model_avensis,car_model_aveo,car_model_azera,car_model_b-class,car_model_baleno,car_model_beetle,car_model_borrego,car_model_c-class,car_model_c180,car_model_c200,car_model_c220,car_model_c230,car_model_c240,car_model_c250,car_model_c270,car_model_c280,car_model_c300,car_model_c320,car_model_c350,car_model_c4,car_model_c400,car_model_caliber,car_model_camaro,car_model_camry,car_model_captiva,car_model_caravan,car_model_carisma,car_model_cayenne,car_model_cc,car_model_cerato,car_model_challenger,car_model_charger,car_model_cherokee,car_model_ciaz,car_model_city,car_model_civic,car_model_cla-class,car_model_clio,car_model_clk,car_model_cls,car_model_coaster,car_model_commander,car_model_compass,car_model_cooper,car_model_corolla,car_model_corolla verso,car_model_county,car_model_cr-v,car_model_creta,car_model_crosstour,car_model_cruze,car_model_cs35,car_model_csx,car_model_cts,car_model_cx-7,car_model_cx-9,car_model_db9,car_model_discovery,car_model_durango,car_model_dzire,car_model_e-150,car_model_e200,car_model_e220,car_model_e240,car_model_e250,car_model_e300,car_model_e320,car_model_e350,car_model_e400,car_model_eclipse cross,car_model_ecosport,car_model_edge,car_model_elantra,car_model_element,car_model_epica,car_model_equinox,car_model_ertiga,car_model_es,car_model_escalade,car_model_escape,car_model_every wagon,car_model_ex,car_model_explorer,car_model_f-150,car_model_fj cruiser,car_model_focus,car_model_forte,car_model_fortuner,car_model_freestyle,car_model_frontier,car_model_fusion,car_model_fx,car_model_fx35,car_model_g,car_model_g-class,car_model_g35,car_model_g80,car_model_ga3,car_model_ga4,car_model_galant,car_model_gl-class,car_model_gla 250,car_model_glc-class,car_model_gle-class,car_model_glk-class,car_model_gls-class,car_model_golf,car_model_golf gti,car_model_grand,car_model_grand caravan,car_model_grand cherokee,car_model_grand vitara,car_model_grandeur,car_model_gs,car_model_gs3,car_model_gs4,car_model_gs8,car_model_gx,car_model_h3,car_model_h300,car_model_harrier,car_model_hiace,car_model_highlander,car_model_hijet,car_model_hilux,car_model_impala,car_model_is,car_model_ix35,car_model_jazz,car_model_jetta,car_model_jimny,car_model_journey,car_model_kremer,car_model_l200,car_model_laguna,car_model_lancer,car_model_land,car_model_land cruiser,car_model_land cruiser prado,car_model_levante,car_model_liberty,car_model_lingzhi m3,car_model_lr2,car_model_lr3,car_model_lr4,car_model_ls,car_model_lx,car_model_m class,car_model_magnum,car_model_malibu,car_model_mariner,car_model_matrix,car_model_maxima,car_model_mdx,car_model_megane,car_model_micra,car_model_millenia,car_model_mohave,car_model_mpv,car_model_murano,car_model_mustang,car_model_navigator,car_model_np300,car_model_odyssey,car_model_optima,car_model_orlando,car_model_outback,car_model_outlander,car_model_pacifica,car_model_pajero,car_model_passat,car_model_pathfinder,car_model_pilot,car_model_polo,car_model_previa,car_model_primera,car_model_prius,car_model_q5,car_model_q50,car_model_q7,car_model_qashqai,car_model_quest,car_model_qx4,car_model_qx56,car_model_qx80,car_model_r-class,car_model_rabbit,car_model_ram,car_model_range rover,car_model_range rover evoque,car_model_range rover sport,car_model_range rover velar,car_model_range rover vogue,car_model_ranger,car_model_rav4,car_model_rdx,car_model_ridgeline,car_model_rio,car_model_rogue,car_model_rondo,car_model_routan,car_model_rx,car_model_rx 300,car_model_rx 330,car_model_rx 350,car_model_rx 400h,car_model_s-class,car_model_s-presso,car_model_s-type,car_model_s40,car_model_s5,car_model_s60,car_model_s80,car_model_santa fe,car_model_sc,car_model_sedona,car_model_sentra,car_model_sequoia,car_model_sharan,car_model_shuma,car_model_shuttle,car_model_sienna,car_model_slk-class,car_model_solara,car_model_sonata,car_model_sorento,car_model_soul,car_model_spacewagon,car_model_sportage,car_model_sprinter,car_model_stream,car_model_suburban,car_model_sunny,car_model_swift,car_model_sx4,car_model_tacoma,car_model_taurus,car_model_tc,car_model_tiburon,car_model_tiguan,car_model_tiida,car_model_titan,car_model_tl,car_model_torrent,car_model_touareg,car_model_touran,car_model_town&country,car_model_trajet,car_model_transit,car_model_transporter,car_model_traverse,car_model_trax,car_model_tribeca,car_model_tribute,car_model_trooper,car_model_tsx,car_model_tucson,car_model_tundra,car_model_uplander,car_model_v40,car_model_van,car_model_vectra,car_model_veloster,car_model_venza,car_model_veracruz,car_model_versa,car_model_verso,car_model_vibe,car_model_vitara,car_model_vivaro,car_model_wrangler,car_model_x-trail,car_model_x1,car_model_x3,car_model_x4,car_model_x5,car_model_x6,car_model_xc60,car_model_xc70,car_model_xc90,car_model_xj,car_model_xterra,car_model_yaris,car_model_z4,car_model_zafira,car_model_zdx,color_black,color_blue,color_brown,color_burgandy,color_gold,color_gray,color_green,color_ivory,color_matt black,color_off white,color_orange,color_pink,color_purple,color_red,color_silver,color_teal,color_white,color_yellow,car_condition_foreign used,car_condition_nigerian used,selling_cond_imported,selling_cond_registered,bought_cond_imported,bought_cond_registered,fuel_type_electric,fuel_type_hybrid,fuel_type_petrol,transmission_automatic,transmission_cvt,transmission_manual,car_city_ajah,car_city_akure,car_city_alimosho,car_city_amuwo-odofin,car_city_apapa,car_city_benin city,car_city_garki 2,car_city_gwarinpa,car_city_ibadan,car_city_ibeju,car_city_ifako-ijaiye,car_city_ikeja,car_city_ikorodu,car_city_ikotun/igando,car_city_ikoyi,car_city_ilorin east,car_city_ilupeju,car_city_ipaja,car_city_isolo,car_city_jabi,car_city_kaduna / kaduna state,car_city_katampe,car_city_kosofe,car_city_kubwa,car_city_lekki,car_city_lugbe district,car_city_magodo,car_city_mararaba,car_city_mushin,car_city_ogba,car_city_ogudu,car_city_ojodu,car_city_onitsha,car_city_oshimili south,car_city_owerri,car_city_port-harcourt,car_city_surulere,car_city_uyo,car_city_victoria island,car_city_wuse,car_city_yaba
0,0,12937500,2013,272474,3500,12,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,6750000,2012,102281,5000,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,50625000,2018,127390,5700,7,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,3600000,2007,139680,1800,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,3262500,2005,220615,3500,20,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Data Cleaning and Exploration

In [12]:
# Checking why num_of_seats column is an object
#df['seat'].unique()

In [13]:
# Checking how many seat columns has '5 or 7', '7 or 8'
#df['seat'].value_counts()

In [14]:
# dropping the '5 or 7', '7 or 8' values since only one row exist for each
#df = df[~df['seat'].isin(['5 or 7', '7 or 8'])]

In [15]:
# Changing the 'seat' column to numeric while skipping missing values
#df['seat'] = pd.to_numeric(df['seat'], errors='coerce')

In [20]:
# dropping unnecessary columns
#cols_to_drop = ['Unnamed: 0', 'car_id', 'trim', 'description', 'trim', 'drive_train', 'reg_city',
 #      'seat', 'num_cylinder', 'horse_power', 'body_build']

In [21]:
#df.drop(columns=cols_to_drop, inplace=True)

In [19]:
rename_cols = {
    'amount': 'car_price',
    'make': 'manufacturer',
    'model': 'car_model',
    'year_of_man': 'manufacturing_year',
    'condition': 'car_condition',
    'mileage': 'mileage_km'
    }

df.rename(columns=rename_cols, inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.describe()

In [None]:
df.head(2)

In [None]:
# Change all text columns to lowercase
text_cols = df.select_dtypes(include=['object']).columns
for col in text_cols:
    df[col] = df[col].str.lower()

#### Exploring column by column

In [None]:
# Make region copy to use for feature enginering and call it car city
df['car_city'] = df['region'].copy()
df.head()

In [None]:
# Changing the region names to their respective states
orig_region = {
    'lagos state, ikeja': 'lagos',
    'abuja (fct), garki 2': 'abuja',
    'lagos state, lekki': 'lagos',
    'abuja (fct), lugbe district': 'abuja',
    'lagos state, isolo': 'lagos',
    'abuja (fct), gwarinpa': 'abuja',
    'abuja (fct), kubwa': 'abuja',
    'lagos state, ikotun/igando': 'lagos',
    'lagos state, ikoyi': 'lagos',
    'abuja (fct), mararaba': 'abuja',
    'abuja (fct), jabi': 'abuja',
    'lagos state, ojodu': 'lagos',
    'lagos state, ajah': 'lagos',
    'oyo state, ibadan': 'oyo',
    'kwara state, ilorin east': 'kwara',
    'lagos state, amuwo-odofin': 'lagos',
    'lagos state, alimosho': 'lagos',
    'rivers state, port-harcourt': 'rivers',
    'delta state, oshimili south': 'delta',
    'lagos state, ipaja': 'lagos',
    'lagos state, ikorodu': 'lagos',
    'lagos state, mushin': 'lagos',
    'abuja (fct), katampe': 'abuja',
    'akwa ibom state, uyo': 'akwa ibom',
    'lagos state, surulere': 'lagos',
    'lagos state, ogba': 'lagos',
    'anambra state, onitsha': 'anambra',
    'lagos state, yaba': 'lagos',
    'lagos state, ibeju': 'lagos',
    'abuja (fct), wuse': 'abuja',
    'ogun state, ado-odo/ota': 'ogun',
    'lagos state, ilupeju': 'lagos',
    'lagos state, ifako-ijaiye': 'lagos',
    'lagos state, kosofe': 'lagos',
    'lagos state, magodo': 'lagos',
    'lagos state, ogudu': 'lagos',
    'imo state, owerri': 'imo',
    'kaduna state, kaduna / kaduna state': 'kaduna',
    'edo state, benin city': 'edo',
    'ondo state, akure': 'ondo',
    'lagos state, apapa': 'lagos',
    'lagos state, victoria island': 'lagos'
}

In [None]:
df['region'] = df['region'].map(orig_region)

In [None]:
df.head(2)

In [None]:
df['manufacturer'].unique()

In [None]:
df['car_model'].nunique()

In [None]:
df['car_model'].unique()

In [None]:
df['color'].unique()

In [None]:
df['car_condition'].unique()

In [None]:
df['selling_cond'].unique()

In [None]:
df['bought_cond'].unique()

In [None]:
# Checking if there is any difference in selling and bought condition columns
df[(df['selling_cond'] == 'imported') & (df['bought_cond'] == 'registered')]

In [None]:
df[(df['selling_cond'] == 'imported') & (df['bought_cond'] == 'brand new')]

In [None]:
df[(df['selling_cond'] == 'registered') & (df['bought_cond'] == 'imported')]

In [None]:
df[(df['selling_cond'] == 'registered') & (df['bought_cond'] == 'brand new')]

In [None]:
df[(df['selling_cond'] == 'brand new') & (df['bought_cond'] == 'registered')]

In [None]:
df[(df['selling_cond'] == 'brand new') & (df['bought_cond'] == 'imported')]

In [None]:
tbr = ['0','1','abc189gb']


In [None]:
df.head()

In [None]:
df.columns

In [None]:
# dropping unnecessary columns
cols_to_drop = ['Unnamed: 0', 'car_id', 'trim', 'description', 'trim', 'drive_train', 'reg_city',
       'seat', 'num_cylinder', 'horse_power', 'body_build']

In [None]:
df.head(10)

car city
Car age
average price per kilometer = avg_price_per_km
price per number of seats in car
price per number of cylinder in car
price per number of horse power

## Feature Engineering

In [None]:
# Dictionary mapping original region names to their respective cities extracted from the region description
city_region = {
    'lagos state, ikeja': 'ikeja',
    'abuja (fct), garki 2': 'garki 2',
    'lagos state, lekki': 'lekki',
    'abuja (fct), lugbe district': 'lugbe district',
    'lagos state, isolo': 'isolo',
    'abuja (fct), gwarinpa': 'gwarinpa',
    'abuja (fct), kubwa': 'kubwa',
    'lagos state, ikotun/igando': 'ikotun/igando',
    'lagos state, ikoyi': 'ikoyi',
    'abuja (fct), mararaba': 'mararaba',
    'abuja (fct), jabi': 'jabi',
    'lagos state, ojodu': 'ojodu',
    'lagos state, ajah': 'ajah',
    'oyo state, ibadan': 'ibadan',
    'kwara state, ilorin east': 'ilorin east',
    'lagos state, amuwo-odofin': 'amuwo-odofin',
    'lagos state, alimosho': 'alimosho',
    'rivers state, port-harcourt': 'port-harcourt',
    'delta state, oshimili south': 'oshimili south',
    'lagos state, ipaja': 'ipaja',
    'lagos state, ikorodu': 'ikorodu',
    'lagos state, mushin': 'mushin',
    'abuja (fct), katampe': 'katampe',
    'akwa ibom state, uyo': 'uyo',
    'lagos state, surulere': 'surulere',
    'lagos state, ogba': 'ogba',
    'anambra state, onitsha': 'onitsha',
    'lagos state, yaba': 'yaba',
    'lagos state, ibeju': 'ibeju',
    'abuja (fct), wuse': 'wuse',
    'ogun state, ado-odo/ota': 'ado-odo/ota',
    'lagos state, ilupeju': 'ilupeju',
    'lagos state, ifako-ijaiye': 'ifako-ijaiye',
    'lagos state, kosofe': 'kosofe',
    'lagos state, magodo': 'magodo',
    'lagos state, ogudu': 'ogudu',
    'imo state, owerri': 'owerri',
    'kaduna state, kaduna / kaduna state': 'kaduna / kaduna state',
    'edo state, benin city': 'benin city',
    'ondo state, akure': 'akure',
    'lagos state, apapa': 'apapa',
    'lagos state, victoria island': 'victoria island'
}

df['car_city'] = df['car_city'].map(city_region)

In [None]:
current_year = datetime.now().year

In [None]:
##Calculate Car Age
current_year = datetime.now().year
df['car_age'] = current_year - df['manufacturing_year']


# Data Cleaning

In [None]:
df.columns

In [None]:
#handling missing values for mileage
df['mileage_km'] = df['mileage_km'].fillna(df['mileage_km'].mean())

In [None]:
df['engine_size'] = df['engine_size'].fillna(df['engine_size'].median())

In [None]:
df['selling_cond'] = df['selling_cond'].fillna(df['selling_cond'].ffill())

In [None]:
df['bought_cond'] = df['bought_cond'].fillna(df['bought_cond'].ffill())

In [None]:
df['transmission'] = df['transmission'].fillna(df['transmission'].ffill())

In [None]:
df['fuel_type'] = df['fuel_type'].fillna(df['fuel_type'].bfill())

In [None]:
df['color'] = df['color'].fillna(df['color'].ffill())

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
# Check the correlation between the price and other columns
correlation_matrix = df.corr(numeric_only=True)
print(correlation_matrix['car_price'].sort_values(ascending=False))

## One Hot Encoding

In [23]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(numerical_cols)

Index(['Unnamed: 0', 'car_price', 'manufacturing_year', 'mileage_km',
       'engine_size', 'car_age', 'region_akwa ibom', 'region_anambra',
       'region_delta', 'region_edo',
       ...
       'car_city_ojodu', 'car_city_onitsha', 'car_city_oshimili south',
       'car_city_owerri', 'car_city_port-harcourt', 'car_city_surulere',
       'car_city_uyo', 'car_city_victoria island', 'car_city_wuse',
       'car_city_yaba'],
      dtype='object', length=472)


In [24]:
categorical_cols = df.select_dtypes(include=['object']).columns
print(categorical_cols)

Index([], dtype='object')


In [25]:
endcoded_df = pd.get_dummies(df, columns=categorical_cols, drop_first=True).astype(int)
endcoded_df.head()

Unnamed: 0.1,Unnamed: 0,car_price,manufacturing_year,mileage_km,engine_size,car_age,region_akwa ibom,region_anambra,region_delta,region_edo,region_imo,region_kaduna,region_kwara,region_lagos,region_ogun,region_ondo,region_oyo,region_rivers,manufacturer_aston martin,manufacturer_audi,manufacturer_bmw,manufacturer_cadillac,manufacturer_changan,manufacturer_chevrolet,manufacturer_chrysler,manufacturer_citroen,manufacturer_daihatsu,manufacturer_dodge,manufacturer_dongfeng,manufacturer_ford,manufacturer_gac,manufacturer_genesis,manufacturer_haima,manufacturer_honda,manufacturer_hummer,manufacturer_hyundai,manufacturer_infiniti,manufacturer_isuzu,manufacturer_jaguar,manufacturer_jeep,manufacturer_kia,manufacturer_land rover,manufacturer_lexus,manufacturer_lincoln,manufacturer_maserati,manufacturer_mazda,manufacturer_mercedes-benz,manufacturer_mercury,manufacturer_mini,manufacturer_mitsubishi,manufacturer_nissan,manufacturer_opel,manufacturer_peugeot,manufacturer_pontiac,manufacturer_porsche,manufacturer_renault,manufacturer_rover,manufacturer_scion,manufacturer_subaru,manufacturer_suzuki,manufacturer_toyota,manufacturer_vauxhall,manufacturer_volkswagen,manufacturer_volvo,car_model_190e,car_model_200,car_model_206,car_model_207,car_model_230e,car_model_3,car_model_3 series,car_model_300,car_model_300c,car_model_301,car_model_307,car_model_308,car_model_318i,car_model_320i,car_model_323,car_model_323i,car_model_325i,car_model_328i,car_model_335i,car_model_4-runner,car_model_406,car_model_407,car_model_408,car_model_5,car_model_5 series,car_model_520i,car_model_525i,car_model_528i,car_model_530i,car_model_535i,car_model_550i,car_model_6,car_model_607,car_model_626,car_model_7 series,car_model_a-class,car_model_a3,car_model_a4,car_model_a6,car_model_accent,car_model_accord,car_model_accord crosstour,car_model_almera,car_model_alsvin,car_model_altima,car_model_alto,car_model_armada,car_model_astra,car_model_asx,car_model_avalon,car_model_avanza,car_model_avensis,car_model_aveo,car_model_azera,car_model_b-class,car_model_baleno,car_model_beetle,car_model_borrego,car_model_c-class,car_model_c180,car_model_c200,car_model_c220,car_model_c230,car_model_c240,car_model_c250,car_model_c270,car_model_c280,car_model_c300,car_model_c320,car_model_c350,car_model_c4,car_model_c400,car_model_caliber,car_model_camaro,car_model_camry,car_model_captiva,car_model_caravan,car_model_carisma,car_model_cayenne,car_model_cc,car_model_cerato,car_model_challenger,car_model_charger,car_model_cherokee,car_model_ciaz,car_model_city,car_model_civic,car_model_cla-class,car_model_clio,car_model_clk,car_model_cls,car_model_coaster,car_model_commander,car_model_compass,car_model_cooper,car_model_corolla,car_model_corolla verso,car_model_county,car_model_cr-v,car_model_creta,car_model_crosstour,car_model_cruze,car_model_cs35,car_model_csx,car_model_cts,car_model_cx-7,car_model_cx-9,car_model_db9,car_model_discovery,car_model_durango,car_model_dzire,car_model_e-150,car_model_e200,car_model_e220,car_model_e240,car_model_e250,car_model_e300,car_model_e320,car_model_e350,car_model_e400,car_model_eclipse cross,car_model_ecosport,car_model_edge,car_model_elantra,car_model_element,car_model_epica,car_model_equinox,car_model_ertiga,car_model_es,car_model_escalade,car_model_escape,car_model_every wagon,car_model_ex,car_model_explorer,car_model_f-150,car_model_fj cruiser,car_model_focus,car_model_forte,car_model_fortuner,car_model_freestyle,car_model_frontier,car_model_fusion,car_model_fx,car_model_fx35,car_model_g,car_model_g-class,car_model_g35,car_model_g80,car_model_ga3,car_model_ga4,car_model_galant,car_model_gl-class,car_model_gla 250,car_model_glc-class,car_model_gle-class,car_model_glk-class,car_model_gls-class,car_model_golf,car_model_golf gti,car_model_grand,car_model_grand caravan,car_model_grand cherokee,car_model_grand vitara,car_model_grandeur,car_model_gs,car_model_gs3,car_model_gs4,car_model_gs8,car_model_gx,car_model_h3,car_model_h300,car_model_harrier,car_model_hiace,car_model_highlander,car_model_hijet,car_model_hilux,car_model_impala,car_model_is,car_model_ix35,car_model_jazz,car_model_jetta,car_model_jimny,car_model_journey,car_model_kremer,car_model_l200,car_model_laguna,car_model_lancer,car_model_land,car_model_land cruiser,car_model_land cruiser prado,car_model_levante,car_model_liberty,car_model_lingzhi m3,car_model_lr2,car_model_lr3,car_model_lr4,car_model_ls,car_model_lx,car_model_m class,car_model_magnum,car_model_malibu,car_model_mariner,car_model_matrix,car_model_maxima,car_model_mdx,car_model_megane,car_model_micra,car_model_millenia,car_model_mohave,car_model_mpv,car_model_murano,car_model_mustang,car_model_navigator,car_model_np300,car_model_odyssey,car_model_optima,car_model_orlando,car_model_outback,car_model_outlander,car_model_pacifica,car_model_pajero,car_model_passat,car_model_pathfinder,car_model_pilot,car_model_polo,car_model_previa,car_model_primera,car_model_prius,car_model_q5,car_model_q50,car_model_q7,car_model_qashqai,car_model_quest,car_model_qx4,car_model_qx56,car_model_qx80,car_model_r-class,car_model_rabbit,car_model_ram,car_model_range rover,car_model_range rover evoque,car_model_range rover sport,car_model_range rover velar,car_model_range rover vogue,car_model_ranger,car_model_rav4,car_model_rdx,car_model_ridgeline,car_model_rio,car_model_rogue,car_model_rondo,car_model_routan,car_model_rx,car_model_rx 300,car_model_rx 330,car_model_rx 350,car_model_rx 400h,car_model_s-class,car_model_s-presso,car_model_s-type,car_model_s40,car_model_s5,car_model_s60,car_model_s80,car_model_santa fe,car_model_sc,car_model_sedona,car_model_sentra,car_model_sequoia,car_model_sharan,car_model_shuma,car_model_shuttle,car_model_sienna,car_model_slk-class,car_model_solara,car_model_sonata,car_model_sorento,car_model_soul,car_model_spacewagon,car_model_sportage,car_model_sprinter,car_model_stream,car_model_suburban,car_model_sunny,car_model_swift,car_model_sx4,car_model_tacoma,car_model_taurus,car_model_tc,car_model_tiburon,car_model_tiguan,car_model_tiida,car_model_titan,car_model_tl,car_model_torrent,car_model_touareg,car_model_touran,car_model_town&country,car_model_trajet,car_model_transit,car_model_transporter,car_model_traverse,car_model_trax,car_model_tribeca,car_model_tribute,car_model_trooper,car_model_tsx,car_model_tucson,car_model_tundra,car_model_uplander,car_model_v40,car_model_van,car_model_vectra,car_model_veloster,car_model_venza,car_model_veracruz,car_model_versa,car_model_verso,car_model_vibe,car_model_vitara,car_model_vivaro,car_model_wrangler,car_model_x-trail,car_model_x1,car_model_x3,car_model_x4,car_model_x5,car_model_x6,car_model_xc60,car_model_xc70,car_model_xc90,car_model_xj,car_model_xterra,car_model_yaris,car_model_z4,car_model_zafira,car_model_zdx,color_black,color_blue,color_brown,color_burgandy,color_gold,color_gray,color_green,color_ivory,color_matt black,color_off white,color_orange,color_pink,color_purple,color_red,color_silver,color_teal,color_white,color_yellow,car_condition_foreign used,car_condition_nigerian used,selling_cond_imported,selling_cond_registered,bought_cond_imported,bought_cond_registered,fuel_type_electric,fuel_type_hybrid,fuel_type_petrol,transmission_automatic,transmission_cvt,transmission_manual,car_city_ajah,car_city_akure,car_city_alimosho,car_city_amuwo-odofin,car_city_apapa,car_city_benin city,car_city_garki 2,car_city_gwarinpa,car_city_ibadan,car_city_ibeju,car_city_ifako-ijaiye,car_city_ikeja,car_city_ikorodu,car_city_ikotun/igando,car_city_ikoyi,car_city_ilorin east,car_city_ilupeju,car_city_ipaja,car_city_isolo,car_city_jabi,car_city_kaduna / kaduna state,car_city_katampe,car_city_kosofe,car_city_kubwa,car_city_lekki,car_city_lugbe district,car_city_magodo,car_city_mararaba,car_city_mushin,car_city_ogba,car_city_ogudu,car_city_ojodu,car_city_onitsha,car_city_oshimili south,car_city_owerri,car_city_port-harcourt,car_city_surulere,car_city_uyo,car_city_victoria island,car_city_wuse,car_city_yaba
0,0,12937500,2013,272474,3500,12,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,6750000,2012,102281,5000,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,50625000,2018,127390,5700,7,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,3600000,2007,139680,1800,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,3262500,2005,220615,3500,20,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Prepare data for modeling


In [26]:
from sklearn.model_selection import train_test_split

X = endcoded_df.drop('car_price', axis=1)
y = endcoded_df['car_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=87)

## Train and evaluate linear regression model

Train a Linear Regression model on the training data and evaluate its performance on the testing data.


In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Instantiate and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lr = lr_model.predict(X_test)

# Evaluate the model
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Print the evaluation metrics
print(f"Linear Regression Model Performance:")
print(f"Mean Absolute Error (MAE): {mae_lr}")
print(f"Mean Squared Error (MSE): {mse_lr}")
print(f"R-squared (R2): {r2_lr}")
print(f"Score: {lr_model.score(X_test, y_test)}")

Linear Regression Model Performance:
Mean Absolute Error (MAE): 2277878.110502962
Mean Squared Error (MSE): 25901007417696.836
R-squared (R2): 0.18719285509484795
Score: 0.18719285509484795


## Train and evaluate Decision Tree Regressor model

Train a Decision Tree Regressor model on the training data and evaluate its performance on the testing data.

In [28]:
from sklearn.tree import DecisionTreeRegressor

# Instantiate and train the Decision Tree Regressor model
dt_model = DecisionTreeRegressor(random_state=87)
dt_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

# Print the evaluation metrics
print(f"Decision Tree Regressor Model Performance:")
print(f"Mean Absolute Error (MAE): {mae_dt}")
print(f"Mean Squared Error (MSE): {mse_dt}")
print(f"R-squared (R2): {r2_dt}")
print(f"Score: {dt_model.score(X_test, y_test)}")

Decision Tree Regressor Model Performance:
Mean Absolute Error (MAE): 1735168.5587557603
Mean Squared Error (MSE): 12798179341039.936
R-squared (R2): 0.5983765634124598
Score: 0.5983765634124598


## Train and evaluate Random Forest Regressor model

Train a Random Forest Regressor model on the training data and evaluate its performance on the testing data.

In [29]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate and train the Random Forest Regressor model
rf_model = RandomForestRegressor(random_state=87)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print the evaluation metrics
print(f"Random Forest Regressor Model Performance:")
print(f"Mean Absolute Error (MAE): {mae_rf}")
print(f"Mean Squared Error (MSE): {mse_rf}")
print(f"R-squared (R2): {r2_rf}")
print(f"Score: {rf_model.score(X_test, y_test)}")

Random Forest Regressor Model Performance:
Mean Absolute Error (MAE): 1337624.4379608296
Mean Squared Error (MSE): 8361605221315.394
R-squared (R2): 0.7376020029970818
Score: 0.7376020029970818


## Train and evaluate Gradient Boosting Regressor model

Train a Gradient Boosting Regressor model on the training data and evaluate its performance on the testing data.

In [30]:
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate and train the Gradient Boosting Regressor model
gbr_model = GradientBoostingRegressor(random_state=87)
gbr_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gbr = gbr_model.predict(X_test)

# Evaluate the model
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

# Print the evaluation metrics
print(f"Gradient Boosting Regressor Model Performance:")
print(f"Mean Absolute Error (MAE): {mae_gbr}")
print(f"Mean Squared Error (MSE): {mse_gbr}")
print(f"R-squared (R2): {r2_gbr}")
print(f"Score: {gbr_model.score(X_test, y_test)}")

Gradient Boosting Regressor Model Performance:
Mean Absolute Error (MAE): 1390299.5986203447
Mean Squared Error (MSE): 7989663087408.271
R-squared (R2): 0.7492740286853343
Score: 0.7492740286853343


## Train and evaluate XGBoost Regressor model

Train an XGBoost Regressor model on the training data and evaluate its performance on the testing data.

In [31]:
import xgboost as xgb

# Instantiate and train the XGBoost Regressor model
xgb_model = xgb.XGBRegressor(random_state=87)
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Print the evaluation metrics
print(f"XGBoost Regressor Model Performance:")
print(f"Mean Absolute Error (MAE): {mae_xgb}")
print(f"Mean Squared Error (MSE): {mse_xgb}")
print(f"R-squared (R2): {r2_xgb}")
print(f"Score: {xgb_model.score(X_test, y_test)}")

ModuleNotFoundError: No module named 'xgboost'

## Compare model performance

Compare the performance of all trained models using appropriate evaluation metrics.

In [None]:
# Create a dictionary to store the performance metrics
performance_metrics = {
    'Model': ['Linear Regression', 'Decision Tree Regressor', 'Random Forest Regressor', 'Gradient Boosting Regressor', 'XGBoost Regressor'],
    'MAE': [mae_lr, mae_dt, mae_rf, mae_gbr, mae_xgb],
    'MSE': [mse_lr, mse_dt, mse_rf, mse_gbr, mse_xgb],
    'R2 Score': [r2_lr, r2_dt, r2_rf, r2_gbr, r2_xgb],
    'Score': [lr_model.score(X_test, y_test), dt_model.score(X_test, y_test), rf_model.score(X_test, y_test), gbr_model.score(X_test, y_test), xgb_model.score(X_test, y_test)]
}

# Create a DataFrame from the dictionary
performance_df = pd.DataFrame(performance_metrics)

# Print the performance DataFrame
print("Model Performance Comparison:")
display(performance_df)

### Saving the model

In [32]:
import pickle as pk

In [33]:
with open('random_forest_model.pkl', 'wb') as file:
    pk.dump(rf_model, file)

In [34]:
df.to_csv('cleaned_data.csv')

In [35]:
endcoded_df.to_csv('encoded_data.csv')