In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing

from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive inside the Colab runtime at /gdrive.
from google.colab import drive
drive.mount('/gdrive')
# IPython magic: switch the working directory to the dataset folder so the
# relative file name used when reading the CSV resolves against it.
%cd /gdrive/MyDrive/dataset/

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/dataset


In [3]:
## Load the raw listings. pd.read_csv already returns a DataFrame, so the
## result does not need to be wrapped in pd.DataFrame again.
## The data file was deleted because of lack of space. To get the original
## data, go to: https://www.kaggle.com/austinreese/craigslist-carstrucks-data
df = pd.read_csv('Copy of vehicles.csv')
df.head()  ## Data is uploaded on my Google Drive.

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


In [4]:
## Reorder the columns so that price comes last.
## Only the columns named here are kept by the reindex.
df = df.reindex(
    [
        'id', 'url', 'region', 'region_url',
        'year', 'manufacturer', 'model', 'condition',
        'cylinders', 'fuel', 'odometer', 'title_status',
        'transmission', 'vin', 'drive', 'size',
        'type', 'paint_color', 'image_url', 'description',
        'county', 'state', 'lat', 'long',
        'price',
    ],
    axis='columns',
)

# Removing useless features

In [5]:
# Copy the frame, then drop the columns listed below from the copy.
df2 = df.copy().drop(
    columns=[
        'url',
        'region_url',
        'vin',
        'image_url',
        'description',
        'county',
        'state',
    ]
)
df2.head()

Unnamed: 0,id,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,lat,long,price
0,7222695916,prescott,,,,,,,,,,,,,,,,6000
1,7218891961,fayetteville,,,,,,,,,,,,,,,,11900
2,7221797935,florida keys,,,,,,,,,,,,,,,,21000
3,7222270760,worcester / central MA,,,,,,,,,,,,,,,,1500
4,7210384030,greensboro,,,,,,,,,,,,,,,,4900


# 2) Checking for NULL Values

In [6]:
# Number of missing values in each column of df2.
df2.isna().sum()

id                   0
region               0
year              1205
manufacturer     17646
model             5277
condition       174104
cylinders       177678
fuel              3013
odometer          4400
title_status      8242
transmission      2556
drive           130567
size            306361
type             92858
paint_color     130203
lat               6549
long              6549
price                0
dtype: int64

In [7]:
## Column names split by kind: categorical features first, numeric features second.
categorical_cols = [
    'region',
    'manufacturer',
    'model',
    'condition',
    'cylinders',
    'fuel',
    'title_status',
    'transmission',
    'drive',
    'size',
    'type',
    'paint_color',
]
numerical_cols = [
    'year',
    'odometer',
    'lat',
    'long',
]

In [8]:
# Show the first five rows of df2.
df2.head(n=5)

Unnamed: 0,id,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,lat,long,price
0,7222695916,prescott,,,,,,,,,,,,,,,,6000
1,7218891961,fayetteville,,,,,,,,,,,,,,,,11900
2,7221797935,florida keys,,,,,,,,,,,,,,,,21000
3,7222270760,worcester / central MA,,,,,,,,,,,,,,,,1500
4,7210384030,greensboro,,,,,,,,,,,,,,,,4900


In [9]:
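## The categorical data seems to have some order in it, e.g. more cylinders would result in a higher price. Hence we are using LabelEncoder.
## Note: LabelEncoder numbers the classes in sorted order of their labels, which is not necessarily their meaningful order.
## Otherwise we could have used one-hot encoding.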

# Independent copy of df2 with the id column removed; this is the frame
# whose categorical columns get encoded.
check_imputer = df2.copy().drop(columns=['id'])

# One LabelEncoder instance, used by encode() for every column.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

## l = encoder.fit_transform(check_imputer[categorical_cols])
## test = l.transform(check_imputer[categorical_cols]) 

## Running a function because of reshaping issues 

# Encode one categorical column as integers, leaving the NaN values untouched.
def encode(data):
    """Label-encode the non-null entries of a Series in place.

    Parameters
    ----------
    data : pandas.Series
        Column to encode. Its non-null entries are overwritten with the
        integer codes produced by the module-level ``encoder``; null
        entries are left as they are.

    Returns
    -------
    pandas.Series
        The same ``data`` object, after modification.

    Notes
    -----
    ``encoder`` is refit on every call, so afterwards it only holds the
    classes of the column that was passed last.
    """
    present = data.notnull()
    # LabelEncoder works on 1-D label arrays, so hand it the values as they
    # are instead of reshaping them into an (n, 1) column vector.
    labels = data[present].to_numpy()
    codes = encoder.fit_transform(labels)
    # Write the codes back onto exactly the rows the labels came from.
    data.loc[present] = codes
    return data

## Label each categorical column.
## The encoded Series is assigned back to the frame explicitly: relying on
## encode() to change check_imputer through the Series returned by
## check_imputer[col] only works when that Series is a view of the frame,
## which pandas does not guarantee.
for col in categorical_cols:
    check_imputer[col] = encode(check_imputer[col])


In [10]:
# Show the first five rows of the encoded frame.
check_imputer.head(n=5)

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,lat,long,price
0,277,,,,,,,,,,,,,,,,6000
1,105,,,,,,,,,,,,,,,,11900
2,111,,,,,,,,,,,,,,,,21000
3,396,,,,,,,,,,,,,,,,1500
4,131,,,,,,,,,,,,,,,,4900


In [11]:
# Missing-value counts for the numeric columns only.
df2[numerical_cols].isna().sum()

year        1205
odometer    4400
lat         6549
long        6549
dtype: int64

In [18]:
## Drop any row in which every column is NaN.
df2 = df2.dropna(how='all')
## Fill missing odometer values with the mean odometer value. The filled column
## is assigned back explicitly: fillna(..., inplace=True) called on
## df2['odometer'] acts on a temporary Series and is not guaranteed to update df2.
df2 = df2.assign(odometer=df2['odometer'].fillna(df2['odometer'].mean()))
## Drop rows with a missing lat or long. It doesn't make sense for me to use a
## regression model to predict missing latitude and longitude for now.
## Might implement a latitude and longitude prediction model later!
df2 = dataset = df2.dropna(subset=['lat', 'long'], how='any')
## NOTE(review): the original note here said "Dropping year too!", but no
## statement in this cell drops rows with a missing year.


In [21]:
# Dimensions of df2 as (rows, columns).
df2.shape

(420331, 18)

In [20]:
# Missing values left in the numeric columns after filling
df2[numerical_cols].isna().sum()

year        1137
odometer       0
lat            0
long           0
dtype: int64