In [1]:
# Set the working directory to the directory of this source file so that the
# relative CSV paths used in this notebook resolve next to it.
# NOTE(review): getSrcDirectory() depends on source references being kept for
# the anonymous function; if none are available the result may be empty and
# `[1]` would yield NA -- confirm this works where the notebook is run.
setwd(getSrcDirectory(function(){})[1])

# dataClean Notebook
The purpose of this file is to define a function which cleans the data by removing columns. It may be expanded to increase the scope. It requires the following files:
<ul>
    <li> 01-vehicles-download.csv </li>
</ul>
It will produce the following files
<ul>
    <li> 02-vehicles-clean.csv </li>
</ul>
    

In [2]:
#Import Necessary libraries
library('tidyverse')
library('dplyr')
library('forcats')
# Load the downloaded vehicles data; the first line of the CSV is the header
vehicles <- read.csv(file = './01-vehicles-download.csv', header = TRUE)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.5
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.1.0
[32m✔[39m [34mtidyr  [39m 1.2.1     [32m✔[39m [34mstringr[39m 1.4.1
[32m✔[39m [34mreadr  [39m 2.1.3     [32m✔[39m [34mforcats[39m 0.5.2
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


## cleanData Function
The cleanData function takes the following inputs:
<ul>
    <li> df: the dataframe to be cleaned </li>
    <li> vars: a character vector of column names to drop </li>
</ul>
The cleanData function creates a new dataframe object which is a copy of the original data frame with the following changes:
<ul>
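    <li> it does not contain the columns in the vars list </li>
    <li> prices are strictly between 1000 and 200000 </li>
    <li> observations with placeholder-style prices (1234, 12345, 123456, 54321) are dropped </li>
    <li> observations with an odometer reading of 500000 or more are dropped </li>
    <li> rows with missing values are dropped </li>
    <li> character columns are converted to factors </li>
    <li> duplicate rows are dropped </li>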

</ul>

In [50]:
# cleanData: return a cleaned copy of a vehicles data frame.
#
# Arguments:
#   df   - data frame to clean; must contain `price` and `odometer` columns.
#   vars - character vector of column names to remove from `df`.
#
# Returns:
#   A data frame without the `vars` columns, restricted to rows with
#   1000 < price < 200000, price not one of the placeholder values
#   1234 / 12345 / 123456 / 54321, odometer < 500000 and no missing values,
#   with character columns converted to factors and duplicate rows removed.
#
# Side effect:
#   The result is also assigned to the global `cleanedVehicles`. This is kept
#   only so existing code that reads that global keeps working; new code
#   should use the return value instead.
cleanData <- function(df, vars) {
    # drop = FALSE keeps a data frame even if only one column survives;
    # without it the selection collapses to a vector and filter() fails.
    cleaned <- df[, !(names(df) %in% vars), drop = FALSE] %>%
        filter(price > 1000, price < 200000) %>% # keep prices in (1000, 200000)
        filter(!(price %in% c(1234, 12345, 123456, 54321))) %>% # remove prices following a pattern
        filter(odometer < 500000) %>% # remove odometer readings of 500000 or more
        drop_na() %>% # drop rows with missing values
        mutate(across(where(is.character), as.factor)) %>% # factorizes <chr> variables
        distinct() # drops duplicate rows

    # Backward compatibility: earlier versions published the result globally.
    cleanedVehicles <<- cleaned

    cleaned
}

## Applying the cleanData function and saving the data
The above function is applied to our dataframe, then the data is saved

In [51]:
# Names to exclude from the cleaned data set; cleanData drops every column of
# `vehicles` whose name appears in this vector.
filteredVars <- c(
    'id', 'url', 'region', 'region_url', 'VIN', 'image_url', 'description',
    'lat', 'long', 'size', 'county', 'posting_date', 'model', 'clean'
)

# Run the cleaning step and keep the returned data frame.
cleanVehicles <- cleanData(df = vehicles, vars = filteredVars)

In [52]:
#save the data
# Write the data frame returned by cleanData() (held in cleanVehicles) rather
# than a global that only exists as a side effect of calling the function, so
# the file written and the preview below come from the same object.
write.csv(cleanVehicles, '02-vehicles-clean.csv', row.names = FALSE)
head(cleanVehicles)

Unnamed: 0_level_0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state
Unnamed: 0_level_1,<dbl>,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,33590,2014,gmc,good,8 cylinders,gas,57923,clean,other,,pickup,white,al
2,22590,2010,chevrolet,good,8 cylinders,gas,71229,clean,other,,pickup,blue,al
3,39590,2020,chevrolet,good,8 cylinders,gas,19160,clean,other,,pickup,red,al
4,30990,2017,toyota,good,8 cylinders,gas,41124,clean,other,,pickup,red,al
5,15000,2013,ford,excellent,6 cylinders,gas,128000,clean,automatic,rwd,truck,black,al
6,27990,2012,gmc,good,8 cylinders,gas,68696,clean,other,4wd,pickup,black,al


In [75]:
# savePriceHist: draw a histogram of `values` into a JPEG file at `path`.
#
# Arguments:
#   values - numeric vector to plot.
#   path   - output file name passed to jpeg().
#   xlab   - x-axis label.
#   main   - plot title.
#
# Returns `path` invisibly.
savePriceHist <- function(values, path, xlab, main) {
    jpeg(path)
    # Close the device even if hist() fails, so a failed plot cannot leave the
    # JPEG device open and capture later plots.
    on.exit(dev.off(), add = TRUE)
    hist(values, xlab = xlab, main = main)
    invisible(path)
}

# Distribution of price on the original scale and on the log scale.
savePriceHist(cleanVehicles$price, 'priceHist.jpg',
              xlab = 'Price', main = 'Distribution of Price')
savePriceHist(log(cleanVehicles$price), 'logPriceHist.jpg',
              xlab = 'log(Price)', main = 'Distribution of log(Price)')