## Exploratory Data Analysis

In [1]:
import pandas as pd
import plotly.express as px

In [4]:
#load the raw vehicle listings into a dataframe
#NOTE(review): this is an absolute, machine-specific path - the cell only runs where this exact location exists
csv_path = r'C:\Users\trkjr\OneDrive\Code_Learning_Master\vehicles_project\vehicles_us.csv'
df = pd.read_csv(csv_path)

In [147]:
#look at ten random rows, then the dtypes and non-null counts, to spot what needs fixing
sample_rows = df.sample(10)
display(sample_rows)
df.info()

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
38525,11950,,subaru forester,excellent,4.0,gas,69822.0,automatic,SUV,,1.0,2019-04-18,75
26179,5995,,volkswagen jetta,excellent,5.0,gas,127692.0,automatic,sedan,,,2019-01-22,44
50549,1500,1998.0,ford expedition,good,8.0,gas,220000.0,automatic,SUV,black,,2018-07-31,46
17902,7800,2014.0,nissan maxima,excellent,8.0,gas,90000.0,automatic,sedan,purple,,2018-12-14,41
34708,1200,2003.0,ford focus se,fair,4.0,gas,167000.0,automatic,sedan,blue,,2018-11-19,29
23711,6995,2011.0,gmc sierra 1500,excellent,6.0,gas,188571.0,automatic,pickup,white,,2019-02-26,57
12721,10990,2016.0,nissan rogue,excellent,4.0,gas,122846.0,automatic,SUV,,,2018-11-30,9
38364,7950,2007.0,ram 1500,excellent,8.0,gas,131550.0,automatic,pickup,red,,2019-01-22,103
10424,5000,2001.0,ford f-150,like new,8.0,gas,,automatic,pickup,red,1.0,2018-11-11,86
41982,9588,2016.0,ford focus,good,3.0,gas,96711.0,automatic,sedan,black,,2018-06-09,18


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [148]:
#The model field holds both the brand and the model name (e.g. "ford f-150"), so split it up
#to be able to build visualizations or drop downs based on the brand.
#n=4 caps the split at five pieces (any extra words stay together in the last piece) and
#reindex pads missing pieces with NaN, so the five-column assignment below cannot fail on a
#shape mismatch when the longest model name has fewer or more than five words.
model_parts = df['model'].str.split(' ', n=4, expand=True).reindex(columns=range(5))
df[['make','model','filler','filler1','filler2']] = model_parts

In [149]:
#combine the last 4 columns back into the model column
#missing pieces are replaced with '' before joining; the final strip removes the trailing
#spaces those empty pieces would otherwise leave behind (e.g. 'f-150   ' instead of 'f-150')
df['model'] = (
    df['model']
    + ' ' + df['filler'].fillna('')
    + ' ' + df['filler1'].fillna('')
    + ' ' + df['filler2'].fillna('')
).str.strip()

In [150]:
#drop the filler columns
#errors='ignore' lets this cell be re-run after the columns are already gone, and reassigning
#the result (instead of inplace=True) keeps the step explicit
df = df.drop(columns=['filler','filler1','filler2'], errors='ignore')

In [151]:
#spot-check five random rows of the reshaped dataframe
df.sample(n=5)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make
44133,17900,2008.0,silverado 2500hd,good,8.0,diesel,207408.0,automatic,truck,blue,1.0,2019-01-06,26,chevrolet
48192,7900,2009.0,highlander,excellent,4.0,gas,175000.0,automatic,SUV,grey,,2018-07-16,14,toyota
21702,3000,2002.0,escalade,fair,8.0,gas,130000.0,automatic,SUV,black,1.0,2018-06-05,8,cadillac
4935,48000,2016.0,escalade,excellent,8.0,gas,43870.0,automatic,SUV,,1.0,2018-11-02,16,cadillac
45072,1,2018.0,f-150,excellent,,gas,18362.0,automatic,truck,blue,1.0,2018-10-20,94,ford


##### Missing Values
###### - A good number of model_year values are NaN. I don't love the idea of filling them with 0, but I would like to convert the column to int, and I can always exclude the 0s from visualizations later on.
###### - There are a lot of NaN cylinder values as well. We can replace these with a typical cylinder count for each type of car (the rounded average for that type, adjusted to a cylinder count that exists in the data). I will remove this if needed.
###### - There are lots of missing odometer values. I will replace these with 0.
###### - There are missing paint_colors. I can fill these with 'black', but I think it's fine to leave them as NaN
###### - There are tens of thousands of missing is_4wd values. As the check of this field's unique values below shows, it is either 1 or NaN, so I will fill the missing values with 0.

In [152]:
#list the distinct values that is_4wd takes
four_wd_values = df['is_4wd'].unique()
print(four_wd_values)

[ 1. nan]


In [153]:
#missing is_4wd entries become 0, then the column is stored as integers
is_4wd_filled = df['is_4wd'].fillna(0)
df['is_4wd'] = is_4wd_filled.astype(int)

In [154]:
#missing odometer readings become 0, then the column is stored as integers
odometer_filled = df['odometer'].fillna(0)
df['odometer'] = odometer_filled.astype(int)

In [155]:
#missing model years become 0 (a placeholder to filter out later), then the column is stored as integers
model_year_filled = df['model_year'].fillna(0)
df['model_year'] = model_year_filled.astype(int)

In [156]:
#print the distinct values of cylinders and of type to see which defaults need to be assigned
for column_name in ('cylinders', 'type'):
    print(df[column_name].unique())

[ 6.  4.  8. nan  5. 10.  3. 12.]
['SUV' 'pickup' 'sedan' 'truck' 'coupe' 'van' 'convertible' 'hatchback'
 'wagon' 'mini-van' 'other' 'offroad' 'bus']


In [157]:
#average number of cylinders per vehicle type, rounded to the nearest whole number
df_cyl_avg = df.loc[:, ['type','cylinders']]
df_cyl_avg.groupby('type').mean().round()

#Convertibles are rounded down to 6 and pickups + trucks up to 8, because no 7 cylinder engines exist in the data; buses go to 10 because 9 doesn't exist.

Unnamed: 0_level_0,cylinders
type,Unnamed: 1_level_1
SUV,6.0
bus,9.0
convertible,7.0
coupe,6.0
hatchback,4.0
mini-van,6.0
offroad,6.0
other,6.0
pickup,7.0
sedan,5.0


In [158]:
#default number of cylinders for each car type; any type not listed here (e.g. 'other') falls back to 6
_DEFAULT_CYLINDERS_BY_TYPE = {
    'SUV': 6,
    'bus': 10,
    'convertible': 6,
    'coupe': 6,
    'hatchback': 4,
    'mini-van': 6,
    'offroad': 6,
    'pickup': 8,
    'sedan': 5,
    'truck': 8,
    'van': 6,
    'wagon': 5,
}


def cyl(x):
    """Return the default cylinder count for the car type ``x``.

    Car types without an entry in the lookup table (such as 'other')
    get 6.
    """
    return _DEFAULT_CYLINDERS_BY_TYPE.get(x, 6)

In [159]:
#look up the default cylinder count for every row's type, use it only where cylinders is missing,
#then store the column as integers
default_cylinders = df['type'].map(cyl)
df['cylinders'] = df['cylinders'].fillna(default_cylinders).astype(int)

##### Data Types

In [160]:
#store price as a floating point column
df['price'] = df['price'].astype('float64')

In [161]:
#re-check a random sample plus the dtypes and non-null counts after cleaning
cleaned_sample = df.sample(10)
display(cleaned_sample)
df.info()

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make
40705,2650.0,2005,civic,good,4,gas,0,automatic,coupe,grey,0,2018-07-27,101,honda
26034,3000.0,0,accord,excellent,6,gas,200000,automatic,sedan,brown,1,2018-07-12,7,honda
44914,8995.0,2007,f-150,excellent,8,gas,122030,automatic,pickup,,1,2018-12-28,71,ford
13775,5995.0,2012,sentra,excellent,4,gas,110000,automatic,sedan,blue,0,2018-07-11,17,nissan
40557,7991.0,2011,f-150,good,8,gas,265961,automatic,truck,grey,1,2019-03-25,30,ford
14974,3999.0,2011,f150,good,8,gas,218000,automatic,pickup,,0,2018-10-20,34,ford
42789,34500.0,0,silverado 3500hd,excellent,8,diesel,82237,automatic,SUV,white,1,2018-08-01,30,chevrolet
2755,18500.0,2016,explorer,excellent,6,gas,60016,automatic,SUV,brown,1,2018-07-17,36,ford
47662,6995.0,2005,outback,excellent,4,gas,113000,automatic,wagon,green,1,2019-04-09,37,subaru
39203,7988.0,2005,forester,excellent,4,gas,119964,automatic,SUV,silver,1,2019-02-09,12,subaru


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  float64
 1   model_year    51525 non-null  int32  
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51525 non-null  int32  
 5   fuel          51525 non-null  object 
 6   odometer      51525 non-null  int32  
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        51525 non-null  int32  
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
 13  make          51525 non-null  object 
dtypes: float64(1), int32(4), int64(1), object(8)
memory usage: 4.7+ MB
