### Used Vehicle Information for Sale (US) ####

In [5]:
import pandas as pd
import plotly.express as plt
import streamlit as st
import numpy as np
import matplotlib.pyplot as mplt
from matplotlib.backends.backend_agg import FigureCanvasAgg



In [6]:
### Used vehicle data analysis and exploration
## Read the sample data set of used-vehicle sale listings.
# `date_posted` is parsed into a datetime column using the `%Y-%m-%d` format.
df = pd.read_csv("..//vehicles_us.csv", parse_dates=['date_posted'], date_format='%Y-%m-%d')

## Show the column index, the schema / non-null summary, and the first rows.
display(df.columns)
# `DataFrame.info()` prints its report itself and returns None, so it must not be
# wrapped in `display()` -- doing so renders a stray "None" below the report.
df.info()
display(df.head())


Index(['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel',
       'odometer', 'transmission', 'type', 'paint_color', 'is_4wd',
       'date_posted', 'days_listed'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         51525 non-null  int64         
 1   model_year    47906 non-null  float64       
 2   model         51525 non-null  object        
 3   condition     51525 non-null  object        
 4   cylinders     46265 non-null  float64       
 5   fuel          51525 non-null  object        
 6   odometer      43633 non-null  float64       
 7   transmission  51525 non-null  object        
 8   type          51525 non-null  object        
 9   paint_color   42258 non-null  object        
 10  is_4wd        25572 non-null  float64       
 11  date_posted   51525 non-null  datetime64[ns]
 12  days_listed   51525 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(2), object(6)
memory usage: 5.1+ MB


None

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [7]:
# Show every row that is an exact repeat of an earlier row
# (the first occurrence of each row is not flagged).
display(df.loc[df.duplicated(keep='first')])
# (No duplicates found)



Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed


In [8]:
## Handling missing data and data transformation

def _fill_with_group_median(frame, column, by):
    """Return ``frame[column]`` with NaNs replaced by the median of their ``by`` group.

    A group whose values are all missing has no median, so its rows would still be
    NaN after the group-level fill; those leftovers fall back to the overall median
    of the column.
    """
    group_median = frame.groupby(by)[column].transform('median')
    return frame[column].fillna(group_median).fillna(frame[column].median())


# Fill NaN values for is_4wd with 0
df['is_4wd'] = df['is_4wd'].fillna(0)

# Fill NaN values for paint_color with the placeholder 'Unknown'
df['paint_color'] = df['paint_color'].fillna('Unknown')

# Fill NaN values for model_year with the median model_year of the same model
df['model_year'] = _fill_with_group_median(df, 'model_year', 'model')

# Fill NaN values for odometer with the median odometer of the same condition
df['odometer'] = _fill_with_group_median(df, 'odometer', 'condition')

# Fill NaN values for cylinders with the median cylinders of the same type
df['cylinders'] = _fill_with_group_median(df, 'cylinders', 'type')

# Split the model column at its first space into make and model_series columns;
# a model without a space gets an empty model_series.
df[['make', 'model_series']] = df['model'].str.split(' ', n=1, expand=True).fillna('')

In [9]:
## Data Visualization
# This histogram shows the count of vehicles available in each price range,
# coloured by vehicle make.

#st.write("Histogram of Vehicle count available  price range for each vehicle make")
# nbins was previously 2, which lumped the prices into a single 0 - 190k bucket per
# make and hid the shape of the distribution; 50 bins give usable price ranges.
hist = plt.histogram(df, x='price', color='make', nbins=50,
                     color_discrete_sequence=plt.colors.qualitative.Plotly,
                     title='Vehicle Price histogram')
hist.show()

## Conclusions :
## Ford listings in the 0 - 190k price range: around 12600
## Chevrolet listings in the 0 - 190k price range: around 10600
## NOTE(review): these counts were read from the earlier 2-bin version of this chart;
## re-check them against the finer bins.


# Display the plot in Streamlit
#st.plotly_chart(hist)


In [10]:
# Ford listings priced from 6500 to 6999, both ends included.
df.loc[df['make'].eq('ford') & df['price'].between(6500, 6999)]

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_series
97,6500,2012.0,ford fusion se,excellent,6.0,gas,90000.0,automatic,sedan,grey,0.0,2018-09-20,94,ford,fusion se
148,6995,2004.0,ford expedition,excellent,8.0,gas,92546.0,automatic,SUV,blue,1.0,2018-05-23,18,ford,expedition
193,6995,2009.0,ford ranger,good,4.0,gas,129000.0,automatic,pickup,white,0.0,2019-03-31,130,ford,ranger
240,6500,2009.0,ford f-250,good,8.0,gas,113000.0,automatic,pickup,white,1.0,2018-09-09,19,ford,f-250
522,6995,2013.0,ford focus,good,4.0,gas,85742.0,automatic,hatchback,white,0.0,2018-12-16,24,ford,focus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50589,6500,2014.0,ford escape,like new,4.0,gas,140000.0,automatic,SUV,grey,0.0,2018-12-10,27,ford,escape
50703,6800,2005.0,ford f-150,excellent,8.0,gas,104230.0,automatic,pickup,black,1.0,2018-07-04,42,ford,f-150
50845,6999,2008.0,ford f250 super duty,good,8.0,gas,103000.0,automatic,pickup,silver,0.0,2018-08-06,48,ford,f250 super duty
51030,6988,2010.0,ford ranger,excellent,4.0,gas,188100.0,automatic,truck,white,0.0,2018-12-19,26,ford,ranger


In [11]:
## Data Visualization
# Scatter plot of price (x axis) against odometer (y axis), one colour per make.
#st.write("Scatter plot of Price vs Odometer:")

scatter_plt = plt.scatter(
    data_frame=df,
    x='price',
    y='odometer',
    color='make',
    title='Vehicle Price vs Odometer Scatter Plot',
    color_discrete_sequence=plt.colors.qualitative.Plotly,
)
scatter_plt.show()

## Conclusions :
### Most vehicles with an odometer value under 500K are in the $2K to $3K price range
### Vehicles with (presumably high) odometer values have a low price range
### NOTE(review): confirm the second conclusion against the plot

# Display the plot in Streamlit
#st.plotly_chart(scatter_plt)


In [12]:


## Data Visualization
# This box plot gives an idea of the median, upper-quartile and lower-quartile
# prices for each vehicle type, with a separate box per make.
#st.write("Box plot of Price vs type:")

box_plt = plt.box(
    data_frame=df,
    x='price',
    y='type',
    color='make',
    title='Vehicle Price distribution by type',
    color_discrete_sequence=plt.colors.qualitative.Plotly,
)
box_plt.show()
## Conclusions :
## A Nissan pickup truck has the highest price, 375K
## Most vehicles are priced under $65K

# Display the plot in Streamlit
#st.plotly_chart(box_plt)

