In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Load dataset

df = pd.read_csv("../vehicles_us.csv")


In [3]:
# Explore general information about the dataset
print(df.info())

# Display general statistics for the numerical columns
print()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB
None

               price    model_year     cylinders       odometer   is_4wd  \
count   51525.000000  47906.000000  46265.000000   43633.000000  25572.0   
mean    12132.464920   200

In [4]:
# Display the firsts rows of the data set

df.head()

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [5]:
# Normalize columns headers

df.columns = df.columns.str.lower().str.strip()


In [6]:
# Convert ['model_year', 'cylinders'] to int type 

df[['model_year', 'cylinders']] = df[['model_year', 'cylinders']].apply(
    lambda col: pd.to_numeric(col, errors='coerce').astype('Int64')
)

# Convert 'price' to float

df['price'] = pd.to_numeric(df['price'], errors='coerce').astype('float')

# Change 'date_posted' to date format

df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')


In [7]:
# Check missing values

print(df.isna().mean().sort_values(ascending=False))

is_4wd          0.503697
paint_color     0.179854
odometer        0.153168
cylinders       0.102086
model_year      0.070238
condition       0.000000
model           0.000000
price           0.000000
fuel            0.000000
type            0.000000
transmission    0.000000
date_posted     0.000000
days_listed     0.000000
dtype: float64


In [8]:
# Check the number of rows that will be lost before drop them
missing_before = df.isna().sum()
print("Missing values per column before drop:\n", missing_before)
print(f"\nRows before drop: {df.shape[0]}")
df = df.dropna()
print(f"Rows after drop: {df.shape[0]}")


Missing values per column before drop:
 price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

Rows before drop: 51525
Rows after drop: 14852


In [9]:
# Check dupplicates
duplicates = df.duplicated().sum()
print(f"Duplicated rows: {duplicates}")
df = df.drop_duplicates()


Duplicated rows: 0


In [10]:
# Check data types
print(df.dtypes)


price                  float64
model_year               Int64
model                   object
condition               object
cylinders                Int64
fuel                    object
odometer               float64
transmission            object
type                    object
paint_color             object
is_4wd                 float64
date_posted     datetime64[ns]
days_listed              int64
dtype: object


In [11]:
# Create histogram for vehicle prices
fig = px.histogram(df, x='price', nbins=50, title='Distribution of Vehicle Prices')
fig.show()

In [12]:
# Create a scatter plot

fig2 =  px.scatter(df, x='odometer', y='price', color='type', title='Price vs Odometer by Vehicle Type')
fig2.show()

In [13]:
df2 = pd.DataFrame({
    "Brand": ["Toyota", "Ford", "Honda"],
    "Sales": [100, 80, 90]
})

fig = px.bar(df2, x="Brand", y="Sales", title="Test Chart")
fig.show()