In [2]:
import pandas as pd

# Read the raw Craigslist vehicle listings into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/vehicles.csv')

# Report the frame's dimensions as a (rows, columns) tuple
print("Rows and columns:", (len(df.index), len(df.columns)))

# Display the first five records as this cell's output
df.head(5)

Rows and columns: (426880, 26)


Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


In [3]:
# List every column label in the order it appears in the frame
print("Column names:\n", list(df.columns))

# Count nulls per column and show the sparsest columns first
print("\nMissing values per column:\n")
print(df.isna().sum().sort_values(ascending=False))

# Report the dtype pandas assigned to each column
print("\nData types:\n")
print(df.dtypes)

Column names:
 ['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status', 'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color', 'image_url', 'description', 'county', 'state', 'lat', 'long', 'posting_date']

Missing values per column:

county          426880
size            306361
cylinders       177678
condition       174104
VIN             161042
drive           130567
paint_color     130203
type             92858
manufacturer     17646
title_status      8242
lat               6549
long              6549
model             5277
odometer          4400
fuel              3013
transmission      2556
year              1205
description         70
posting_date        68
image_url           68
region_url           0
url                  0
id                   0
region               0
price                0
state                0
dtype: int64

Data types:

id                int64
url              object


In [4]:
# Subset of columns carried forward into the analysis
columns_to_keep = [
    'price', 'year', 'manufacturer', 'model', 'condition',
    'cylinders', 'fuel', 'odometer', 'title_status',
    'transmission', 'drive', 'type', 'paint_color', 'state',
]

# Build the cleaned frame in one pass, leaving the raw `df` untouched:
#   1. project onto the analysis columns,
#   2. discard rows lacking any of the critical fields,
#   3. keep only listings priced above 500 with fewer than 500000 on the
#      odometer (thresholds chosen to drop clearly bad entries).
# Original row labels are preserved (no index reset).
# NOTE(review): there is no upper bound on price here -- confirm that
# extreme prices are handled downstream if they matter.
df_clean = (
    df.loc[:, columns_to_keep]
    .dropna(subset=['price', 'year', 'model', 'odometer'])
    .loc[lambda listings: listings['price'].gt(500) & listings['odometer'].lt(500000)]
)

# Report the size of the cleaned frame
print("Cleaned dataset shape:", df_clean.shape)

# Display the first five cleaned records as this cell's output
df_clean.head(5)

Cleaned dataset shape: (374975, 14)


Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state
27,33590,2014.0,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,57923.0,clean,other,,pickup,white,al
28,22590,2010.0,chevrolet,silverado 1500,good,8 cylinders,gas,71229.0,clean,other,,pickup,blue,al
29,39590,2020.0,chevrolet,silverado 1500 crew,good,8 cylinders,gas,19160.0,clean,other,,pickup,red,al
30,30990,2017.0,toyota,tundra double cab sr,good,8 cylinders,gas,41124.0,clean,other,,pickup,red,al
31,15000,2013.0,ford,f-150 xlt,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,truck,black,al


In [5]:
# Write the cleaned listings to the data folder.
# index=False leaves the DataFrame's row labels out of the CSV.
df_clean.to_csv(path_or_buf='../data/vehicles_cleaned.csv', index=False)

print("Cleaned dataset saved to '../data/vehicles_cleaned.csv'")

Cleaned dataset saved to '../data/vehicles_cleaned.csv'


In [6]:
import sqlite3

# Connect to (or create) the database
conn = sqlite3.connect('../db/car_reference.db')

try:
    # Save the cleaned DataFrame to a table; if_exists='replace' drops any
    # previous 'vehicle_listings' table so re-running the cell does not
    # append duplicate rows. index=False keeps the row labels out of the table.
    df_clean.to_sql('vehicle_listings', conn, if_exists='replace', index=False)

    # Confirm
    print("Data saved to 'vehicle_listings' table in car_reference.db")

    # Optional: check how many records were written
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM vehicle_listings")
    print("Number of records:", cursor.fetchone()[0])
finally:
    # Close the connection even when the write or the count query raises,
    # so a failed run does not leave an open connection behind in the kernel.
    conn.close()

Data saved to 'vehicle_listings' table in car_reference.db
Number of records: 374975


In [7]:
# Re-open the database to inspect the schema pandas generated for the table
conn = sqlite3.connect('../db/car_reference.db')

try:
    cursor = conn.cursor()

    # Show all columns in the table. PRAGMA table_info yields one row per
    # column; per the SQLite docs the fields are
    # (cid, name, type, notnull, dflt_value, pk).
    cursor.execute("PRAGMA table_info(vehicle_listings);")
    columns = cursor.fetchall()
finally:
    # Release the connection even if the PRAGMA query fails; the rows are
    # already materialised by fetchall(), so printing can happen afterwards.
    conn.close()

for col in columns:
    print(col)

(0, 'price', 'INTEGER', 0, None, 0)
(1, 'year', 'REAL', 0, None, 0)
(2, 'manufacturer', 'TEXT', 0, None, 0)
(3, 'model', 'TEXT', 0, None, 0)
(4, 'condition', 'TEXT', 0, None, 0)
(5, 'cylinders', 'TEXT', 0, None, 0)
(6, 'fuel', 'TEXT', 0, None, 0)
(7, 'odometer', 'REAL', 0, None, 0)
(8, 'title_status', 'TEXT', 0, None, 0)
(9, 'transmission', 'TEXT', 0, None, 0)
(10, 'drive', 'TEXT', 0, None, 0)
(11, 'type', 'TEXT', 0, None, 0)
(12, 'paint_color', 'TEXT', 0, None, 0)
(13, 'state', 'TEXT', 0, None, 0)
