In [1]:
import pandas as pd
from datetime import datetime

# Load cleaned dataset
df = pd.read_csv("vehicles_cleaned.csv")

# Set constants
USD_TO_CAD = 1.35
MILES_TO_KM = 1.60934

# Convert price and odometer
df['price'] = (df['price'] * USD_TO_CAD).round(0).astype("Int64")
df['odometer'] = (df['odometer'] * MILES_TO_KM).round(0).astype("Int64")

# Normalize text fields
text_fields = ['manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'transmission', 'drive', 'size', 'type', 'paint_color']
available_fields = [col for col in text_fields if col in df.columns]

for col in available_fields:
    df[col] = df[col].astype(str).str.strip().str.lower().replace("nan", None)

# Drop irrelevant columns
columns_to_drop = ['url', 'region_url', 'image_url', 'county', 'VIN']
available_to_drop = [col for col in columns_to_drop if col in df.columns]
df.drop(columns=available_to_drop, inplace=True)

# Add created_at
df['created_at'] = datetime.now().strftime("%Y-%m-%d")

# Preview
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,created_at
0,45346,2014.0,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,93218,clean,other,,pickup,white,al,2025-06-01
1,30497,2010.0,chevrolet,silverado 1500,good,8 cylinders,gas,114632,clean,other,,pickup,blue,al,2025-06-01
2,53446,2020.0,chevrolet,silverado 1500 crew,good,8 cylinders,gas,30835,clean,other,,pickup,red,al,2025-06-01
3,41836,2017.0,toyota,tundra double cab sr,good,8 cylinders,gas,66182,clean,other,,pickup,red,al,2025-06-01
4,20250,2013.0,ford,f-150 xlt,excellent,6 cylinders,gas,205996,clean,automatic,rwd,truck,black,al,2025-06-01


In [2]:
original_df = pd.read_csv("vehicles.csv")
original_df.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
       'image_url', 'description', 'county', 'state', 'lat', 'long',
       'posting_date'],
      dtype='object')

In [3]:
original_df = pd.read_csv("vehicles.csv")
original_df.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
       'image_url', 'description', 'county', 'state', 'lat', 'long',
       'posting_date'],
      dtype='object')

In [4]:
# Load original dataset with posting_date
original_df = pd.read_csv("vehicles.csv")

# Replace created_at with posting_date from original
df['created_at'] = pd.to_datetime(original_df['posting_date'], errors='coerce').dt.date

  df['created_at'] = pd.to_datetime(original_df['posting_date'], errors='coerce').dt.date


AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
# Load original dataset
original_df = pd.read_csv("vehicles.csv")

# Try parsing posting_date with error fallback
posting_dates = pd.to_datetime(original_df['posting_date'], errors='coerce', utc=True)

# Now extract the date part safely
df['created_at'] = posting_dates.dt.date

In [None]:
# Load original dataset
original_df = pd.read_csv("vehicles.csv")

# Try parsing posting_date with error fallback
posting_dates = pd.to_datetime(original_df['posting_date'], errors='coerce', utc=True)

# Now extract the date part safely
df['created_at'] = posting_dates.dt.date

In [None]:
df[['model', 'price', 'created_at']].head()

In [None]:
# Load original dataset with posting_date
original_df = pd.read_csv("vehicles.csv", usecols=["id", "posting_date"])

# Convert posting_date to datetime safely
original_df['posting_date'] = pd.to_datetime(original_df['posting_date'], errors='coerce', utc=True).dt.date

# Merge posting_date into df using 'id'
df = df.merge(original_df, how='left', left_on='id', right_on='id')

# Rename for clarity
df.rename(columns={'posting_date': 'created_at'}, inplace=True)

In [1]:
import pandas as pd
import sqlite3
from pathlib import Path

# Path helpers – adjust if your folder names differ
BASE_DIR = Path.cwd().parent          # points to car-reference-db/
CSV_PATH = BASE_DIR / "data" / "vehicles_final_cleaned.csv"
DB_PATH  = BASE_DIR / "db" / "car_reference.db"

# Load the FINAL cleaned CSV (it still has 'id')
df = pd.read_csv(CSV_PATH)

print("Columns:", df.columns.tolist()[:8], "...")   # quick sanity-check

# Write / replace the table in SQLite
with sqlite3.connect(DB_PATH) as conn:
    df.to_sql("vehicle_listings", conn, if_exists="replace", index=False)
    print("✅ vehicle_listings table written:", len(df), "rows")

Columns: ['id', 'price', 'year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel'] ...
✅ vehicle_listings table written: 421603 rows


In [None]:
df = pd.read_csv("vehicles_cleaned.csv")  # This version must include 'id'

In [None]:
print("df columns:", df.columns)

In [None]:
# Only keep what's needed from original
original_df = pd.read_csv("vehicles.csv", usecols=["id", "posting_date"])
original_df['posting_date'] = pd.to_datetime(original_df['posting_date'], errors='coerce', utc=True).dt.date

# Merge on id
df = df.merge(original_df, how='left', on='id')
df.rename(columns={'posting_date': 'created_at'}, inplace=True)

In [None]:
# Load cleaned dataset again from scratch, but include 'id'
df = pd.read_csv("vehicles.csv")

# Keep only columns we care about (including 'id' now)
columns_to_keep = [
    'id', 'price', 'year', 'manufacturer', 'model', 'condition',
    'cylinders', 'fuel', 'odometer', 'title_status',
    'transmission', 'drive', 'type', 'paint_color', 'state'
]
df = df[columns_to_keep].copy()

In [None]:
# Convert currency and odometer to Canadian units
USD_TO_CAD = 1.35
MILES_TO_KM = 1.60934

df['price'] = (df['price'] * USD_TO_CAD).round(0).astype("Int64")
df['odometer'] = (df['odometer'] * MILES_TO_KM).round(0).astype("Int64")

# Normalize text fields if they exist
text_fields = ['manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'transmission', 'drive', 'type', 'paint_color']
for col in text_fields:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().str.lower().replace("nan", None)

In [None]:
# Extract and convert posting_date from original
original_df = pd.read_csv("vehicles.csv", usecols=["id", "posting_date"])
original_df['posting_date'] = pd.to_datetime(original_df['posting_date'], errors='coerce', utc=True).dt.date

# Merge on id
df = df.merge(original_df, how='left', on='id')
df.rename(columns={'posting_date': 'created_at'}, inplace=True)

In [None]:
df[['id', 'model', 'price', 'created_at']].head()

In [None]:
df[['model', 'created_at']].isna().sum()

In [None]:
df[df['model'].isna()].head()
df[df['created_at'].isna()].head()

In [None]:
raw_df = pd.read_csv("vehicles.csv", usecols=["id", "model", "posting_date"])
raw_df[raw_df['model'].isna()].shape

In [None]:
raw_df[raw_df['model'].isna()].head()

In [None]:
print(f"Final row count: {len(df)}")

In [None]:
df = df.dropna(subset=['model', 'created_at'])

In [None]:
print(f"Final row count: {len(df)}")

In [None]:
df.to_csv("vehicles_final_cleaned.csv", index=False)

In [None]:
import sqlite3

# Load your final cleaned CSV
df = pd.read_csv("vehicles_final_cleaned.csv")

# Connect to your database (will create it if not exists)
conn = sqlite3.connect("../car_reference.db")  # adjust path if needed

# Write to a new table
df.to_sql("reference_listings", conn, if_exists="replace", index=False)

# Confirm
print("✅ Data imported into reference_listings")
conn.close()

In [None]:
import sqlite3
import pandas as pd

# Connect to the database
conn = sqlite3.connect("../car_reference.db")

In [None]:
# Preview 5 rows
pd.read_sql_query("SELECT id, make, model, year, price, odometer, created_at FROM reference_listings LIMIT 5", conn)

In [None]:
pd.read_sql_query("PRAGMA table_info(reference_listings)", conn)

In [None]:
pd.read_sql_query("SELECT id, manufacturer, model, year, price, odometer, created_at FROM reference_listings LIMIT 5", conn)

In [None]:
import pandas as pd
import sqlite3

# Load the cleaned data
df = pd.read_csv("data/vehicles_final_cleaned.csv")

# Connect to the SQLite database
conn = sqlite3.connect("db/car_reference.db")

# Overwrite vehicle_listings table with correct schema
df.to_sql("vehicle_listings", conn, if_exists="replace", index=False)

# Confirm it's written
print("Table vehicle_listings successfully written.")

# Optional: check the columns
print(df.columns.tolist())

conn.close()

In [None]:
import pandas as pd
import sqlite3

# Load the cleaned data
df = pd.read_csv("data/vehicles_final_cleaned.csv")

# Connect to the SQLite database
conn = sqlite3.connect("db/car_reference.db")

# Overwrite vehicle_listings table with correct schema
df.to_sql("vehicle_listings", conn, if_exists="replace", index=False)

# Confirm it's written
print("Table vehicle_listings successfully written.")

# Optional: check the columns
print(df.columns.tolist())

conn.close()

In [None]:
import sqlite3
conn = sqlite3.connect("db/car_reference.db")
pd.read_sql_query("SELECT * FROM vehicle_listings LIMIT 5", conn)