## Part 1: Setup and Load Libraries

In [4]:
library(tidyverse)
library(lubridate)
library(stringr)

# Folder that holds every raw input CSV and receives the cleaned exports.
# The trailing slash matters: file names are appended with paste0().
data_path <- "/workspaces/Fall2025-MS3083-Base_Template/data/"

# Confirm the session setup: one line per attached package with its
# installed version, followed by the data folder in use.
cat("✓ Libraries loaded successfully!\n")
cat("Package versions:\n")
cat(
  sprintf(
    "- %s: %s \n",
    c("tidyverse", "lubridate", "stringr"),
    c(
      as.character(packageVersion("tidyverse")),
      as.character(packageVersion("lubridate")),
      as.character(packageVersion("stringr"))
    )
  ),
  sep = ""
)
cat(sprintf("- Data path set to: %s \n", data_path))

✓ Libraries loaded successfully!
Package versions:
- tidyverse: 2.0.0 
- lubridate: 1.9.4 
- stringr: 1.5.2 
- Data path set to: /workspaces/Fall2025-MS3083-Base_Template/data/ 


## Part 2: Load Core Business Data

In [5]:
# Read the four core business tables from the data folder.
# show_col_types = FALSE silences readr's column-specification message.
sales_raw <- paste0(data_path, "messy_dealership_sales.csv") %>%
  read_csv(show_col_types = FALSE)
customers_raw <- paste0(data_path, "messy_customer_data.csv") %>%
  read_csv(show_col_types = FALSE)
vehicles_raw <- paste0(data_path, "messy_vehicle_inventory.csv") %>%
  read_csv(show_col_types = FALSE)
salespeople_raw <- paste0(data_path, "messy_salesperson_info.csv") %>%
  read_csv(show_col_types = FALSE)

# Report the shape of each table, one line per dataset.
cat("✓ Core data loaded successfully!\n")
cat("Dataset dimensions:\n")
cat(
  sprintf(
    "- %s: %d rows x %d columns\n",
    c("Sales data", "Customer data", "Vehicle data", "Salesperson data"),
    c(
      nrow(sales_raw), nrow(customers_raw),
      nrow(vehicles_raw), nrow(salespeople_raw)
    ),
    c(
      ncol(sales_raw), ncol(customers_raw),
      ncol(vehicles_raw), ncol(salespeople_raw)
    )
  ),
  sep = ""
)

✓ Core data loaded successfully!
Dataset dimensions:
- Sales data: 20 rows x 9 columns
- Customer data: 20 rows x 10 columns
- Vehicle data: 20 rows x 11 columns
- Salesperson data: 10 rows x 8 columns


## Part 3: Load Supporting Data

In [6]:
# Read the three supporting tables (service, financing, warranty).
service_raw <- read_csv(
  paste0(data_path, "messy_service_records.csv"),
  show_col_types = FALSE
)
financing_raw <- read_csv(
  paste0(data_path, "messy_financing_details.csv"),
  show_col_types = FALSE
)
warranty_raw <- read_csv(
  paste0(data_path, "messy_warranty_info.csv"),
  show_col_types = FALSE
)

# Report the shape of each supporting table.
cat("✓ Supporting data loaded successfully!\n")
cat("Dataset dimensions:\n")
cat(
  sprintf(
    "- %s: %d rows x %d columns\n",
    c("Service records", "Financing data", "Warranty data"),
    c(nrow(service_raw), nrow(financing_raw), nrow(warranty_raw)),
    c(ncol(service_raw), ncol(financing_raw), ncol(warranty_raw))
  ),
  sep = ""
)

# Grand total of rows across all seven raw tables (core + supporting).
total_raw_records <- sum(
  nrow(sales_raw), nrow(customers_raw), nrow(vehicles_raw),
  nrow(salespeople_raw), nrow(service_raw),
  nrow(financing_raw), nrow(warranty_raw)
)
cat("\nTotal raw records to process:", total_raw_records, "\n")


ERROR: Error in parse(text = input): <text>:7:29: unexpected symbol
6: 
7: cat("Dataset dimensions:\n")cat
                               ^


## Part 4: Inspect Data Quality Issues

In [None]:
# Quick look at the problems the cleaning steps have to deal with.
# Nothing is modified here; each section only prints a diagnostic.
cat("=== DATA QUALITY INSPECTION ===\n\n")

# 1. NA count per sales column, showing only columns that have gaps
cat("1. Missing values in sales data:\n")
missing_sales <- sales_raw %>%
  is.na() %>%
  colSums()
print(missing_sales[missing_sales > 0])

# 2. Distinct spellings of the vehicle make (case/whitespace variants)
cat("\n2. Vehicle make formatting issues:\n")
unique_makes <- unique(sales_raw$vehicle_make)
cat(sprintf("Found %d unique vehicle makes:\n", length(unique_makes)))
unique_makes %>%
  head(10) %>%
  print()

# 3. First five non-missing sale dates, to eyeball the formats in use
cat("\n3. Date format patterns in sales data:\n")
date_samples <- head(with(sales_raw, sale_date[!is.na(sale_date)]), 5)
cat("Sample dates showing format inconsistencies:\n")
print(date_samples)

# 4. Frequency of each raw trade-in value, NA included when present.
#    table() is called directly (not piped) so the printed header is unchanged.
cat("\n4. Trade-in value data quality issues:\n")
trade_in_issues <- table(sales_raw$trade_in_value, useNA = "ifany")
cat("Unique trade-in values (first 10):\n")
print(head(trade_in_issues, 10))

## Part 5: Clean Sales Data

In [None]:
# Clean the raw sales table:
#   - vehicle make/model: trimmed, lower-case
#   - sale_date: parsed to Date from either slash (m/d/y) or dash (y-m-d) text
#   - payment_method / salesperson: trimmed, title-case
#   - trade_in_value: numeric, with missing markers (NA, "", "NULL") set to 0
#   - customer_name: whitespace collapsed, blanks replaced by a placeholder
sales_clean <- sales_raw %>%
  mutate(
    # Standardize vehicle information
    vehicle_make = str_to_lower(str_trim(vehicle_make)),
    vehicle_model = str_to_lower(str_trim(vehicle_model)),

    # Handle multiple date formats. case_when() evaluates every right-hand
    # side on the whole column, so each parser also sees the rows meant for
    # the other branch; quiet = TRUE suppresses those spurious
    # "failed to parse" warnings. Real failures still surface as NA and are
    # counted in the report below.
    sale_date = case_when(
      str_detect(sale_date, "/") ~ mdy(sale_date, quiet = TRUE),
      str_detect(sale_date, "-") ~ ymd(sale_date, quiet = TRUE),
      TRUE ~ as.Date(NA)
    ),

    # Standardize payment methods
    payment_method = str_to_title(str_trim(payment_method)),

    # Clean salesperson names; title-cased the same way as
    # salespeople_clean$salesperson_name so the two tables agree on case
    salesperson = str_to_title(str_trim(salesperson)),

    # Missing trade-in markers (NA, empty, "NULL" in any case/padding) become
    # 0. Both if_else() branches are evaluated on every row, so the numeric
    # conversion is wrapped to avoid coercion warnings on the marker rows.
    # Values that are neither a marker nor numeric stay NA.
    trade_in_value = if_else(
      is.na(trade_in_value) |
        str_to_upper(str_trim(as.character(trade_in_value))) %in% c("", "NULL"),
      0,
      suppressWarnings(as.numeric(trade_in_value))
    ),

    # Clean customer names (str_squish already trims both ends)
    customer_name = str_squish(customer_name),
    customer_name = if_else(
      is.na(customer_name) | customer_name == "",
      "Unknown Customer",
      customer_name
    )
  )

cat("✓ Sales data cleaning completed!\n\n")

# Show cleaning results (na.rm keeps the counts usable if any value stayed NA)
cat("CLEANING RESULTS:\n")
cat("- Vehicle makes standardized:", length(unique(sales_clean$vehicle_make)), "unique values\n")
cat("- Date parsing successful:", sum(!is.na(sales_clean$sale_date)), "of", nrow(sales_clean), "dates\n")
cat("- Trade-in values cleaned:", sum(sales_clean$trade_in_value == 0, na.rm = TRUE), "missing values set to $0\n")
cat("- Customer names cleaned:", sum(sales_clean$customer_name == "Unknown Customer"), "missing names standardized\n")

# Sample of cleaned data
cat("\nSample of cleaned sales data:\n")
print(head(sales_clean %>% select(vehicle_make, vehicle_model, sale_date, trade_in_value), 3))

## Part 6: Clean Customer Data

In [None]:
# Clean the raw customer table:
#   - names: trimmed, blanks replaced by "Unknown" / "Customer"
#   - email: trimmed, lower-case, blanks replaced by a per-customer placeholder
#   - phone: digits only, then formatted as (AAA) BBB-CCCC when it is a
#     10-digit number (optionally preceded by a leading 1)
#   - state: trimmed, upper-case
#   - registration_date: parsed as year-month-day
customers_clean <- customers_raw %>%
  mutate(
    # Trim BEFORE testing for emptiness so whitespace-only values are also
    # treated as missing
    first_name = str_trim(first_name),
    first_name = if_else(is.na(first_name) | first_name == "", "Unknown", first_name),
    last_name = str_trim(last_name),
    last_name = if_else(is.na(last_name) | last_name == "", "Customer", last_name),
    full_name = paste(first_name, last_name),
    email = str_to_lower(str_trim(email)),
    email = if_else(
      is.na(email) | email == "",
      paste0("customer", customer_id, "@placeholder.com"),
      email
    ),
    phone = str_replace_all(phone, "[^0-9]", ""),
    # Anchored so only a complete number is reformatted: an unanchored
    # pattern would format the first ten digits of "15551234567" and leave
    # the last digit dangling. Anything else is left as its digit string.
    phone = str_replace(phone, "^1?(\\d{3})(\\d{3})(\\d{4})$", "(\\1) \\2-\\3"),
    state = str_to_upper(str_trim(state)),
    # NOTE(review): if read_csv guessed zip_code as numeric, leading zeros
    # were already lost before this conversion; confirm against the raw file.
    zip_code = as.character(zip_code),
    registration_date = ymd(registration_date)
  )

cat("✓ Customer data cleaning completed!\n\n")

# Show cleaning results (na.rm keeps counts usable when phone is missing)
cat("CLEANING RESULTS:\n")
cat("- Missing first names filled:", sum(customers_clean$first_name == "Unknown"), "records\n")
cat("- Missing last names filled:", sum(customers_clean$last_name == "Customer"), "records\n")
cat("- Placeholder emails generated:", sum(str_detect(customers_clean$email, fixed("@placeholder.com"))), "records\n")
cat("- Phone numbers formatted:", sum(str_detect(customers_clean$phone, "\\(\\d{3}\\) \\d{3}-\\d{4}"), na.rm = TRUE), "records\n")
cat("- States standardized:", length(unique(customers_clean$state)), "unique state codes\n")

# Sample of cleaned data
cat("\nSample of cleaned customer data:\n")
print(head(customers_clean %>% select(full_name, email, phone, state), 3))

## Part 7: Clean Vehicle Inventory

In [None]:
# Clean the raw vehicle inventory:
#   - vin: trimmed, upper-case
#   - make/model: trimmed, lower-case
#   - condition/color: trimmed, title-case
#   - sold: TRUE only for YES / Y / TRUE / 1 (any case); everything else,
#     including NA, becomes FALSE because %in% never returns NA
#   - year/mileage/purchase_price: coerced to integer/integer/numeric
#   - lot_date: parsed as year-month-day
vehicles_clean <- vehicles_raw %>%
  mutate(
    vin = str_to_upper(str_trim(vin)),
    make = str_to_lower(str_trim(make)),
    model = str_to_lower(str_trim(model)),
    condition = str_to_title(str_trim(condition)),
    color = str_to_title(str_trim(color)),
    sold = str_to_upper(str_trim(sold)) %in% c("YES", "Y", "TRUE", "1"),
    year = as.integer(year),
    mileage = as.integer(mileage),
    purchase_price = as.numeric(purchase_price),
    lot_date = ymd(lot_date)
  )

cat("✓ Vehicle inventory cleaning completed!\n\n")

# Show cleaning results
cat("CLEANING RESULTS:\n")
# na.rm = TRUE: a single missing VIN would otherwise turn the count into NA
cat("- VINs standardized:", sum(str_length(vehicles_clean$vin) == 17, na.rm = TRUE), "complete VINs\n")
cat("- Vehicle makes standardized:", length(unique(vehicles_clean$make)), "unique makes\n")
cat("- Vehicles sold:", sum(vehicles_clean$sold, na.rm = TRUE), "of", nrow(vehicles_clean), "total\n")
cat("- Average vehicle year:", round(mean(vehicles_clean$year, na.rm = TRUE), 1), "\n")
cat("- Average mileage:", formatC(mean(vehicles_clean$mileage, na.rm = TRUE), format = "f", digits = 0, big.mark = ","), "miles\n")

# Sample of cleaned data
cat("\nSample of cleaned vehicle data:\n")
print(head(vehicles_clean %>% select(make, model, year, condition, sold), 3))

## Part 8: Clean Salesperson Data

In [None]:
# Clean the raw salesperson table:
#   - salesperson_name / department: trimmed, title-case
#   - hire_date: parsed as year-month-day
#   - email: trimmed, lower-case, blanks replaced by a per-employee placeholder
#   - phone: digits only, then formatted as (AAA) BBB-CCCC when it is a
#     10-digit number (optionally preceded by a leading 1)
#   - commission_rate: missing values default to 0.03
#   - status: trimmed, lower-case
salespeople_clean <- salespeople_raw %>%
  mutate(
    salesperson_name = str_to_title(str_trim(salesperson_name)),
    hire_date = ymd(hire_date),
    email = str_to_lower(str_trim(email)),
    email = if_else(
      is.na(email) | email == "",
      paste0("employee", salesperson_id, "@dealership.com"),
      email
    ),
    phone = str_replace_all(phone, "[^0-9]", ""),
    # Anchored so only a complete number is reformatted; an unanchored
    # pattern would format the first ten digits of an 11-digit number and
    # leave the remainder dangling.
    phone = str_replace(phone, "^1?(\\d{3})(\\d{3})(\\d{4})$", "(\\1) \\2-\\3"),
    commission_rate = if_else(is.na(commission_rate), 0.03, commission_rate),
    department = str_to_title(str_trim(department)),
    status = str_to_lower(str_trim(status))
  )

cat("✓ Salesperson data cleaning completed!\n\n")

# Count what was actually filled in by looking at the raw table. Matching on
# the cleaned values would also count real "@dealership.com" addresses and
# employees whose genuine rate happens to be 3%.
placeholder_emails <- sum(
  is.na(salespeople_raw$email) | str_trim(salespeople_raw$email) == ""
)
default_commissions <- sum(is.na(salespeople_raw$commission_rate))

# Show cleaning results
cat("CLEANING RESULTS:\n")
cat("- Salesperson names formatted:", nrow(salespeople_clean), "records\n")
cat("- Placeholder emails generated:", placeholder_emails, "records\n")
cat("- Default commission rates set:", default_commissions, "employees (3%)\n")
cat("- Average commission rate:", paste0(round(mean(salespeople_clean$commission_rate) * 100, 1), "%"), "\n")
cat("- Departments:", paste(unique(salespeople_clean$department), collapse = ", "), "\n")

# Sample of cleaned data
cat("\nSample of cleaned salesperson data:\n")
print(head(salespeople_clean %>% select(salesperson_name, department, commission_rate, status), 3))

## Part 9: Clean Service Records

In [None]:
# Clean the raw service records:
#   - vin: trimmed, upper-case
#   - service_date: parsed to Date from slash (m/d/y) or dash (y-m-d) text
#   - service_type / mechanic_name: trimmed, title-case
#   - labor_cost / parts_cost: numeric, missing markers (NA, "", "NULL") -> 0
#   - notes: whitespace collapsed, empty strings turned into NA
service_clean <- service_raw %>%
  mutate(
    # Standardize VIN format
    vin = str_to_upper(str_trim(vin)),

    # Handle multiple date formats. case_when() evaluates every right-hand
    # side on the whole column, so quiet = TRUE suppresses the warnings each
    # parser would raise for rows that belong to the other branch.
    service_date = case_when(
      str_detect(service_date, "/") ~ mdy(service_date, quiet = TRUE),
      str_detect(service_date, "-") ~ ymd(service_date, quiet = TRUE),
      TRUE ~ as.Date(NA)
    ),

    # Standardize service information
    service_type = str_to_title(str_trim(service_type)),
    mechanic_name = str_to_title(str_trim(mechanic_name)),

    # Missing cost markers (NA, empty, "NULL" in any case/padding) become 0.
    # The numeric conversion is wrapped because both if_else() branches are
    # evaluated on every row, marker rows included. Values that are neither
    # a marker nor numeric stay NA.
    labor_cost = if_else(
      is.na(labor_cost) |
        str_to_upper(str_trim(as.character(labor_cost))) %in% c("", "NULL"),
      0,
      suppressWarnings(as.numeric(labor_cost))
    ),
    parts_cost = if_else(
      is.na(parts_cost) |
        str_to_upper(str_trim(as.character(parts_cost))) %in% c("", "NULL"),
      0,
      suppressWarnings(as.numeric(parts_cost))
    ),

    # Clean service notes (str_squish already trims both ends)
    notes = str_squish(notes),
    notes = if_else(is.na(notes) | notes == "", NA_character_, notes)
  )

cat("✓ Service records cleaning completed!\n\n")

# Show cleaning results (na.rm keeps the figures usable if any cost stayed NA)
cat("CLEANING RESULTS:\n")
cat("- Service records processed:", nrow(service_clean), "records\n")
cat("- Service dates parsed:", sum(!is.na(service_clean$service_date)), "successful\n")
cat("- Labor costs with $0 (missing):", sum(service_clean$labor_cost == 0, na.rm = TRUE), "records\n")
cat("- Parts costs with $0 (missing):", sum(service_clean$parts_cost == 0, na.rm = TRUE), "records\n")
cat("- Average total service cost: $", round(mean(service_clean$labor_cost + service_clean$parts_cost, na.rm = TRUE), 2), "\n")
cat("- Service types:", length(unique(service_clean$service_type)), "unique types\n")

# Sample of cleaned data
cat("\nSample of cleaned service data:\n")
print(head(service_clean %>% select(service_type, mechanic_name, labor_cost, parts_cost), 3))

## Part 10: Clean Financing Details

In [None]:
# Clean the raw financing table:
#   - lender_name: trimmed, title-case
#   - loan_amount / interest_rate / term_months: coerced to numeric/integer
#   - monthly_payment: numeric; recomputed from the amortization formula
#     when it is missing and the loan inputs are available
#   - approval_date: parsed to Date from slash (m/d/y) or dash (y-m-d) text
#   - down_payment: numeric, missing markers (NA, "", "NULL") -> 0
financing_clean <- financing_raw %>%
  mutate(
    # Standardize lender names
    lender_name = str_to_title(str_trim(lender_name)),

    # Ensure numeric data types; unparseable text becomes NA without warnings
    loan_amount = suppressWarnings(as.numeric(loan_amount)),
    interest_rate = suppressWarnings(as.numeric(interest_rate)),
    term_months = suppressWarnings(as.integer(term_months)),

    # Convert first so text markers such as "NULL" or "" count as missing
    # and get recalculated below (is.na() on the raw text would skip them)
    monthly_payment = suppressWarnings(as.numeric(monthly_payment)),

    # Fill in missing payments. The formula treats interest_rate as an
    # annual percentage (divided by 100, then by 12 for the monthly rate).
    # A 0% loan has no interest term, so it is simply principal / months;
    # the amortization formula would divide by zero there.
    monthly_payment = case_when(
      !is.na(monthly_payment) ~ monthly_payment,
      is.na(loan_amount) | is.na(interest_rate) | is.na(term_months) ~ NA_real_,
      term_months <= 0 ~ NA_real_,
      interest_rate == 0 ~ loan_amount / term_months,
      interest_rate > 0 ~
        loan_amount * (interest_rate / 100 / 12) *
        (1 + interest_rate / 100 / 12)^term_months /
        ((1 + interest_rate / 100 / 12)^term_months - 1),
      TRUE ~ NA_real_
    ),

    # Parse approval dates. case_when() evaluates every right-hand side on
    # the whole column, so quiet = TRUE suppresses the warnings each parser
    # would raise for rows that belong to the other branch.
    approval_date = case_when(
      str_detect(approval_date, "/") ~ mdy(approval_date, quiet = TRUE),
      str_detect(approval_date, "-") ~ ymd(approval_date, quiet = TRUE),
      TRUE ~ as.Date(NA)
    ),

    # Handle missing down payments
    down_payment = case_when(
      is.na(down_payment) | down_payment == "" | down_payment == "NULL" ~ 0,
      TRUE ~ suppressWarnings(as.numeric(down_payment))
    )
  )

cat("✓ Financing data cleaning completed!\n\n")

# Show cleaning results.
# A payment counts as "calculated" when the raw value was not numeric but
# the cleaned value is.
calculated_payments <- sum(!is.na(financing_clean$monthly_payment) &
                          is.na(suppressWarnings(as.numeric(financing_raw$monthly_payment))))

cat("CLEANING RESULTS:\n")
cat("- Financing records processed:", nrow(financing_clean), "records\n")
cat("- Monthly payments calculated:", calculated_payments, "missing values\n")
cat("- Average loan amount: $", formatC(mean(financing_clean$loan_amount, na.rm = TRUE),
                                       format = "f", digits = 0, big.mark = ","), "\n")
cat("- Average interest rate:", round(mean(financing_clean$interest_rate, na.rm = TRUE), 2), "%\n")
cat("- Average term:", round(mean(financing_clean$term_months, na.rm = TRUE), 0), "months\n")
cat("- Zero down payments:", sum(financing_clean$down_payment == 0, na.rm = TRUE), "loans\n")

# Sample of cleaned data
cat("\nSample of cleaned financing data:\n")
print(head(financing_clean %>% select(lender_name, loan_amount, interest_rate, monthly_payment), 3))

## Part 11: Clean Warranty Information

In [None]:
# Clean the raw warranty table:
#   - warranty_type / provider: trimmed, title-case
#   - start_date / end_date: parsed to Date from slash (m/d/y) or dash
#     (y-m-d) text; anything else becomes NA
#   - coverage_amount: coerced to numeric
#   - deductible: numeric, missing markers (NA, "", "NULL") -> 0
#   - status: trimmed, lower-case
warranty_clean <- warranty_raw %>%
  mutate(
    # Standardize warranty information
    warranty_type = str_to_title(str_trim(warranty_type)),
    provider = str_to_title(str_trim(provider)),

    # Parse dates. case_when() evaluates every right-hand side on the whole
    # column, so each parser also sees rows meant for the other branch;
    # quiet = TRUE suppresses those spurious "failed to parse" warnings.
    # Real failures still surface as NA and are counted in the report below.
    start_date = case_when(
      str_detect(start_date, "/") ~ mdy(start_date, quiet = TRUE),
      str_detect(start_date, "-") ~ ymd(start_date, quiet = TRUE),
      TRUE ~ as.Date(NA)
    ),
    end_date = case_when(
      str_detect(end_date, "/") ~ mdy(end_date, quiet = TRUE),
      str_detect(end_date, "-") ~ ymd(end_date, quiet = TRUE),
      TRUE ~ as.Date(NA)
    ),

    # Ensure numeric coverage amounts
    coverage_amount = suppressWarnings(as.numeric(coverage_amount)),

    # Handle missing deductibles
    deductible = case_when(
      is.na(deductible) | deductible == "" | deductible == "NULL" ~ 0,
      TRUE ~ suppressWarnings(as.numeric(deductible))
    ),

    # Standardize status
    status = str_to_lower(str_trim(status))
  )

cat("✓ Warranty data cleaning completed!\n\n")

# Show cleaning results
cat("CLEANING RESULTS:\n")
cat("- Warranty records processed:", nrow(warranty_clean), "records\n")
cat("- Warranty types:", length(unique(warranty_clean$warranty_type)), "unique types\n")
cat("- Warranty providers:", length(unique(warranty_clean$provider)), "providers\n")
cat("- Start dates parsed:", sum(!is.na(warranty_clean$start_date)), "of", nrow(warranty_clean), "dates\n")
cat("- End dates parsed:", sum(!is.na(warranty_clean$end_date)), "of", nrow(warranty_clean), "dates\n")
cat("- Average coverage amount: $", formatC(mean(warranty_clean$coverage_amount, na.rm = TRUE),
                                          format = "f", digits = 0, big.mark = ","), "\n")
cat("- Zero deductible warranties:", sum(warranty_clean$deductible == 0, na.rm = TRUE), "policies\n")
cat("- Active warranties:", sum(warranty_clean$status == "active", na.rm = TRUE), "policies\n")

# Sample of cleaned data
cat("\nSample of cleaned warranty data:\n")
print(head(warranty_clean %>% select(warranty_type, provider, coverage_amount, status), 3))

## Part 12: Data Quality Summary

In [None]:
# Print the final summary report: record counts per cleaned table, the
# kinds of fixes applied, and a closing banner.
#
# Banner rules use strrep() to build ONE 80-character string.
# rep("=", 80) is a vector of 80 strings: cat() joins them with spaces
# ("= = = ..."), and sprintf() recycles over them, printing 80 lines.
cat(strrep("=", 80), "\n", sep = "")
cat("FINAL DATA CLEANING SUMMARY REPORT\n")
cat(strrep("=", 80), "\n\n", sep = "")

# Calculate totals
total_records <- nrow(sales_clean) + nrow(customers_clean) + nrow(vehicles_clean) +
                 nrow(salespeople_clean) + nrow(service_clean) +
                 nrow(financing_clean) + nrow(warranty_clean)

# Record counts by dataset
cat("DATASET RECORD COUNTS:\n")
cat(sprintf("%-20s: %6d records\n", "Sales", nrow(sales_clean)))
cat(sprintf("%-20s: %6d records\n", "Customers", nrow(customers_clean)))
cat(sprintf("%-20s: %6d records\n", "Vehicles", nrow(vehicles_clean)))
cat(sprintf("%-20s: %6d records\n", "Salespeople", nrow(salespeople_clean)))
cat(sprintf("%-20s: %6d records\n", "Service Records", nrow(service_clean)))
cat(sprintf("%-20s: %6d records\n", "Financing", nrow(financing_clean)))
cat(sprintf("%-20s: %6d records\n", "Warranties", nrow(warranty_clean)))
cat(sprintf("%-20s: %6d records\n", "TOTAL", total_records))

cat("\nDATA QUALITY IMPROVEMENTS:\n")
cat("- Standardized text formatting (case, whitespace)\n")
cat("- Unified date formats (YYYY-MM-DD)\n")
cat("- Converted boolean values consistently\n")
cat("- Handled missing values appropriately\n")
cat("- Generated placeholder data where needed\n")
cat("- Calculated missing financial values\n")

cat("\nDATA VALIDATION CHECKS:\n")
cat("✓ All datasets loaded and processed successfully\n")
cat("✓ Numeric fields converted with error handling\n")
cat("✓ Date parsing completed with fallback options\n")
cat("✓ Text fields standardized consistently\n")

cat(sprintf("\n%s\n", strrep("=", 80)))
cat("Data cleaning completed successfully!\n")
cat("Ready for export to CSV files.\n")
cat(sprintf("%s\n", strrep("=", 80)))

## Part 13: Export Cleaned Data

In [None]:
# Export all cleaned datasets to the data folder, then verify each file.
# The file-name -> table mapping is declared once so the write step and the
# verification step cannot drift apart.
export_tables <- list(
  "clean_dealership_sales.csv" = sales_clean,
  "clean_customer_data.csv" = customers_clean,
  "clean_vehicle_inventory.csv" = vehicles_clean,
  "clean_salesperson_info.csv" = salespeople_clean,
  "clean_service_records.csv" = service_clean,
  "clean_financing_details.csv" = financing_clean,
  "clean_warranty_info.csv" = warranty_clean
)
exported_files <- names(export_tables)

for (i in seq_along(export_tables)) {
  write_csv(export_tables[[i]], paste0(data_path, exported_files[[i]]))
}

cat("✓ DATA EXPORT COMPLETED!\n\n")

# Verify file exports: list every expected file with its size, and flag any
# that is absent instead of silently skipping it.
cat("EXPORTED FILES:\n")
for (file_name in exported_files) {
  filepath <- paste0(data_path, file_name)
  if (file.exists(filepath)) {
    size_kb <- round(file.info(filepath)$size / 1024, 1)
    cat(sprintf("✓ %-30s (%s KB)\n", file_name, size_kb))
  } else {
    cat(sprintf("✗ %-30s (missing)\n", file_name))
  }
}

cat("\nNEXT STEPS:\n")
cat("1. Verify exported CSV files in the data directory\n")
cat("2. Proceed to Part 2: PostgreSQL Database Operations\n")
cat("3. Load cleaned data into database tables\n")
cat("4. Perform data analysis and reporting\n")

# strrep() builds one 50-character rule; rep("=", 50) would be printed by
# cat() as fifty space-separated "=" characters.
cat("\n", strrep("=", 50), "\n", sep = "")
cat("Final Project Part 1: COMPLETE\n")
cat(strrep("=", 50), "\n", sep = "")