# Data generator for car value predictor model

In [908]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [909]:
maker_weights = {
    'Acura'         :1.02, 
    'Audi'          :1.25, 
    'BMW'           :1.20, 
    'Cadillac'      :1.18,
    'Chevrolet'     :1.03, 
    'Chrysler'      :0.95, 
    'Dodge'         :1.01, 
    'Ford'          :1.03, 
    'GMC'           :1.05, 
    'Honda'         :1.01, 
    'Hyundai'       :0.90, 
    'Jeep'          :1.15, 
    'Kia'           :0.90, 
    'Lexus'         :1.18, 
    'Mazda'         :0.95, 
    'Mercedes-Benz' :1.20, 
    'Nissan'        :0.95, 
    'Porsche'       :1.85, 
    'Subaru'        :1.15, 
    'Tesla'         :1.15, 
    'Toyota'        :1.15, 
    'Volkswagen'    :1.01, 
    'Volvo'         :1.02,
}

In [910]:
year_weights = {
    1990            :0.15, 
    1991            :0.15, 
    1992            :0.15, 
    1993            :0.15, 
    1994            :0.15, 
    1995            :0.15, 
    1996            :0.15, 
    1997            :0.15, 
    1998            :0.15, 
    1999            :0.15, 
    2000            :0.20, 
    2001            :0.20, 
    2002            :0.20, 
    2003            :0.25, 
    2004            :0.25, 
    2005            :0.30, 
    2006            :0.30, 
    2007            :0.35, 
    2008            :0.35, 
    2009            :0.40, 
    2010            :0.40, 
    2011            :0.45,
    2012            :0.50, 
    2013            :0.55, 
    2014            :0.60, 
    2015            :0.65, 
    2016            :0.70, 
    2017            :0.75, 
    2018            :0.80, 
    2019            :0.80, 
    2020            :0.85, 
    2021            :0.85, 
    2022            :0.90, 
    2023            :0.95,
    2024            :1.00,
}

In [911]:
cond_weights = {
    'like new'      :1.00,
    'excellent'     :0.975,
    'great'         :0.95,
    'good'          :0.90,
    'fair'          :0.80,
    'poor'          :0.70,
}

In [912]:
maker_model_mapping = {
    'Acura'         :{'NSX':1.25, 'MDX':1.20, 'RDX': 1.10, 'TLX': 1.05, 'ILX': 1.00}, 
    'Audi'          :{'R8': 1.30, 'Q8': 1.25, 'A8': 1.25, 'Q7': 1.20, 'A6': 1.15, 'Q5': 1.10, 'A4': 1.05, 'Q3':1.00}, 
    'BMW'           :{'8 Series': 1.30, '7 Series': 1.25, 'X7': 1.175, 'X6': 1.15, 'X5': 1.125, '5 Series': 1.10, 
                      '3 Series': 1.10, 'X3': 1.075, '2 Series': 1.05, '1 Series': 1.00},
    'Cadillac'      :{'Escalade': 1.25, 'CT6': 1.20, 'XT6': 1.15, 'XT5': 1.10, 'CT5': 1.05, 'XT4': 1.00},
    'Chevrolet'     :{'Corvette': 1.35, 'Silverado': 1.30, 'Tahoe': 1.15, 'Suburban': 1.10, 'Traverse': 1.075, 'Equinox': 1.05,
                      'Malibu': 1.05, 'Trax': 1.02, 'Spark': 1.00, 'Sonic': 1.00}, 
    'Chrysler'      :{'Pacifica': 1.15, '300': 1.10, 'Voyager': 1.00, 'PT Cruiser': 1.00}, 
    'Dodge'         :{'Challenger': 1.20, 'Ram': 1.15, 'Charger': 1.10, 'Durango': 1.05, 'Journey': 1.00}, 
    'Ford'          :{'Mustang': 1.25, 'F-150': 1.25, 'Expedition': 1.15, 'Explorer': 1.10, 'Edge': 1.05, 'Escape': 1.05,
                      'Ranger': 1.05, 'Fusion': 1.05, 'EcoSport': 1.025, 'Fiesta': 1.00}, 
    'GMC'           :{'Sierra': 1.20, 'Yukon': 1.10, 'Acadia': 1.05, 'Terrain': 1.00, 'Canyon1': 1.00}, 
    'Honda'         :{'Odyssey': 1.20, 'Pilot': 1.15, 'Passport': 1.10, 'Ridgeline': 1.10, 'Accord': 1.075, 'CR-V': 1.05,
                      'Civic': 1.05, 'HR-V': 1.00, 'Fit': 1.00}, 
    'Hyundai'       :{'Palisade': 1.25, 'Santa Fe': 1.20, 'Sonata': 1.10, 'Tucson': 1.10, 'Elantra': 1.025, 'Kona': 1.00,
                      'Accent': 1.00, 'Venue': 1.00}, 
    'Jeep'          :{'Gladiator': 1.25, 'Grand Wagoneer': 1.20, 'Wrangler': 1.15, 'Grand Cherokee': 1.10, 'Cherokee': 1.05, 
                      'Compass': 1.00, 'Renegade': 1.00}, 
    'Kia'           :{'Telluride': 1.20, 'K900': 1.15, 'Stinger': 1.10, 'Sorento': 1.075, 'Sportage': 1.025, 'Forte': 1.025, 
                      'Soul': 1.025, 'Rio': 1.00}, 
    'Lexus'         :{'LX': 1.30, 'LS': 1.25, 'LC': 1.20, 'GX': 1.15, 'RX': 1.10, 'NX': 1.05, 'ES': 1.00, 'UX': 1.00}, 
    'Mazda'         :{'CX-9': 1.20, 'MX-5 Miata': 1.15, 'CX-5': 1.15, 'Mazda6': 1.10, 'Mazda3': 1.00}, 
    'Mercedes-Benz' :{'S-Class': 1.25, 'G-Class': 1.20, 'E-Class': 1.15, 'GLS': 1.12, 'C-Class': 1.10, 'GLC': 1.05,
                      'A-Class': 1.00, 'CLA-Class': 1.00}, 
    'Nissan'        :{'GT-R': 1.45, 'Armada': 1.35, 'Titan': 1.30, 'Pathfinder': 1.25, 'Murano': 1.20, 'Rogue': 1.10, 
                      'Maxima': 1.10, 'Altima': 1.00, 'Sentra': 1.00, 'Versa': 1.00 }, 
    'Porsche'       :{'911': 1.35, 'Taycan': 1.25, 'Panamera': 1.20, 'Cayenne': 1.15, 'Macan': 1.10, '718 Boxster': 1.00,
                      '718 Cayman': 1.00}, 
    'Subaru'        :{'Ascent': 1.20, 'Outback': 1.15, 'Forester': 1.10, 'Crosstrek': 1.05, 'Legacy': 1.025, 'Impreza': 1.00,
                      'WRX': 1.00}, 
    'Tesla'         :{'Model S': 1.30, 'Model X': 1.20, 'Model 3': 1.00, 'Model Y': 1.00}, 
    'Toyota'        :{'Seqoia': 1.325, 'Tundra': 1.30, '4runner': 1.225, 'Tacoma': 1.20, 'Avalon': 1.15,'Highlander': 1.15, 
                      'RAV4': 1.10, 'Camry': 1.05, 'Corolla': 1.00, 'Prius': 1.00}, 
    'Volkswagen'    :{'Touareg': 1.15, 'Arteon': 1.10, 'Atlas': 1.10, 'Passat': 1.05, 'Tiguan': 1.05, 'Golf': 1.00,
                      'Jetta': 1.00, 'ID.4': 1.00}, 
    'Volvo'         :{'XC90': 1.25, 'XC60': 1.225, 'S90': 1.20, 'V90 Cross Country': 1.15, 'XC40': 1.125, 'S60': 1.10, 
                      'V60 Cross Country': 1.05, 'V60': 1.00},
}

In [913]:
cars_data = pd.DataFrame(columns = ['Make','Model','Year','Miles','Condition'])

cars_data

Unnamed: 0,Make,Model,Year,Miles,Condition


In [914]:
# Function to fill random makers
def fill_random_maker(row, makers):
    return random.choice(makers)

# Number of records
i = 100000

# Populate the DataFrame with i rows
cars_data = cars_data.reindex(range(i))

# Fill in the 'Make' column with random makers using the .apply method
makers = list(maker_weights.keys())
cars_data['Make'] = cars_data.apply(lambda row: fill_random_maker(row, makers), axis=1)

cars_data

Unnamed: 0,Make,Model,Year,Miles,Condition
0,Kia,,,,
1,BMW,,,,
2,GMC,,,,
3,Volvo,,,,
4,Porsche,,,,
...,...,...,...,...,...
99995,BMW,,,,
99996,Ford,,,,
99997,Kia,,,,
99998,BMW,,,,


In [915]:
# To fill other columns with intended effects
def generate_year():
    # Skewed distribution towards newer years
    return int(np.random.choice(range(1990, 2024), p=np.geomspace(0.001, 1, num=34)/np.sum(np.geomspace(0.001, 1, num=34))))

def generate_miles(year):
    current_year = 2025
    age = current_year - year
    return max(0, int(age * (15000 + random.uniform(-12000, 6500))))

def generate_condition(year):
    current_year = 2025
    age = current_year - year
    if year >= 2023:
        return random.choice(['Like New', 'Excellent', 'Great'])
    elif age <= 5:
        return random.choice(['Excellent', 'Great', 'Good'])
    elif age <= 10:
        return random.choice(['Great', 'Good', 'Fair'])
    elif age <= 17:
        return random.choice(['Great', 'Good', 'Fair', 'Poor'])
    else:
        return random.choice(['Good', 'Fair', 'Poor'])
    
def generate_model(make):
    return random.choice(list(maker_model_mapping[make].keys()))

In [916]:
cars_data['Model'] = cars_data.apply(lambda row: generate_model(row['Make']), axis=1)
cars_data['Year'] = cars_data.apply(lambda row: generate_year(), axis=1)
cars_data['Miles'] = cars_data.apply(lambda row: generate_miles(row['Year']), axis=1)
cars_data['Condition'] = cars_data.apply(lambda row: generate_condition(row['Year']), axis=1)

cars_data


Unnamed: 0,Make,Model,Year,Miles,Condition
0,Kia,K900,2020,90367,Good
1,BMW,2 Series,2022,48783,Great
2,GMC,Terrain,2021,41066,Great
3,Volvo,XC90,2018,87144,Fair
4,Porsche,Panamera,2011,202063,Good
...,...,...,...,...,...
99995,BMW,5 Series,2016,151340,Good
99996,Ford,Ranger,2011,100153,Good
99997,Kia,K900,2021,20582,Excellent
99998,BMW,7 Series,2017,161204,Fair


In [917]:
def miles_multiplier(miles):
    return max(0.01, min(1, 1 - (miles / 400000)))

def calculate_value(row, base_value= 25000):
    maker_weight = maker_weights[row['Make']]
    model_weight = maker_model_mapping[row['Make']][row['Model']]
    year_weight = year_weights[row['Year']]
    cond_weight = cond_weights[row['Condition'].lower()]
    miles_weight = miles_multiplier(row['Miles'])
    
    if row['Miles'] > 25000:
        return base_value * ((maker_weight * model_weight * year_weight * cond_weight * miles_weight) + 0.001) + 1000
    else:
        return base_value * ((maker_weight * model_weight * cond_weight) + 0.001) + 1000


In [918]:
cars_data['Value'] = cars_data.apply(lambda row: calculate_value(row), axis=1)
cars_data['Value'] = cars_data['Value'].astype('int')

cars_data

Unnamed: 0,Make,Model,Year,Miles,Condition,Value
0,Kia,K900,2020,90367,Good,16347
1,BMW,2 Series,2022,48783,Great,24672
2,GMC,Terrain,2021,41066,Great,20045
3,Volvo,XC90,2018,87144,Fair,16980
4,Porsche,Panamera,2011,202063,Good,12147
...,...,...,...,...,...,...
99995,BMW,5 Series,2016,151340,Good,13949
99996,Ford,Ranger,2011,100153,Good,9233
99997,Kia,K900,2021,20582,Excellent,26253
99998,BMW,7 Series,2017,161204,Fair,14457


In [919]:
cars_data.sample(20)

Unnamed: 0,Make,Model,Year,Miles,Condition,Value
23565,Mercedes-Benz,G-Class,2020,96409,Excellent,23669
20919,Chevrolet,Silverado,2021,40645,Excellent,25948
46773,Hyundai,Palisade,2020,21068,Great,27743
8649,Kia,Rio,2013,117067,Great,9340
76492,Porsche,Macan,2023,36479,Great,42752
14880,Honda,Odyssey,2008,299065,Fair,3165
75435,Volkswagen,Touareg,2018,88012,Great,18237
51203,Tesla,Model 3,2023,6125,Great,28337
66581,Cadillac,CT6,2022,42578,Excellent,28781
28426,Lexus,RX,2023,10327,Like New,33475


In [920]:
cars_data.describe()

Unnamed: 0,Year,Miles,Value
count,100000.0,100000.0,100000.0
mean,2018.74791,76511.67092,20538.57125
std,4.661774,70663.981208,8754.440077
min,1990.0,6001.0,1048.0
25%,2017.0,30433.75,15048.75
50%,2020.0,53901.5,20624.0
75%,2022.0,98604.75,25947.0
max,2023.0,751855.0,63462.0


In [921]:
#cars_data.to_csv('car_value_training_data_v1.csv', index=False)

In [922]:
#cars_data.query("""Make == 'Toyota' & Year == 2010""")

In [923]:
old_low_miles = pd.DataFrame(columns= ['Make', 'Model', 'Year', 'Miles', 'Condition', 'Value'])

old_low_miles

Unnamed: 0,Make,Model,Year,Miles,Condition,Value


In [924]:
# Function to fill random makers
def fill_random_maker(row, makers):
    return random.choice(makers)

def generate_model(make):
    return random.choice(list(maker_model_mapping[make].keys()))

def generate_old_year():
    return int(np.random.choice(range(1990, 2010)))

def generate_low_miles(year):
    current_year = 2025
    age = current_year - year
    return max(0, int(age * (4000 + random.uniform(-1500, 1000))))

def generate_positive_condition(year):
    return random.choice(['Excellent', 'Great', 'Good', 'Fair'])

def miles_multiplier(miles):
    return max(0.01, min(1, 1 - (miles / 400000)))

def calculate_old_low_value(row, base_value= 17000):
    maker_weight = maker_weights[row['Make']]
    model_weight = maker_model_mapping[row['Make']][row['Model']]
    year_weight = year_weights[row['Year']]
    if row['Year'] < 2005:
        year_weight == 0.30
    cond_weight = cond_weights[row['Condition'].lower()]
    if row['Condition'] == 'Excellent':
        cond_weight += 0.50
    miles_weight = miles_multiplier(row['Miles'])
    return base_value * ((maker_weight * model_weight * model_weight * (year_weight) * (cond_weight) * miles_weight) + 0.001) + 1000



In [925]:
# Number of records
j = int(i/10)

# Populate the DataFrame with i rows
old_low_miles = old_low_miles.reindex(range(j))

# Fill in the 'Make' column with random makers using the .apply method
makers = list(maker_weights.keys())
old_low_miles['Make'] = old_low_miles.apply(lambda row: fill_random_maker(row, makers), axis=1)
old_low_miles['Model'] = old_low_miles.apply(lambda row: generate_model(row['Make']), axis=1)
old_low_miles['Year'] = old_low_miles.apply(lambda row: generate_old_year(), axis=1)
old_low_miles['Miles'] = old_low_miles.apply(lambda row: generate_low_miles(row['Year']), axis=1)
old_low_miles['Condition'] = old_low_miles.apply(lambda row: generate_positive_condition(row['Year']), axis=1)
old_low_miles['Value'] = old_low_miles.apply(lambda row: calculate_old_low_value(row), axis=1)
old_low_miles['Value'] = old_low_miles['Value'].astype('int')


old_low_miles

Unnamed: 0,Make,Model,Year,Miles,Condition,Value
0,Hyundai,Palisade,2008,61263,Fair,6685
1,Hyundai,Venue,1998,130633,Fair,2253
2,Jeep,Grand Wagoneer,1997,116125,Fair,3414
3,Honda,Fit,1995,106162,Excellent,3807
4,Subaru,Outback,2002,104859,Great,4641
...,...,...,...,...,...,...
9995,BMW,8 Series,1998,90167,Excellent,6925
9996,Lexus,LX,2004,70078,Good,7308
9997,Mercedes-Benz,A-Class,2009,67326,Excellent,11027
9998,Toyota,Corolla,1997,115987,Fair,2682


In [926]:
old_low_miles.sample(20)

Unnamed: 0,Make,Model,Year,Miles,Condition,Value
2044,Honda,Passport,2001,73364,Fair,3731
6848,Ford,Expedition,2000,94019,Good,4205
9806,Audi,A8,1996,114939,Good,4211
189,Tesla,Model S,2001,119973,Fair,4717
2333,Volvo,V60 Cross Country,1993,128845,Great,2863
7961,Volkswagen,Golf,2007,68256,Excellent,8368
6571,Ford,EcoSport,1995,107189,Fair,2633
369,Mazda,Mazda3,2002,65733,Great,3581
5083,Mazda,Mazda3,2001,95698,Great,3351
800,GMC,Yukon,1992,121865,Excellent,4339


In [927]:
new_high_miles = pd.DataFrame(columns= ['Make', 'Model', 'Year', 'Miles', 'Condition', 'Value'])

new_high_miles

Unnamed: 0,Make,Model,Year,Miles,Condition,Value


In [928]:
# Function to fill random makers
def fill_random_maker(row, makers):
    return random.choice(makers)

def generate_model(make):
    return random.choice(list(maker_model_mapping[make].keys()))

def generate_new_year():
    return int(np.random.choice(range(2020, 2024)))

def generate_high_miles(year):
    current_year = 2025
    age = current_year - year
    return max(0, int(age * (35000 + random.uniform(-5000, 10000))))

def generate_mixed_condition(year):
    return random.choice(['Great', 'Good', 'Fair'])

def miles_multiplier(miles):
    return max(0.01, min(1, 1 - (miles / 400000)))

def calculate_new_high_value(row, base_value= 17000):
    maker_weight = maker_weights[row['Make']]
    model_weight = maker_model_mapping[row['Make']][row['Model']]
    year_weight = year_weights[row['Year']]
    cond_weight = cond_weights[row['Condition'].lower()]
    miles_weight = miles_multiplier(row['Miles'])
    return base_value * ((maker_weight) * (model_weight) * (model_weight) * (year_weight) * (cond_weight) * miles_weight + 0.001) - 1000

In [929]:
# Number of records
k = int(i/25)

# Populate the DataFrame with i rows
new_high_miles = new_high_miles.reindex(range(k))

# Fill in the 'Make' column with random makers using the .apply method
makers = list(maker_weights.keys())
new_high_miles['Make'] = new_high_miles.apply(lambda row: fill_random_maker(row, makers), axis=1)
new_high_miles['Model'] = new_high_miles.apply(lambda row: generate_model(row['Make']), axis=1)
new_high_miles['Year'] = new_high_miles.apply(lambda row: generate_new_year(), axis=1)
new_high_miles['Miles'] = new_high_miles.apply(lambda row: generate_high_miles(row['Year']), axis=1)
new_high_miles['Condition'] = new_high_miles.apply(lambda row: generate_mixed_condition(row['Year']), axis=1)
new_high_miles['Value'] = new_high_miles.apply(lambda row: calculate_new_high_value(row), axis=1)
new_high_miles['Value'] = new_high_miles['Value'].astype('int')


new_high_miles

Unnamed: 0,Make,Model,Year,Miles,Condition,Value
0,Honda,Odyssey,2022,95221,Fair,12581
1,Toyota,Highlander,2022,113472,Fair,12351
2,Nissan,Sentra,2020,203827,Fair,4402
3,Audi,A8,2022,126933,Great,18397
4,Toyota,Highlander,2020,188348,Good,9482
...,...,...,...,...,...,...
3995,Ford,Explorer,2020,178084,Good,8009
3996,Volkswagen,Passat,2020,211522,Great,6219
3997,Porsche,Cayenne,2020,195677,Good,15270
3998,Tesla,Model X,2023,81194,Great,19266


In [930]:
all_cars_data = pd.concat(objs=[cars_data, old_low_miles, new_high_miles], axis=0)

all_cars_data

Unnamed: 0,Make,Model,Year,Miles,Condition,Value
0,Kia,K900,2020,90367,Good,16347
1,BMW,2 Series,2022,48783,Great,24672
2,GMC,Terrain,2021,41066,Great,20045
3,Volvo,XC90,2018,87144,Fair,16980
4,Porsche,Panamera,2011,202063,Good,12147
...,...,...,...,...,...,...
3995,Ford,Explorer,2020,178084,Good,8009
3996,Volkswagen,Passat,2020,211522,Great,6219
3997,Porsche,Cayenne,2020,195677,Good,15270
3998,Tesla,Model X,2023,81194,Great,19266


In [931]:
all_cars_data.to_csv("car_value_training_data_v2.csv")