In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from collections import defaultdict

In [2]:
df = pd.read_csv('../data/raw/used_car_sales.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122144 entries, 0 to 122143
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   ID            122144 non-null  int64 
 1   pricesold     122144 non-null  int64 
 2   yearsold      122144 non-null  int64 
 3   zipcode       121235 non-null  object
 4   Mileage       122144 non-null  int64 
 5   Make          122144 non-null  object
 6   Model         121571 non-null  object
 7   Year          122144 non-null  int64 
 8   Trim          73208 non-null   object
 9   Engine        94940 non-null   object
 10  BodyType      101358 non-null  object
 11  NumCylinders  122144 non-null  int64 
 12  DriveType     97287 non-null   object
dtypes: int64(6), object(7)
memory usage: 12.1+ MB


In [4]:
selected_zips = [zip for zip in list((df.zipcode.unique())) 
 if (len(str(zip)) == 5 and '*' not in zip)]
selected_df = df[df['zipcode'].isin(selected_zips)]
selected_df.shape

(82491, 13)

In [5]:
selected_df.head(10)

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Trim,Engine,BodyType,NumCylinders,DriveType
1,96705,15000,2019,81006,0,Replica/Kit Makes,Jaguar Beck Lister,1958,,383 Fuel injected,Convertible,8,RWD
2,119660,8750,2020,33449,55000,Jaguar,XJS,1995,2+2 Cabriolet,4.0L In-Line 6 Cylinder,Convertible,6,RWD
3,80773,11600,2019,7852,97200,Ford,Mustang,1968,Stock,289 cu. in. V8,Coupe,8,RWD
4,64287,44000,2019,7728,40703,Porsche,911,2002,Turbo X-50,3.6L,Coupe,6,AWD
7,5250,70000,2019,7627,6500,Land Rover,Defender,1997,,4.0 Liter Fuel Injected V8,,0,4WD
8,29023,1330,2019,7043,167000,Honda,Civic,2001,EX,,Coupe,4,FWD
9,80293,25200,2019,33759,15000,Pontiac,GTO,1970,,,,0,
11,72418,14100,2019,7014,109500,Jeep,Wrangler,2012,Unlimited,3.6L,SUV,6,4WD
12,91661,5700,2020,76051,1000000,Ford,F-100,1949,,,Standard Cab Pickup,8,RWD
19,59728,18550,2019,60448,6714,Chevrolet,Camaro,2002,"Z28,SS,SLP",5.7 liter v8,Coupe,8,RWD


In [6]:
var_list = ["Make", "Model", "Year", "BodyType"]
level_count = {}
for var in var_list:
    count = len(selected_df[var].unique())
    level_count[var] = count
level_count

{'Make': 379, 'Model': 3033, 'Year': 136, 'BodyType': 1940}

In [8]:
selected_df_make = selected_df.groupby('Make').count()
selected_df_make_count = selected_df_make.sort_values("ID", ascending=False)['ID']
selected_df_make_count = pd.DataFrame(selected_df_make_count)
selected_df_make_count.columns = ["Count"]
selected_df_make_count = selected_df_make_count.reset_index()
selected_df_make.to_csv('../data/raw/car_make_list.csv')

In [10]:
major_makes = {'AMC', 'Acura', 'Ford', 'BMW', 'Chevrolet', 'Dodge', 'Jeep', 'Audi', 'Buick', 'Cadillac', 'Chrysler', 'Ferrari', 'GMC', 'Honda', 
                'Hummer', 'Hyundai', 'Infiniti', 'Harvester', 'Jaguar', 'Kia', 'Land Rover', 'Lexus', 'Lincoln', 'MG', 'Maserati', 
                'Mazda', 'Mercedes-Benz', 'Mini', 'Mitsubishi', 'Nissan', 'Oldsmobile', 'Pontiac', 'Porsche', 'Rolls-Royce', 'Subaru', 
                'Suzuki', 'Tesla', 'Toyota', 'Triumph', 'Volkswagen', 'Volve'}

# Normalize the major makes to lowercase for matching
major_makes = {make.lower() for make in major_makes}

# Function to reclassify car makes
def reclassify_make(make):
    make_lower = make.lower()
    for major_make in major_makes:
        if major_make in make_lower:
            return major_make.title()  # Convert back to title case for presentation
    return 'Other'

# Reclassify the 'Make' column
selected_df['Reclassified Make'] = df['Make'].apply(reclassify_make)
selected_df['Reclassified Make'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['Reclassified Make'] = df['Make'].apply(reclassify_make)


Reclassified Make
Ford             14854
Chevrolet        14133
Other             5845
Toyota            4477
Mercedes-Benz     4196
Dodge             4035
Bmw               3598
Jeep              3106
Cadillac          2486
Volkswagen        2454
Honda             2394
Pontiac           1805
Gmc               1720
Nissan            1673
Porsche           1490
Lincoln           1292
Buick             1210
Chrysler          1040
Audi              1034
Subaru             923
Lexus              902
Jaguar             863
Land Rover         837
Oldsmobile         709
Mazda              678
Hyundai            489
Acura              465
Mini               446
Infiniti           393
Mg                 382
Harvester          352
Mitsubishi         346
Hummer             329
Kia                287
Triumph            277
Tesla              207
Amc                192
Maserati           159
Rolls-Royce        141
Ferrari            139
Suzuki             133
Name: count, dtype: int64

In [11]:
selected_df_make_rec = selected_df.groupby('Reclassified Make').count()
selected_df_make_rec_count = selected_df_make_rec.sort_values("ID", ascending=False)['ID']
selected_df_make_rec_count = pd.DataFrame(selected_df_make_rec_count)
selected_df_make_rec_count.columns = ["Count"]
selected_df_make_rec_count = selected_df_make_rec_count.reset_index()
selected_df_make_rec_count

Unnamed: 0,Reclassified Make,Count
0,Ford,14854
1,Chevrolet,14133
2,Other,5845
3,Toyota,4477
4,Mercedes-Benz,4196
5,Dodge,4035
6,Bmw,3598
7,Jeep,3106
8,Cadillac,2486
9,Volkswagen,2454


In [12]:
selected_df_make_rec_count['Count'].sum()

82491

In [13]:
selected_df_model = selected_df.groupby('Model').count()
selected_df_model_count = selected_df_model.sort_values("ID", ascending=False)['ID']
selected_df_model_count = pd.DataFrame(selected_df_model_count)
selected_df_model_count.columns = ["Count"]
selected_df_model_count = selected_df_model_count.reset_index()
selected_df_model_count.head(20)

Unnamed: 0,Model,Count
0,Mustang,2994
1,Corvette,2002
2,F-150,1740
3,Other,1533
4,Camaro,1448
5,Other Pickups,1303
6,F-250,1294
7,Wrangler,1152
8,3-Series,1111
9,C-10,1013
