In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
import plotly.express as px
from collections import defaultdict


In [None]:
df = pd.read_csv('data/raw/used_car_sales.csv')

In [4]:
df.describe()

Unnamed: 0,ID,pricesold,yearsold,Mileage,Year,NumCylinders
count,122144.0,122144.0,122144.0,122144.0,122144.0,122144.0
mean,85094.212397,10808.560715,2019.375467,1404291.0,3959.362,17586.55
std,47786.970812,13987.29576,0.503671,33355930.0,198451.4,6144603.0
min,1.0,0.0,2018.0,0.0,0.0,0.0
25%,44547.25,2950.0,2019.0,44792.25,1977.0,4.0
50%,85555.5,6500.0,2019.0,90000.0,2000.0,6.0
75%,127078.5,13800.0,2020.0,140200.0,2008.0,8.0
max,165801.0,404990.0,2020.0,1235669000.0,20140000.0,2147484000.0


In [5]:
df.shape

(122144, 13)

In [6]:
df['Year'] = df['Year'].astype('int')
df['Year'] = df['Year'].astype('str')


In [7]:
df.head()

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Trim,Engine,BodyType,NumCylinders,DriveType
0,137178,7500,2020,786**,84430,Ford,Mustang,1988,LX,5.0L Gas V8,Sedan,0,RWD
1,96705,15000,2019,81006,0,Replica/Kit Makes,Jaguar Beck Lister,1958,,383 Fuel injected,Convertible,8,RWD
2,119660,8750,2020,33449,55000,Jaguar,XJS,1995,2+2 Cabriolet,4.0L In-Line 6 Cylinder,Convertible,6,RWD
3,80773,11600,2019,07852,97200,Ford,Mustang,1968,Stock,289 cu. in. V8,Coupe,8,RWD
4,64287,44000,2019,07728,40703,Porsche,911,2002,Turbo X-50,3.6L,Coupe,6,AWD


By state, by make, by brand, by price, by milage, body

Big number: avg price, 

In [8]:
selected_zips = [zip for zip in list((df.zipcode.unique())) 
 if (len(str(zip)) == 5 and '*' not in zip)]
selected_df = df[df['zipcode'].isin(selected_zips)]
selected_df.shape

(82491, 13)

In [9]:
var_list = ["Make", "Model", "Year", "BodyType"]
level_count = {}
for var in var_list:
    count = len(selected_df[var].unique())
    level_count[var] = count
level_count

{'Make': 379, 'Model': 3033, 'Year': 136, 'BodyType': 1940}

In [10]:
px.histogram(selected_df["Year"],color=selected_df["Year"],
             title="Histogram by Year: Sale Count")


In [11]:
range_year = np.arange(1992, 1995)
range_year = list(range_year)
range_year = [str(x) for x in range_year]
range_year

['1992', '1993', '1994']

In [12]:
selected_df_year = selected_df.groupby('Model').count()
selected_df_year_count = selected_df_year.sort_values("ID", ascending=False)['ID']
selected_df_year_count = pd.DataFrame(selected_df_year_count)
selected_df_year_count.columns = ["Count"]
selected_df_year_count = selected_df_year_count.reset_index()
selected_df_year_count

Unnamed: 0,Model,Count
0,Mustang,2994
1,Corvette,2002
2,F-150,1740
3,Other,1533
4,Camaro,1448
...,...,...
3027,F-500 Dump Truck,1
3028,F-4,1
3029,F-350 Super Duty,1
3030,F-250 HD,1


In [13]:
px.bar(x = selected_df_year_count["Count"][:10], 
       y = selected_df_year_count["Model"][:10],
       title = "Top 10 sale in company: Across ",
       labels={'x': 'Sales', 'y': 'Model'})

In [14]:
selected_df_year_count["Model"][:10]

0          Mustang
1         Corvette
2            F-150
3            Other
4           Camaro
5    Other Pickups
6            F-250
7         Wrangler
8         3-Series
9             C-10
Name: Model, dtype: object

In [15]:
def model_trend_year(selected_df, model): 
    model_sale = selected_df[selected_df["Model"] == model].groupby("Year").count()[["ID"]]
    model_sale.columns = ["Count"]
    model_sale = model_sale.reset_index()
    model_sale["Model"] = model
    fig = px.line(model_sale, x='Year', y='Count', title=f'Car Sale Trend: {model}')
    return model_sale, fig

In [16]:
models = []
for model in selected_df_year_count["Model"][:5]:
    model_sale, _ = model_trend_year(selected_df, model)
    models.append(model_sale)
models = pd.concat(models, axis=0)
models

Unnamed: 0,Year,Count,Model
0,1932,1,Mustang
1,1937,1,Mustang
2,1952,1,Mustang
3,1953,1,Mustang
4,1963,2,Mustang
...,...,...,...
44,2018,19,Camaro
45,2019,8,Camaro
46,2020,1,Camaro
47,68,1,Camaro


In [17]:
replacements = {'200': '2000', '84': '1984', '68': '1968', '88': '1988'}
models['Year'] = models['Year'].replace(replacements)
models = models.sort_values('Year', ascending=True)
models

Unnamed: 0,Year,Count,Model
0,1886,1,Other
1,1900,1,Other
0,1900,1,Corvette
2,1903,1,Other
3,1916,1,Other
...,...,...,...
45,2019,8,Camaro
54,2019,20,F-150
66,2020,1,Corvette
46,2020,1,Camaro


In [18]:
fig = px.line(models, x='Year', y='Count', title=f'Car Sale Trend: Top 5 Model', color = 'Model')
fig.show()

In [34]:
list(selected_df_year_count["Model"][1:2])

['Corvette']

In [48]:
# replacements = {'200': '2000', '84': '1984', '68': '1968', '88': '1988'}
# selected_df['Year'] = selected_df['Year'].replace(replacements)
# model_sale = model_sale.sort_values('Year', ascending=True)

# _,baseline_plot = model_trend_year(selected_df, "Mustang")
# for model in list(selected_df_year_count["Model"][1:2]):
#     model_sale, _ = model_trend_year(selected_df, model)

#     replacements = {'200': '2000', '84': '1984', '68': '1968', '88': '1988'}
#     model_sale['Year'] = model_sale['Year'].replace(replacements)
#     model_sale = model_sale.sort_values('Year', ascending=True)
    
#     baseline_plot.add_scatter(x = model_sale['Year'],
#                               y = model_sale['Count'],
#                               mode = 'lines', name = model)
# baseline_plot

In [44]:
selected_df

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Trim,Engine,BodyType,NumCylinders,DriveType
1,96705,15000,2019,81006,0,Replica/Kit Makes,Jaguar Beck Lister,1958,,383 Fuel injected,Convertible,8,RWD
2,119660,8750,2020,33449,55000,Jaguar,XJS,1995,2+2 Cabriolet,4.0L In-Line 6 Cylinder,Convertible,6,RWD
3,80773,11600,2019,07852,97200,Ford,Mustang,1968,Stock,289 cu. in. V8,Coupe,8,RWD
4,64287,44000,2019,07728,40703,Porsche,911,2002,Turbo X-50,3.6L,Coupe,6,AWD
7,5250,70000,2019,07627,6500,Land Rover,Defender,1997,,4.0 Liter Fuel Injected V8,,0,4WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122139,14948,4200,2019,80233,102700,Ford,Mustang,1977,,302,Fastback,8,
122140,58814,6500,2019,53132,128000,Ford,E-Series Van,2012,,E-150,,0,
122141,2156,2000,2019,77536,50000,Ford,Bronco,1978,,351m,,8,4WD
122142,29096,2280,2019,92131,164337,BMW,3-Series,2000,328ci,M52TU 2.8L,Coupe,6,RWD
