In [365]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
import seaborn as sns

Filtering data:

In [None]:
cars_df = pd.read_csv('data/car_prices.csv')
cars_df['make'] = cars_df['make'].str.lower()
toyota_df = cars_df[(cars_df['make'] == 'toyota') | (cars_df['make'] == 'lexus') | (cars_df['make'] == 'scion')]
toyota_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53633 entries, 57 to 558806
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          53633 non-null  int64  
 1   make          53633 non-null  object 
 2   model         53633 non-null  object 
 3   trim          53627 non-null  object 
 4   body          53419 non-null  object 
 5   transmission  48426 non-null  object 
 6   vin           53633 non-null  object 
 7   state         53633 non-null  object 
 8   condition     52506 non-null  float64
 9   odometer      53625 non-null  float64
 10  color         53614 non-null  object 
 11  interior      53614 non-null  object 
 12  seller        53633 non-null  object 
 13  mmr           53633 non-null  float64
 14  sellingprice  53633 non-null  float64
 15  saledate      53633 non-null  object 
dtypes: float64(4), int64(1), object(11)
memory usage: 7.0+ MB


Filling the missing values:

In [367]:
toyota_df.isna().sum()

year               0
make               0
model              0
trim               6
body             214
transmission    5207
vin                0
state              0
condition       1127
odometer           8
color             19
interior          19
seller             0
mmr                0
sellingprice       0
saledate           0
dtype: int64

In [368]:
bodies_dict = {
    'gx' : 'suv',
    'tundra' : 'truck',
    'sienna' : 'minivan',
    'lx' : 'suv',
    'corolla' : 'sedan',
    'matrix' : 'minivan',
    'camry' : 'sedan',
    'yaris' : 'hatchback',
    'previa' : 'minivan',
    'pickup' : 'truck',
    'avalon' : 'sedan'}

toyota_df.loc[toyota_df['body'].isnull(), 'body'] = toyota_df['model'].map(bodies_dict)

In [369]:
def get_fillna(col):
    return_dict = {}
    for now in toyota_df[toyota_df[col].isnull()]['model'].value_counts().index:
        mask = toyota_df['model'] == now
        return_dict[now] = toyota_df[mask][col].value_counts().index[0]
    return return_dict

toyota_df.loc[toyota_df['trim'].isnull(), 'trim'] = toyota_df['model'].map(get_fillna('trim'))
toyota_df.loc[toyota_df['transmission'].isnull(), 'transmission'] = toyota_df['model'].map(get_fillna('transmission'))
toyota_df.loc[toyota_df['color'].isnull(), 'color'] = toyota_df['model'].map(get_fillna('color'))
toyota_df.loc[toyota_df['interior'].isnull(), 'interior'] = toyota_df['model'].map(get_fillna('interior'))

In [370]:
toyota_df['model'] = toyota_df['model'].apply(lambda x: str.lower(x))
toyota_df['model']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



57         rx 350
359        gs 350
386        es 350
407        rx 350
413       rx 450h
           ...   
558742     matrix
558745    4runner
558759     tundra
558763     tundra
558806     rx 350
Name: model, Length: 53633, dtype: object

In [371]:
def get_con(row):
    mask = (toyota_df['model'] == row['model']) & (toyota_df['year'] == row['year'])& (toyota_df['trim'] == row['trim'])
    price_to_find = row['sellingprice']
    con = toyota_df[mask].iloc[(toyota_df[mask]['sellingprice'] - price_to_find).abs().argsort()[:2]]['condition'].mean()  
    if math.isnan(con):
        mask = (toyota_df['model'] == row['model']) & (toyota_df['trim'] == row['trim'])
        price_to_find = row['year']
        con = toyota_df[mask].iloc[(toyota_df[mask]['sellingprice'] - price_to_find).abs().argsort()[:2]]['condition'].mean()  
    if math.isnan(con):
        mask = (toyota_df['model'] == row['model'])
        price_to_find = row['sellingprice']
        con = toyota_df[mask].iloc[(toyota_df[mask]['sellingprice'] - price_to_find).abs().argsort()[:2]]['condition'].mean()  
    if math.isnan(con):
        mask = (toyota_df['model'] == row['model']) & (toyota_df['year'] == row['year'])& (toyota_df['trim'] == row['trim'])
        con = toyota_df[mask]['condition'].mean()
    return con

toyota_df.loc[toyota_df['condition'].isna(), 'condition'] = toyota_df[toyota_df['condition'].isna()].apply(get_con, axis=1)

In [372]:
def get_odo(row):
    mask = (toyota_df['model'] == row['model']) & (toyota_df['year'] == row['year'])& (toyota_df['trim'] == row['trim'])
    price_to_find = row['sellingprice']
    odo = toyota_df[mask].iloc[(toyota_df[mask]['sellingprice'] - price_to_find).abs().argsort()[:2]]['odometer'].mean()  
    if math.isnan(odo):
        mask = (toyota_df['model'] == row['model']) & (toyota_df['trim'] == row['trim'])
        price_to_find = row['year']
        odo = toyota_df[mask].iloc[(toyota_df[mask]['sellingprice'] - price_to_find).abs().argsort()[:2]]['odometer'].mean()  
    if math.isnan(odo):
        mask = (toyota_df['model'] == row['model'])
        price_to_find = row['sellingprice']
        odo = toyota_df[mask].iloc[(toyota_df[mask]['sellingprice'] - price_to_find).abs().argsort()[:2]]['odometer'].mean()  
    if math.isnan(odo):
        mask = (toyota_df['model'] == row['model']) & (toyota_df['year'] == row['year'])& (toyota_df['trim'] == row['trim'])
        odo = toyota_df[mask]['odometer'].mean()
    return odo

toyota_df.loc[toyota_df['odometer'].isna(), 'odometer'] = toyota_df[toyota_df['odometer'].isna()].apply(get_con, axis=1)

Feature processing:

In [373]:
toyota_df['saledate'] = pd.to_datetime(toyota_df['saledate'], utc=True)



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [374]:
toyota_df['saledate']

57       2014-12-29 16:00:00+00:00
359      2014-12-17 04:30:00+00:00
386      2014-12-18 04:00:00+00:00
407      2014-12-17 04:30:00+00:00
413      2014-12-17 04:30:00+00:00
                    ...           
558742   2015-06-18 21:45:00+00:00
558745   2015-06-18 22:00:00+00:00
558759   2015-07-07 02:30:00+00:00
558763   2015-07-09 02:00:00+00:00
558806   2015-07-05 23:00:00+00:00
Name: saledate, Length: 53633, dtype: datetime64[ns, UTC]

In [375]:
toyota_df['sale_month'] = toyota_df['saledate'].dt.month
toyota_df['sale_year'] = toyota_df['saledate'].dt.year
toyota_df['sale_day'] = toyota_df['saledate'].dt.day_name()
toyota_df['sale_time'] = toyota_df['saledate'].dt.time
toyota_df['saledate'] = toyota_df['saledate'].dt.date



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [376]:
fig = px.treemap(toyota_df.groupby('model', as_index=False).count(), path=['model'], values='state')
fig.show()

In [377]:
hist_data = toyota_df.groupby('saledate', as_index=False)['sellingprice'].sum()
fig = px.histogram(x=hist_data['saledate'], y=hist_data['sellingprice'])
fig.show()

In [380]:
outliers = toyota_df[toyota_df['year'] > toyota_df['sale_year']]
toyota_df.drop(outliers.index, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [381]:
fig = px.histogram(toyota_df[['sellingprice']])
fig.show()

In [390]:
x = toyota_df['sellingprice']
q1, q2 = x.quantile(0.25), x.quantile(0.75)
iqr = q2 - q1
lower = q1 - (1.5*iqr)
upper = q2 + (1.5 * iqr)
outliers_iqr = toyota_df[(x<lower)|(x>upper)]
toyota_df = toyota_df[(x>lower)&(x<upper)]
fig = px.histogram(toyota_df['sellingprice'])
fig.show()


In [None]:
toyota_df.to_csv('data/toyota_sales.csv')