In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
%matplotlib inline

Data comes from Kaggle: https://www.kaggle.com/irfanazeem/used-cars-sale-price

In [2]:
sales = pd.read_csv('car_train.csv')
sales_select = sales.loc[:, ['price', 'economy', 'odometer', 'year', 'model', 'colour']]
sales_select_nona = sales_select.dropna()

## Clean the data

The dataset is very large (38k rows). Dropping NAs could bias the data (perhaps older or more unreliable cars are more likely to have missing data?) and potentially this should be investigated in more detail. However, for the purposes of this exercise feature selection is more important. 

In [3]:
# Clean colour
normal_colours = ['White', 'Silver', 'Grey', 'Blue', 'Black', 'Red']
sales_select_nona.loc[~sales_select_nona.colour.isin(normal_colours), 'colour' ] = 'Other'
sales_select_nona.colour.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


White     8845
Silver    7140
Grey      5379
Blue      3990
Black     3509
Red       2053
Other     1891
Name: colour, dtype: int64

In [4]:
## Check models are clean

sales_select_nona.model.value_counts()

Forester    12163
RAV4        10403
Impreza     10241
Name: model, dtype: int64

In [None]:
sales_select_num = sales_select_nona.loc[:, ['price', 'economy', 'odometer', 'year']]

In [None]:
## Look for numeric outliers

sales_select_num = sales_select_nona.loc[:, ['price', 'economy', 'odometer', 'year']]
g= sns.PairGrid(sales_select_num, diag_sharey=False)
g.map_upper(plt.scatter, alpha=.5)
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))
g.map_diag(sns.kdeplot, lw=3)
plt.show()

In [None]:
## Price - outliers above $80,000

sales_select_nona.loc[sales_select_nona.price > 80000,]

In [None]:
## Four cars are over $80,000 and all are over $110,000.
## Brand new Imprezas cost under $20k, and RAV4s cost $25k.
## Potentially decimal point issue but unclear so will remove.

sales_select_nona = sales_select_nona.loc[sales_select_nona.price < 100000,]

In [None]:
## Odometer - outliers over 400,000 miles.

sales_select_nona.loc[sales_select_nona.odometer > 400000,]

In [None]:
## Removing the two with over 1 million miles.

sales_select_nona = sales_select_nona.loc[sales_select_nona.odometer < 1000000,]

In [None]:
## Replot pairplots

sales_select_num = sales_select_nona.loc[:, ['price', 'economy', 'odometer', 'year']]
g= sns.PairGrid(sales_select_num, diag_sharey=False)
g.map_upper(plt.scatter, alpha=.5)
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))
g.map_diag(sns.kdeplot, lw=3)
plt.show()

In [None]:
## Change df name to shorter one

ssn = sales_select_nona

## Univariate explorations

In [None]:
## Continuous variables

ssn.describe()

In [None]:
## Categorical variables

ssn.describe(include='object')

In [None]:
## Plot continuous variable histograms

ssn_num = ssn.loc[:, ['price', 'economy', 'odometer', 'year']]
ssn_num_long = pd.melt(ssn_num)
ssn_num_long.head()

g = sns.FacetGrid(ssn_num_long, col='variable', col_wrap=2,
                 sharex=False,
                 sharey=False)
g.map(plt.hist, 'value')
plt.show()

In [None]:
ssn_cat = ssn.loc[:, ['model', 'colour']]
ssn_model_counts = ssn_cat.model.value_counts().to_frame().reset_index()
ssn_model_counts.head()

In [None]:
## Plot model barplot

g = sns.barplot(x='model', y='index', data=ssn_model_counts)
g.set_xlabel('Number in dataset')
g.set_ylabel('Car model')
plt.show()

In [None]:
## Arrange car colours for plot

ssn_colour_counts = ssn_cat.colour.value_counts().to_frame().reset_index()
car_colours = ssn_colour_counts['index'].tolist()
car_colours_lower = list(map(lambda x: x.lower(), car_colours))

## Adjust for visibility

adjust_colours = {
    'white': 'whitesmoke',
    'other': 'hotpink'
}

car_colours_palette = [adjust_colours.get(item, item) for item in car_colours_lower]

In [None]:
## Car colours plot

g = sns.barplot(x='colour', y='index', data=ssn_colour_counts,
               palette=car_colours_palette)
g.set_xlabel('Number in dataset')
g.set_ylabel('Car colour')
plt.show()

## Bivariate explorations

The pairplot was already considered when cleaning the data but is replotted here for completeness.

In [None]:
g= sns.PairGrid(ssn_num, diag_sharey=False)
g.map_upper(plt.scatter, alpha=.5)
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))
g.map_diag(sns.kdeplot, lw=3)
plt.show()

In [None]:
## Model versus colour

ssn_model_colour = ssn.groupby(['model', 'colour']).count().reset_index()
ssn_model_colour = ssn_model_colour.loc[:, ['model', 'colour', 'price']]
ssn_model_colour.columns = ['model', 'colour', 'count']

sns.set(style='whitegrid')

g = sns.FacetGrid(ssn_model_colour, col='model')
g.map(sns.barplot,'count', 'colour', order=car_colours,
     palette=car_colours_palette)
plt.show()

In [None]:
## Model versus continuous variables

ssn_model_long = pd.melt(ssn.loc[:, ['model', 'price', 'economy', 'odometer', 'year']], id_vars='model')

sns.set(style='white')
g = sns.FacetGrid(ssn_model_long, col='variable', col_wrap=2,
                 sharey=False)
g.map(sns.boxplot, 'model', 'value', order=['Forester', 'Impreza', 'RAV4'])
plt.show()

In [None]:
ssn.groupby('model').describe().T

In [None]:
## Colour versus continous variables
## Could remove highest odometer values so charts not squashed?

ssn_colour_long = pd.melt(ssn.loc[:, ['colour', 'price', 'economy', 'odometer', 'year']], id_vars='colour')

g = sns.FacetGrid(ssn_colour_long, row='variable', aspect=4, sharey=False)
g.map(sns.violinplot, 'colour', 'value', order=car_colours,
     palette=car_colours_palette)
plt.show()

## Feature selection

In [None]:
features = pd.concat([pd.get_dummies(ssn['model']), pd.get_dummies(ssn['colour'])], axis=1)
features.head()

In [None]:
## Above average mileage
## Average mileage is around 15,000 per year

ssn_mileage = ssn.copy()
ssn_mileage['age'] = 2019 - ssn_mileage['year']
ssn_mileage['exp_mileage'] = ssn_mileage['age'] * 15000
ssn_mileage['mileage_status'] = np.where(ssn_mileage['exp_mileage'] > ssn_mileage['odometer'], 'below average', 'above average')
features['above_av_mileage'] = np.where(ssn_mileage.mileage_status == 'above average', 1, 0)
features.head()

In [None]:
## Year, economy and odometer are a bit correlated - use PCA

ssn_yeo = ssn.loc[:, ['year', 'economy', 'odometer']]

ssn_yeo_X = StandardScaler().fit_transform(ssn_yeo)

sklearn_pca = PCA(n_components=3) # Specify number of components you want out.
Y_sklearn = sklearn_pca.fit_transform(ssn_yeo_X)

sklearn_pca.explained_variance_ratio_

In [None]:
## Use first component only

features['pca_yeo'] = Y_sklearn[:, 0]
features.head()

In [None]:
## Year is not a particularly normal looking variable - try transform

fig = plt.figure()

fig.add_subplot(221)
plt.hist(ssn['year'])
plt.title('Raw')

fig.add_subplot(222)
plt.hist(np.log(ssn['year']))
plt.title('Log')

fig.add_subplot(223)
plt.hist(np.sqrt(ssn['year']))
plt.title('Sqrt')

fig.add_subplot(224)
plt.hist(1/(ssn['year']))
plt.title('Inverse')

plt.tight_layout()
plt.show()

In [None]:
## None were good. Try age of car instead of year

fig = plt.figure()

fig.add_subplot(221)
plt.hist(ssn_mileage['age'], bins=15)
plt.title('Raw')

fig.add_subplot(222)
plt.hist(np.log(ssn_mileage['age']), bins=15)
plt.title('Log')

fig.add_subplot(223)
plt.hist(np.sqrt(ssn_mileage['age']), bins=15)
plt.title('Sqrt')

fig.add_subplot(224)
plt.hist(1/(ssn_mileage['age']), bins=15)
plt.title('Inverse')

plt.tight_layout()
plt.show()

In [None]:
## Not much better but perhaps square root looks most normal?

features['sqrt_age'] = np.sqrt(ssn_mileage['age'])
features.head()

In [None]:
## Odometer is non normal
## Add 1 to remove zero problems

fig = plt.figure()

fig.add_subplot(221)
plt.hist(ssn['odometer']+1)
plt.title('Raw')

fig.add_subplot(222)
plt.hist(np.log(ssn['odometer']+1))
plt.title('Log')

fig.add_subplot(223)
plt.hist(np.sqrt(ssn['odometer']+1))
plt.title('Sqrt')

fig.add_subplot(224)
plt.hist(1/(ssn['odometer']+1))
plt.title('Inverse')

plt.tight_layout()
plt.show()

In [None]:
features['sqrt_odometer'] = np.sqrt(ssn['odometer'] +1)

In [None]:
## Higher order relationships

sns.regplot(
    ssn_mileage['economy'],
    ssn_mileage['age'],
    order=3,
    scatter_kws={'alpha':0.3},
    line_kws={'color':'black'},
    ci=None
)
plt.show()

In [None]:
features['economy_cubed'] = ssn['economy']**3
features.head()

## Filtering

In [None]:
features['price'] = ssn['price']
features_corr = features.corr()
sns.heatmap(features_corr, cmap='PRGn', center=0)
plt.show()

In [None]:
features_corr.price

In [None]:
## RAV4, Black, Blue, Grey, Other, Red and White all have 
## under abs(0.1) correlation with price so remove as pretty useless.

features_select = features.loc[:, ['Forester', 'Impreza', 'Silver', 'above_av_mileage', 'pca_yeo', 'sqrt_age', 'sqrt_odometer', 'economy_cubed']]
features_variance = features_select.apply(lambda x: np.var(x))
features_variance

In [None]:
## sqrt_odometer and economy_cubed are much larger than the others
## simply as a result of the scales on which the variables are on.