# Exploring eBay Car Sales Data
The aim of the project is to explore and analyze used car listings from eBay Kleinanzeigen *(classifieds section of the German eBay website).*

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
autos = pd.read_csv("autos.csv", encoding="Latin-1")

In [14]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
dateCrawled            50000 non-null object
name                   50000 non-null object
seller                 50000 non-null object
offerType              50000 non-null object
price                  50000 non-null object
abtest                 50000 non-null object
vehicleType            44905 non-null object
yearOfRegistration     50000 non-null int64
gearbox                47320 non-null object
powerPS                50000 non-null int64
model                  47242 non-null object
odometer               50000 non-null object
monthOfRegistration    50000 non-null int64
fuelType               45518 non-null object
brand                  50000 non-null object
notRepairedDamage      40171 non-null object
dateCreated            50000 non-null object
nrOfPictures           50000 non-null int64
postalCode             50000 non-null int64
lastSeen               50000 non-null obj

**Observations**
 - The dataset contains 20 columns, most of which are strings.
 - Some columns have null values, but none have more than ~20% null values.
 - The column names use camelcase

**Cleaning column labels** - from camelcase to py's preferred snakecase

In [15]:
autos.columns;

In [16]:
autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
       'vehicle_type', 'year_of_registration', 'gearbox', 'power_ps', 'model',
       'odometer', 'month_of_registration', 'fuel_type', 'brand',
       'not_repaired_damage', 'date_created', 'nr_of_pictures', 'postal_code',
       'last_seen']

In [17]:
autos.head(2)

Unnamed: 0,date_crawled,name,seller,offer_type,price,abtest,vehicle_type,year_of_registration,gearbox,power_ps,model,odometer,month_of_registration,fuel_type,brand,not_repaired_damage,date_created,nr_of_pictures,postal_code,last_seen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08


In [18]:
autos.describe(include='all')

Unnamed: 0,date_crawled,name,seller,offer_type,price,abtest,vehicle_type,year_of_registration,gearbox,power_ps,model,odometer,month_of_registration,fuel_type,brand,not_repaired_damage,date_created,nr_of_pictures,postal_code,last_seen
count,50000,50000,50000,50000,50000,50000,44905,50000.0,47320,50000.0,47242,50000,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,2,2,2357,2,8,,2,,245,13,,7,40,2,76,,,39481
top,2016-03-23 19:38:20,Ford_Fiesta,privat,Angebot,$0,test,limousine,,manuell,,golf,"150,000km",,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,49999,49999,1421,25756,12859,,36993,,4024,32424,,30107,10687,35232,1946,,,8
mean,,,,,,,,2005.07328,,116.35592,,,5.72336,,,,,0.0,50813.6273,
std,,,,,,,,105.712813,,209.216627,,,3.711984,,,,,0.0,25779.747957,
min,,,,,,,,1000.0,,0.0,,,0.0,,,,,0.0,1067.0,
25%,,,,,,,,1999.0,,70.0,,,3.0,,,,,0.0,30451.0,
50%,,,,,,,,2003.0,,105.0,,,6.0,,,,,0.0,49577.0,
75%,,,,,,,,2008.0,,150.0,,,9.0,,,,,0.0,71540.0,


* Dropping column - nr_of_pictures, which has all null values 
* Converting columns - price, odometer from string to float

In [19]:
autos.drop('nr_of_pictures', axis=1)
autos["price"] = (autos["price"]
                  .str.replace("$","")
                  .str.replace(',','')
                  .astype(float))

In [20]:
autos["odometer"] = (autos["odometer"]
                     .str.replace("km","")
                     .str.replace(',','')
                     .astype(float))

In [21]:
autos["odometer"].head()

0    150000.0
1    150000.0
2     70000.0
3     70000.0
4    150000.0
Name: odometer, dtype: float64

In [22]:
autos.rename({"price":"price_dollar","odometer":"odometer_km"}, 
             axis=1, 
             inplace=True)

autos.dtypes

date_crawled              object
name                      object
seller                    object
offer_type                object
price_dollar             float64
abtest                    object
vehicle_type              object
year_of_registration       int64
gearbox                   object
power_ps                   int64
model                     object
odometer_km              float64
month_of_registration      int64
fuel_type                 object
brand                     object
not_repaired_damage       object
date_created              object
nr_of_pictures             int64
postal_code                int64
last_seen                 object
dtype: object

###  Analyzing 'prices_dollar'

In [23]:
autos["price_dollar"].describe()

count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price_dollar, dtype: float64

In [24]:
autos["price_dollar"].value_counts().sort_index(ascending=False).head(10)

99999999.0    1
27322222.0    1
12345678.0    3
11111111.0    2
10000000.0    1
3890000.0     1
1300000.0     1
1234566.0     1
999999.0      2
999990.0      1
Name: price_dollar, dtype: int64

In [25]:
autos["price_dollar"].value_counts().sort_index().head(10)

0.0     1421
1.0      156
2.0        3
3.0        1
5.0        2
8.0        1
9.0        1
10.0       7
11.0       2
12.0       3
Name: price_dollar, dtype: int64

**Outliers are outright visible. I will use IQR score to remove the outliers.**

In [26]:
Q1 = 1100
Q3 = 7200
IQR = 7200-1100

In [27]:
outliers = autos[(autos["price_dollar"] < (Q1 - 1.5 * IQR)) | (autos["price_dollar"] > (Q3 + 1.5 * IQR))]

In [28]:
outliers.shape

(3784, 20)

In [29]:
autos = autos[~((autos["price_dollar"] < (Q1 - 1.5*IQR)) | (autos["price_dollar"] > (Q3 + 1.5*IQR)))]

In [30]:
autos.shape

(46216, 20)

**Outliers are less than ~10% of the data and hence can be removed from the dataframe.**

In [31]:
autos["price_dollar"].value_counts().sort_index(ascending=False).head(10)

16350.0     9
16333.0     2
16300.0    11
16299.0     4
16290.0     3
16250.0     9
16200.0    17
16190.0     1
16150.0     2
16100.0     2
Name: price_dollar, dtype: int64

In [32]:
autos["price_dollar"].value_counts().sort_index().head(10)

0.0     1421
1.0      156
2.0        3
3.0        1
5.0        2
8.0        1
9.0        1
10.0       7
11.0       2
12.0       3
Name: price_dollar, dtype: int64

In [33]:
autos["price_dollar"].describe()

count    46216.000000
mean      3963.696101
std       3847.238683
min          0.000000
25%       1000.000000
50%       2500.000000
75%       5900.000000
max      16350.000000
Name: price_dollar, dtype: float64

### Analyzing 'odometer_km'
Using IQR method to remove the outliers.

In [34]:
autos["odometer_km"].describe()

count     46216.000000
mean     129603.275922
std       36811.596099
min        5000.000000
25%      125000.000000
50%      150000.000000
75%      150000.000000
max      150000.000000
Name: odometer_km, dtype: float64

In [35]:
outliers = autos[(autos["odometer_km"] < (125000 - 1.5 * 25000)) | (autos["odometer_km"] > (150000 + 1.5 * 25000))]

In [36]:
outliers.shape

(6252, 20)

In [37]:
autos = autos[~((autos["odometer_km"] < (125000 - 1.5*25000)) | (autos["odometer_km"] > (150000 + 1.5*25000)))]

In [16]:
autos.shape

(41520, 20)