In [1]:
import pandas
import numpy

In [2]:
# !mkdir -p ~/.ml/data/car

In [3]:
# !kaggle datasets download -d karimali/used-cars-data-pakistan

In [4]:
# !unzip used-cars-data-pakistan.zip

In [5]:
# !mv OLX_Car_Data_CSV.csv ~/.ml/data/car

In [6]:
carDataDirectory = "~/.ml/data/car/"
carDataCSV = "{}OLX_Car_Data_CSV.csv".format(carDataDirectory)

In [7]:
carsDF = pandas.read_csv(carDataCSV, header=0, encoding='unicode_escape')

In [8]:
carsDF.head(3)

Unnamed: 0,Brand,Condition,Fuel,KMs Driven,Model,Price,Registered City,Transaction Type,Year
0,Toyota,Used,Diesel,1.0,Prado,2100000,Karachi,Cash,1997.0
1,Suzuki,Used,Petrol,100000.0,Bolan,380000,Karachi,Cash,2006.0
2,Suzuki,Used,CNG,12345.0,Bolan,340000,Karachi,Cash,1998.0


In [9]:
categoricalVariables = ['Brand', 'Condition', 'Fuel','Model','Registered City','Transaction Type']
continuousVariables = ['Year','KMs Driven']

In [10]:
cleanedCarsDF = carsDF.dropna(axis=0)

In [11]:
categoricalDF = cleanedCarsDF[categoricalVariables]

In [12]:
continuousDF = cleanedCarsDF[continuousVariables]

In [13]:
continuousDF.describe()

Unnamed: 0,Year,KMs Driven
count,20334.0,20334.0
mean,2005.460067,135120.6
std,9.50177,627844.0
min,1915.0,1.0
25%,2001.0,19181.25
50%,2007.0,70000.0
75%,2013.0,100000.0
max,2020.0,10000000.0


In [14]:
continuousDF['Year'].unique()

array([1997., 2006., 1998., 2010., 2013., 2012., 2017., 2009., 1994.,
       1984., 2005., 1988., 1995., 1990., 2014., 1989., 2001., 2000.,
       2007., 2002., 2015., 1999., 1991., 1982., 2004., 2003., 2011.,
       1987., 1993., 2016., 1986., 2008., 2018., 1981., 1978., 1996.,
       1985., 1992., 1980., 1983., 1973., 1964., 1969., 1952., 1979.,
       1974., 1976., 1962., 1967., 1970., 1972., 1960., 1951., 1963.,
       1977., 1971., 1975., 1925., 1968., 2020., 1943., 2019., 1956.,
       1915., 1965.])

In [15]:
from functools import reduce

In [16]:
categoricalVariableDictionary = {}
for categoricalVariable in categoricalVariables:
    vocab = categoricalDF[categoricalVariable].str.lower().unique()
    def _toDictionary(dictionary, vocabItem):
        index, vocabToken = vocabItem
        dictionary[vocabToken] = index
        return dictionary
    categoricalVariableDictionary[categoricalVariable] = reduce(_toDictionary, enumerate(vocab), {})

In [17]:
for key, dick in categoricalVariableDictionary.items():
    print("{}: has this many: {}".format(key.upper(), len(dick.items())))

BRAND: has this many: 23
CONDITION: has this many: 2
FUEL: has this many: 5
MODEL: has this many: 296
REGISTERED CITY: has this many: 61
TRANSACTION TYPE: has this many: 2


In [18]:
for categoricalVar, dictionaryDude in categoricalVariableDictionary.items():
    categoricalDF[categoricalVar] = categoricalDF[categoricalVar].apply(lambda item: dictionaryDude[item.lower()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
categoricalDF.head(-2)

Unnamed: 0,Brand,Condition,Fuel,Model,Registered City,Transaction Type
0,0,0,0,0,0,0
1,1,0,1,1,0,0
2,1,0,2,1,0,0
3,1,0,1,2,0,0
4,0,0,1,3,0,0
...,...,...,...,...,...,...
24964,1,0,2,21,10,0
24965,0,0,2,31,0,0
24966,0,0,1,232,5,0
24968,0,0,2,31,5,0


In [20]:
continuousDF.head(-2)

Unnamed: 0,Year,KMs Driven
0,1997.0,1.0
1,2006.0,100000.0
2,1998.0,12345.0
3,2010.0,94000.0
4,2013.0,100000.0
...,...,...
24964,1989.0,100000.0
24965,1994.0,100000.0
24966,2011.0,76190.0
24968,2001.0,200000.0


In [21]:
xDataSet = categoricalDF.join(continuousDF, how='outer')

In [22]:
xDataSet.head()

Unnamed: 0,Brand,Condition,Fuel,Model,Registered City,Transaction Type,Year,KMs Driven
0,0,0,0,0,0,0,1997.0,1.0
1,1,0,1,1,0,0,2006.0,100000.0
2,1,0,2,1,0,0,1998.0,12345.0
3,1,0,1,2,0,0,2010.0,94000.0
4,0,0,1,3,0,0,2013.0,100000.0


In [23]:
yDataSet = cleanedCarsDF['Price']

In [24]:
yDataSet.head()

0    2100000
1     380000
2     340000
3     535000
4    1430000
Name: Price, dtype: int64

In [25]:
from sklearn.tree import DecisionTreeRegressor

usedCarModel = DecisionTreeRegressor(random_state=1)


usedCarModel.fit(xDataSet, yDataSet)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=1, splitter='best')

In [26]:
cleanedCarsDF.head()

Unnamed: 0,Brand,Condition,Fuel,KMs Driven,Model,Price,Registered City,Transaction Type,Year
0,Toyota,Used,Diesel,1.0,Prado,2100000,Karachi,Cash,1997.0
1,Suzuki,Used,Petrol,100000.0,Bolan,380000,Karachi,Cash,2006.0
2,Suzuki,Used,CNG,12345.0,Bolan,340000,Karachi,Cash,1998.0
3,Suzuki,Used,Petrol,94000.0,Alto,535000,Karachi,Cash,2010.0
4,Toyota,Used,Petrol,100000.0,Corolla XLI,1430000,Karachi,Cash,2013.0


In [27]:
usedCarModel.predict(xDataSet.head())

array([2100000.,  380000.,  347500.,  540000., 1430000.])

In [28]:
from sklearn.metrics import mean_absolute_error

predictedCarPrices = usedCarModel.predict(xDataSet)

mean_absolute_error(yDataSet, predictedCarPrices)

8215.43805840728

In [38]:
from sklearn.model_selection import train_test_split

xTrainingSet, xValidationSet, yTrainingSet, yValidationSet = train_test_split(xDataSet, yDataSet, random_state=0)

usedCarModelV2 = DecisionTreeRegressor()

In [39]:
usedCarModelV2.fit(xTrainingSet, yTrainingSet)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [41]:
predictedCarPricesV2 = usedCarModelV2.predict(xValidationSet)

mean_absolute_error(yValidationSet, predictedCarPricesV2)

175569.10593955175

In [42]:
cleanedCarsDF.describe()

Unnamed: 0,KMs Driven,Price,Year
count,20334.0,20334.0,20334.0
mean,135120.6,869286.3,2005.460067
std,627844.0,1527317.0,9.50177
min,1.0,50000.0,1915.0
25%,19181.25,370000.0,2001.0
50%,70000.0,625000.0,2007.0
75%,100000.0,1080000.0,2013.0
max,10000000.0,87654320.0,2020.0


In [43]:
def getMeanAbsoluteError(maxLeafs, _xTrainingSet, _yTrainingSet, _xValidationSet, _yValidationSet):
    model = DecisionTreeRegressor(max_leaf_nodes=maxLeafs, random_state=0)
    model.fit(_xTrainingSet, _yTrainingSet)
    predictions = model.predict(_xValidationSet)
    loss = mean_absolute_error(_yValidationSet, predictions)
    return loss


In [44]:
for maxLeafs in [5, 50, 500, 5000]:
    lossyBoi = getMeanAbsoluteError(maxLeafs, xTrainingSet, yTrainingSet, xValidationSet, yValidationSet)
    print(f'Max Leaves:{maxLeafs}\t\tLoss:{lossyBoi}')

Max Leaves:5		Loss:400919.6408191887
Max Leaves:50		Loss:270007.6696610341
Max Leaves:500		Loss:187359.68528790644
Max Leaves:5000		Loss:176285.25615462306
