In [2]:
import pandas
import numpy

In [3]:
# !mkdir -p ~/.ml/data/car

In [4]:
# !kaggle datasets download -d karimali/used-cars-data-pakistan

In [5]:
# !unzip used-cars-data-pakistan.zip

In [6]:
# !mv OLX_Car_Data_CSV.csv ~/.ml/data/car

In [7]:
carDataDirectory = "~/.ml/data/car/"
carDataCSV = "{}OLX_Car_Data_CSV.csv".format(carDataDirectory)

In [8]:
carsDF = pandas.read_csv(carDataCSV, header=0, encoding='unicode_escape')

In [9]:
carsDF.head(3)

Unnamed: 0,Brand,Condition,Fuel,KMs Driven,Model,Price,Registered City,Transaction Type,Year
0,Toyota,Used,Diesel,1.0,Prado,2100000,Karachi,Cash,1997.0
1,Suzuki,Used,Petrol,100000.0,Bolan,380000,Karachi,Cash,2006.0
2,Suzuki,Used,CNG,12345.0,Bolan,340000,Karachi,Cash,1998.0


In [10]:
categoricalVariables = ['Brand', 'Condition', 'Fuel','Model','Registered City','Transaction Type']
continuousVariables = ['Year','KMs Driven']

In [11]:
cleanedCarsDF = carsDF.dropna()

In [12]:
categoricalDF = cleanedCarsDF[categoricalVariables]

In [13]:
continuousDF = cleanedCarsDF[continuousVariables]

In [14]:
from functools import reduce

In [15]:
categoricalVariableDictionary = {}
for categoricalVariable in categoricalVariables:
    vocab = categoricalDF[categoricalVariable].str.lower().unique()
    def _toDictionary(dictionary, vocabItem):
        index, vocabToken = vocabItem
        dictionary[vocabToken] = index
        return dictionary
    categoricalVariableDictionary[categoricalVariable] = reduce(_toDictionary, enumerate(vocab), {})

In [16]:
for key, dick in categoricalVariableDictionary.items():
    print("{}: has this many: {}".format(key.upper(), len(dick.items())))

BRAND: has this many: 23
CONDITION: has this many: 2
FUEL: has this many: 5
MODEL: has this many: 296
REGISTERED CITY: has this many: 61
TRANSACTION TYPE: has this many: 2


In [17]:
for categoricalVar, dictionaryDude in categoricalVariableDictionary.items():
    categoricalDF[categoricalVar] = categoricalDF[categoricalVar].apply(lambda item: dictionaryDude[item.lower()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
xDataSet = categoricalDF.join(continuousDF, how='outer')

In [19]:
yDataSet = cleanedCarsDF.Price

In [20]:
from sklearn.metrics import mean_absolute_error

In [21]:
from sklearn.model_selection import train_test_split

xTrainingSet, xValidationSet, yTrainingSet, yValidationSet = train_test_split(xDataSet, yDataSet, random_state=0)

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [23]:
# !pip3 install xgboost==0.80

In [33]:
def calculateLoss(model, inputValidationSet, expectedValidationSet):
    preds = model.predict(inputValidationSet)
    loss = mean_absolute_error(expectedValidationSet, preds)
    print(f"Mean absolute error {loss}")

In [99]:
from xgboost import XGBRegressor

usedCarModel = XGBRegressor(n_estimators=100, learning_rate=1e-1, n_jobs=-1)

In [100]:
usedCarModel.fit(xTrainingSet, yTrainingSet, 
            eval_set=[(xValidationSet, yValidationSet)],
            verbose=False
           )

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [101]:
calculateLoss(usedCarModel, xValidationSet, yValidationSet)
calculateLoss(usedCarModel, xTrainingSet, yTrainingSet)

Mean absolute error 241490.93267058727
Mean absolute error 254450.31918500256


In [102]:
from sklearn.model_selection import cross_val_score

In [103]:
losses = -1 * cross_val_score(usedCarModel, 
                              xDataSet, 
                              yDataSet, 
                              cv=5, 
                              scoring='neg_mean_absolute_error')

In [104]:
losses

array([222503.10791796, 216658.33745047, 210447.50386783, 283831.81132006,
       361413.83381685])

In [105]:
losses.mean()

258970.91887463434

In [106]:
from sklearn.ensemble import RandomForestRegressor

In [107]:
usedCarModelV2 = RandomForestRegressor(random_state=0, n_estimators=1000, n_jobs=-1)

In [108]:
lossesV2 = -1 * cross_val_score(usedCarModelV2, 
                              xDataSet, 
                              yDataSet, 
                              cv=5, 
                              scoring='neg_mean_absolute_error')

In [109]:
lossesV2

array([166980.06648262, 150771.24607588, 137173.19493211, 264320.20788601,
       359015.69733446])

In [110]:
lossesV2.mean()

215652.08254221725