In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv(r'dataset.csv')

In [3]:
data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Model
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75,Maruti,Wagon
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5,Hyundai,Creta
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5,Honda,Jazz
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0,Maruti,Ertiga
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74,Audi,A4


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
Name                 6019 non-null object
Location             6019 non-null object
Year                 6019 non-null int64
Kilometers_Driven    6019 non-null int64
Fuel_Type            6019 non-null object
Transmission         6019 non-null object
Owner_Type           6019 non-null object
Mileage              6017 non-null float64
Engine               5983 non-null float64
Power                5983 non-null float64
Seats                5977 non-null float64
Price                6019 non-null float64
Brand                6019 non-null object
Model                6019 non-null object
dtypes: float64(5), int64(2), object(7)
memory usage: 658.5+ KB


In [5]:
import matplotlib.pyplot as plt

data.hist(bins=50, figsize=(20,15)) #plot all numerical attributes
plt.show()

<Figure size 2000x1500 with 9 Axes>

In [6]:
'''
#stratified sampling
from sklearn.model_selection import StratifiedShuffleSplit

# Random state to ensure that same lines are used every time and prevent data snooping
# Selecting random instaces from each Brand
split = StratifiedShuffleSplit(n_splits=1,test_size =0.2, random_state=42)

for train_index, test_index in split.split(data,data['Brand']):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]
'''

"\n#stratified sampling\nfrom sklearn.model_selection import StratifiedShuffleSplit\n\n# Random state to ensure that same lines are used every time and prevent data snooping\n# Selecting random instaces from each Brand\nsplit = StratifiedShuffleSplit(n_splits=1,test_size =0.2, random_state=42)\n\nfor train_index, test_index in split.split(data,data['Brand']):\n    strat_train_set = data.loc[train_index]\n    strat_test_set = data.loc[test_index]\n"

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, random_state = 42, test_size = 0.2)

In [8]:
# creating a copy of train dataset
train_data = train.copy()
#seperate predicators and labels
train_data = train.drop('Price', axis = 1)
train_labels = train['Price']

In [9]:
train_data.columns

Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'Brand', 'Model'],
      dtype='object')

In [10]:
# custom DataFrameSelector class for column transformation 
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

num_attrib = ['Year','Kilometers_Driven','Mileage', 'Engine', 'Power', 'Seats']
cat_attrib = ['Location','Fuel_Type','Transmission','Owner_Type','Brand','Model']

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attrib)),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler',StandardScaler())
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attrib)),
    ('one_hot_encoder', OneHotEncoder())
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipepline',num_pipeline),
    ('cat_pipeline',cat_pipeline)
])

cars_train_prepared = full_pipeline.fit_transform(train_data)
cars_train_prepared

<4815x263 sparse matrix of type '<class 'numpy.float64'>'
	with 57780 stored elements in Compressed Sparse Row format>

In [12]:
np.save('cars_train_prepared.npy', cars_train_prepared)

In [13]:
train_labels.to_csv('train_labels.csv', index= False)

  """Entry point for launching an IPython kernel.


In [14]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(cars_train_prepared,train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
labels_pred = lin_reg.predict(cars_train_prepared)

In [16]:
from sklearn.metrics import r2_score, accuracy_score
print(r2_score(train_labels,labels_pred))

0.859070452178724


In [17]:
print(train_labels,labels_pred)

4248     1.95
4129     7.52
2534     8.75
2893     6.95
2860     6.00
        ...  
3772     6.75
5191    32.90
5226     9.00
5390     4.95
860      5.50
Name: Price, Length: 4815, dtype: float64 [-0.05304396  6.93972504  9.23471835 ...  7.94004674  4.89264464
  7.26606659]


In [18]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(train_labels,labels_pred)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

4.208347915395733

In [19]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lin_reg, cars_train_prepared,train_labels,
                         scoring = "neg_mean_absolute_error",cv=10)
rmse_scores = np.sqrt(-scores)
rmse_scores.mean()

1.5636827057406308

In [20]:
#A low training error but high cross validation error means its overfit.
#A high training error but low cross validation error means its underfit.

In [21]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
tree.fit(cars_train_prepared,train_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [22]:
labels = tree.predict(cars_train_prepared)

In [23]:
from sklearn.metrics import mean_squared_error
tree_mse = mean_squared_error(train_labels,labels)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.01932962688716707

In [24]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree, cars_train_prepared,train_labels,
                         scoring = "neg_mean_absolute_error",cv=10)
rmse_scores = np.sqrt(-scores)
rmse_scores.mean()

1.3715158334079576

In [25]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()
forest.fit(cars_train_prepared,train_labels)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [26]:
labels = forest.predict(cars_train_prepared)

In [27]:
from sklearn.metrics import mean_squared_error
forest_mse = mean_squared_error(train_labels,labels)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

1.546966287781166

In [28]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(forest, cars_train_prepared,train_labels,
                         scoring = "neg_mean_absolute_error",cv=10)
rmse_scores = np.sqrt(-scores)
rmse_scores.mean()

1.213791400771885

In [29]:
print(train_labels, labels)

4248     1.95
4129     7.52
2534     8.75
2893     6.95
2860     6.00
        ...  
3772     6.75
5191    32.90
5226     9.00
5390     4.95
860      5.50
Name: Price, Length: 4815, dtype: float64 [2.209 7.111 9.5   ... 8.674 4.82  5.885]


In [30]:
from sklearn.metrics import r2_score
r2_score(train_labels, labels)

0.9809567756657144