In [1]:
# First we will upload elementary libraries
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Then comes the data, we're making copy of our original data set and work with it
df1=pd.read_csv("CarPrice.csv")
dfcars=df1.copy()

In [3]:
dfcars.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [4]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, feature_names ):
        self._feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        return X[ self._feature_names ] 

In [5]:
class NumericalTransformer(BaseEstimator, TransformerMixin):
    #Class Constructor
    def __init__( self, avgmpg = True ):
        self._avgmpg = avgmpg

    #Return self, nothing else to do here
    def fit( self, X, y = None ):
        return self 
    
    #Custom transform method we wrote that creates aformentioned features and drops redundant ones 
    def transform(self, X, y = None):
        #Check if needed 
        if self._avgmpg:
            #create new column
            X.loc[:,'avgmpg'] = (X["citympg"]+ X["highwaympg"])/2
            #drop redundant column
            X.drop(["citympg","highwaympg"], axis = 1, inplace=True )
     
    #Converting any infinity values in the dataset to Nan
            X = X.replace( [ np.inf, -np.inf ], np.nan )
    #returns a numpy array
            return X.values

In [6]:
#dfcars["CarBrand"] = dfcars['price'].apply(lambda x : "Cheap" if x < 10000 
                                                     #else ("Affordable" if 10000 <= x < 20000
                                                           #else ("Luxury")))

In [7]:
class CategoricalTransformer( BaseEstimator, TransformerMixin ):
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, CarBrand=True, symboling=True):
        self._CarBrand = CarBrand
        self._symboling= symboling
    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):
       #Depending on constructor argument break dates column into specified units
       #using the helper functions written above 
        X["CarBrand"] = X['price'].apply(lambda x : "Cheap" if x < 10000 
                                                    else ("Affordable" if 10000 <= x < 20000
                                                          else ("Luxury")))
       #Drop unusable column 
        X = X.drop('CarName', axis = 1 )
        
        
        X['symboling'].apply(lambda x : "No risk" if x >= -3 & x <= -1
                                                     else ("Low risk" if x>=0 and x <= 1
                                                           else ("High risk")))
        X = X.drop('symboling', axis = 1 )

In [8]:
#dfcars["symboling"] = dfcars['symboling'].apply(lambda x : "No risk" if x >= -3 & x <= -1
                                                    # else ("Low risk" if x>=0 and x <= 1
                                                          # else ("High risk")))

In [9]:
dfcars.isnull().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [10]:
dfcars.cylindernumber.unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [11]:
#create funcion to identify all numeric data
def numeric(data):
    numeric_cols = data.select_dtypes(np.number).columns.tolist()
    return data[numeric_cols].head(3)

print('Numeric features of data set are:')
print()
numeric_df= numeric(dfcars)
numeric_df.head(3)

Numeric features of data set are:



Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500.0


In [12]:
#create funcion to identify all categorical data
def categoric(data):
    categoric_cols = data.select_dtypes(exclude=np.number).columns.tolist()
    return data[categoric_cols].head(3)

print('Categoric features of data set are:')
print()
categoric_df= categoric(dfcars)
categoric_df.head(3)

Categoric features of data set are:



Unnamed: 0,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem
0,alfa-romero giulia,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero stelvio,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi


In [13]:
#Categrical features to pass down the categorical pipeline 
categorical_features = ['CarName','fueltype', 'aspiration', 'symboling', 'carbody',
                        'drivewheel', 'enginetype', 'cylindernumber','fuelsystem']

#Numerical features to pass down the numerical pipeline 
numerical_features = ['wheelbase', 'enginesize', 'curbweight' ,'carlength',
                      'carwidth', 'boreratio', 'horsepower', "citympg", 'highwaympg']

#Defining the steps in the categorical pipeline 
categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector(categorical_features) ),
                                          
                                          ('cat_transform', CategoricalTransformer()),
                                  
                                  ( 'one_hot_encoder', OneHotEncoder(handle_unknown='ignore') ) ] )
    
#Defining the steps in the numerical pipeline     
numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector(numerical_features) ),
                                       
                                        ('num_trnasform', NumericalTransformer()),
                                  
                                  ( 'MM_scaler', MinMaxScaler() ) ] )

#Combining numerical and categorical piepline into one full big pipeline horizontally 
#using FeatureUnion
full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
                                                  ( 'numerical_pipeline', numerical_pipeline ) ] )


In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

#Leave it as a dataframe becuase our pipeline is called on a 
#pandas dataframe to extract the appropriate columns, remember?
X = dfcars.drop('price', axis = 1)
#You can covert the target variable to numpy 
y = dfcars['price'].values 

X_train, X_test, y_train, y_test = train_test_split( X, y , test_size = 0.2 , random_state = 42 )

#The full pipeline as a step in another pipeline with an estimator as the final step
full_pipeline_m = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),
                                  
                                  ( 'model', RandomForestRegressor() ) ] )

#Can call fit on it just like any other pipeline
full_pipeline_m.fit( X_train, y_train )

#Can predict with it like any other pipeline
y_pred = full_pipeline_m.predict( X_test ) 

ValueError: Expected 2D array, got scalar array instead:
array=None.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
y_pred

In [None]:
# At the end let see how good is our prediction
df_pip = pd.DataFrame({'Actual': y_test, 'Predicted_norm': y_pred})
df_pip.head(5)

In [None]:
# Import metrics  mean_squere_error and mean_absolute_error

from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
MSE = mean_squared_error(y_test, y_pred)
print(f"Mean_squared_error via pipeline: {MSE} \n")
RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(f'RMSE via pipeline: {RMSE} \n')
MAE=mean_absolute_error(y_test,y_pred)              
print(f'Mean_absolute_error via pipeline: {MAE}\n')
    

In [None]:
dfcars.head()