In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from category_encoders import MEstimateEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [17]:
autos = pd.read_csv("auto_clean.csv")

In [18]:

# Data Cleaning #
# We will manipulate the missing data :
missing = autos.isnull().sum()
total_missing = missing.sum()
# Calculating the percentage of missing data :
total_cells = autos.shape[0]*autos.shape[1]
missing_percentage = (total_missing/total_cells)*100
print("The missing percentage of the dataset is {:.3}%".format(missing_percentage))
# As the missing percentage is very low so we will handle them by removing the columns that have the missing values
autos_plus = autos.dropna(axis=1)
# Two columns have been dropped


The missing percentage of the dataset is 0.0858%


In [19]:
X = autos_plus.copy()
y = X.pop("price")
# Adding two new important columns :
X["torque"] = (X["horsepower"] * 5252) / X["peak-rpm"]
X["speed"] = (X["torque"] * 1.35) / (X["curb-weight"] / 1000)

In [20]:
X_train_full, X_val_full, y_train, y_val = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [21]:
low_cardinality_columns = [col for col in X_train_full.columns
                           if X_train_full[col].nunique() <= 10 and X_train_full[col].dtype == 'object']
numerical_columns = [col for col in X_train_full.columns if X_train_full[col].dtype in ('float64', 'int64')]
cols_needed = low_cardinality_columns + numerical_columns
X_train = X_train_full[cols_needed].copy()
X_val = X_val_full[cols_needed].copy()




In [22]:

numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num',numerical_transformer,numerical_columns),
    ('cat',categorical_transformer,low_cardinality_columns)
])
mypipeline = Pipeline(steps=[
    ('preprocessing',preprocessor),
    ('modeling',RandomForestRegressor(n_estimators=100,random_state=0))
])



In [23]:

def baseline_scoring(X_train,y_train):
    mypipeline.fit(X_train, y_train)
    return mean_absolute_error(y_val,mypipeline.predict(X_val))

In [24]:

def cross_val(X,y):
    scoring = -1*cross_val_score(mypipeline,X,y,
                                 cv=100,
                                 scoring='neg_mean_absolute_error')
    return scoring.mean()



In [None]:
# Encoding split
X_encode = autos_plus.copy()
y_encode = X_encode.pop('price')
X_encode["torque"] = (X_encode["horsepower"] * 5252) / X_encode["peak-rpm"]
X_encode["speed"] = (X_encode["torque"] * 1.35) / (X_encode["curb-weight"] / 1000)
# Create an uninformative feature
X_encode["Count"] = range(len(X))
X_encode["Count"][1] = 0  # actually need one duplicate value to circumvent error-checking in MEstimateEncoder
# Create the encoder instance. Choose m which is the smoothing factor to control noise.
encoder = MEstimateEncoder(cols=["Count"], m=0)
# fit and transform on the same dataset
X_encode = encoder.fit_transform(X_encode, y_encode)

In [26]:
print("the baseline score will be {}"
      .format(baseline_scoring(X_train,y_train)))
print("the score after the cross validation process will be {}"
      .format(cross_val(X,y)))
print("The score after applying the target encoding with baseline scoring will be {}"
      .format(baseline_scoring(X_encode, y_encode)))
print("The score after applying the target encoding and cross validation will be {}"
      .format(cross_val(X_encode, y_encode)))

the baseline score will be 1551.9782520325207
the score after the cross validation process will be 1613.344391666667
The score after applying the target encoding with baseline scoring will be 557.2737804878049
The score after applying the target encoding and cross validation will be 1613.344391666667
