# Training code

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle

In [2]:
# Custom transformer classes
class AddCustomAttributes_bool(BaseEstimator, TransformerMixin):
    """Replace selected boolean flags with pairwise conjunctions.

    Adds ``feature_01``, ``feature_09`` and ``feature_67`` (the element-wise
    ``&`` of the named source columns) and removes the five source columns
    ``feature_0``, ``feature_1``, ``feature_9``, ``feature_6`` and
    ``feature_7``. All other columns pass through unchanged.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer holds no state."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the combined flags in place of the sources.

        Parameters:
            X: DataFrame containing the columns ``feature_0``, ``feature_1``,
                ``feature_6``, ``feature_7`` and ``feature_9``.
            y: Ignored; accepted for API compatibility.

        Returns:
            A new DataFrame; the frame passed in is left untouched.
        """
        # Work on a copy: assigning columns into the argument would modify the
        # caller's frame (or raise chained-assignment warnings on a slice).
        X = X.copy()
        X['feature_01'] = X['feature_0'] & X['feature_1']
        X['feature_09'] = X['feature_0'] & X['feature_9']
        X['feature_67'] = X['feature_6'] & X['feature_7']
        return X.drop(
            ['feature_0', 'feature_1', 'feature_9', 'feature_6', 'feature_7'],
            axis=1,
        )
class AddCustomAttributes_num(BaseEstimator, TransformerMixin):
    """Derive extra numeric features from the ``year`` and ``odometer`` columns.

    Adds ``yearSquare``, ``carAgeCube``, ``odometerYear`` and
    ``odometerInvert`` and overwrites ``odometer`` for old vehicles. No column
    is removed.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer holds no state."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the derived numeric columns appended.

        Parameters:
            X: DataFrame containing the columns ``year`` and ``odometer``.
            y: Ignored; accepted for API compatibility.

        Returns:
            A new DataFrame; the frame passed in is left untouched.
        """
        # Work on a copy: assigning columns into the argument would modify the
        # caller's frame (or raise chained-assignment warnings on a slice).
        X = X.copy()
        X["yearSquare"] = X["year"] ** 2
        # 2020 is the reference year for the car's age
        # (presumably the year the data was collected -- TODO confirm).
        X["carAgeCube"] = (2020 - X["year"]) ** 3
        # Cars from 1982 or earlier get a fixed odometer reading of 370000
        # (presumably because their recorded mileage is unreliable -- confirm).
        X['odometer'] = np.where(X['year'] <= 1982, 370000, X['odometer'])
        X["odometerYear"] = X["odometer"] * X["year"]
        # A zero odometer would give inf here, which the imputer/scaler that
        # follow in the pipeline cannot process. Mask zeros to NaN so the
        # reciprocal is NaN and gets imputed like any other missing value.
        odometer = X["odometer"]
        X["odometerInvert"] = 1 / odometer.where(odometer != 0)
        return X
class AddCustomAttributes_obj(BaseEstimator, TransformerMixin):
    """Strip the ``engineFuel`` column from the frame it is given."""

    def fit(self, X, y=None):
        """Nothing to learn; hand back the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without its ``engineFuel`` column (``y`` is ignored)."""
        unwanted = ['engineFuel']
        trimmed = X.drop(columns=unwanted)
        return trimmed
    
# Train function
def train(X_train, y_train, regressor=None):
    """Build the preprocessing + regression pipeline and fit it.

    Columns are routed by dtype:

    * numeric columns -> ``AddCustomAttributes_num`` -> median imputation ->
      standard scaling;
    * boolean columns -> ``AddCustomAttributes_bool``;
    * object columns -> ``AddCustomAttributes_obj`` -> one-hot encoding
      (categories unseen during fitting are ignored at prediction time).

    Parameters:
        X_train: DataFrame of training features.
        y_train: Training targets, passed straight to ``Pipeline.fit``.
        regressor: Optional final estimator, e.g. ``Ridge(alpha=0.50)``.
            Defaults to a fresh ``LinearRegression()`` when omitted.

    Returns:
        The fitted pipeline.
    """
    if regressor is None:
        regressor = LinearRegression()
    num_pipeline = make_pipeline(
        AddCustomAttributes_num(),
        SimpleImputer(strategy="median"),
        StandardScaler(),
    )
    obj_pipeline = make_pipeline(
        AddCustomAttributes_obj(),
        OneHotEncoder(handle_unknown="ignore"),
    )
    ct = make_column_transformer(
        (num_pipeline, make_column_selector(dtype_include=np.number)),
        (AddCustomAttributes_bool(), make_column_selector(dtype_include=bool)),
        (obj_pipeline, make_column_selector(dtype_include=object)))
    clf = make_pipeline(ct, regressor)
    clf.fit(X_train, y_train)
    return clf
    
# Load the training features and targets; the "id" column becomes the index
# of each frame.
X = pd.read_csv("./X_train.csv", index_col="id")
y = pd.read_csv("./Y_train.csv", index_col="id")

# Fit the full pipeline on all of the loaded training data.
model_final = train(X, y)

# Serialize the fitted pipeline to disk with pickle.
# NOTE: pickle stores classes by reference, so the AddCustomAttributes_*
# classes must be importable wherever this file is loaded again.
model_final_filename = "model_final.pkl"
with open(model_final_filename, 'wb') as file:
    pickle.dump(model_final, file)