In [None]:
# ----------------------------------------
# filename feature_engineering_pipeline_train_inference
# author Partha Deka
# revision 1.0
# revision history 1.0 - initial script

# Import all dependencies

# Time Series Forecasting: `End-to-End Pipeline Code`
- The initial code for time series model training and prediction is based on the code associated with `Chapter 10`

In [5]:
# Custom transformer for creating lagged features
class LagFeatureTransformer(BaseEstimator, TransformerMixin):
    """Append lagged copies of one column as new feature columns.

    For every ``k`` in ``1..lags`` a column named ``lag_k`` is added that
    holds the source column shifted down by ``k`` rows. Rows that contain a
    missing value in any column afterwards are dropped; this includes the
    leading rows, which have no complete lag history.

    Parameters
    ----------
    lags : int, default=3
        Number of lag columns to create.
    column : str, default='Value'
        Name of the column whose past values are used as features.
    """

    def __init__(self, lags=3, column='Value'):
        self.lags = lags
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame with the ``lag_*`` columns added.

        ``X`` is copied first, so the caller's object is left untouched.
        ``y`` is accepted for API compatibility and ignored.
        """
        df = pd.DataFrame(X.copy())
        for lag in range(1, self.lags + 1):
            df[f'lag_{lag}'] = df[self.column].shift(lag)
        # dropna() removes rows with a NaN in *any* column, not only the
        # lag columns created above.
        return df.dropna()

# Create a synthetic time series dataset (a random walk).
# The generator is seeded so that re-running the notebook on a fresh kernel
# reproduces the same data and therefore the same metrics.
rng = np.random.default_rng(42)
date_range = pd.date_range(start='1/1/2020', periods=100, freq='D')
data = pd.DataFrame(
    {'Date': date_range, 'Value': rng.standard_normal(100).cumsum()}
).set_index('Date')

# Create lag features and corresponding target
lagged_data = LagFeatureTransformer(lags=3).transform(data)
X = lagged_data.drop(columns=['Value'])  # Features: the lag_1..lag_3 columns
y = lagged_data['Value']  # Target: the current (unshifted) value

# Train-test split. shuffle=False keeps the rows in chronological order, so the
# test set is the last 20% of the series. Nothing is shuffled, so no
# random_state is needed here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Define the pipeline with scaling and XGBoost model
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),                    # Feature scaling
    ('model', XGBRegressor(objective='reg:squarederror'))  # XGBoost for regression
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Display the predicted vs actual values
result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}, index=X_test.index)
print(result.head())


Mean Squared Error: 1.6107611013565681
              Actual  Predicted
Date                           
2020-03-21  2.398436   1.065399
2020-03-22  0.264023   2.270394
2020-03-23 -0.002856   0.220992
2020-03-24 -0.639781   1.801060
2020-03-25 -0.313801  -0.360834


# Housing Prediction with all Feature Engineering: `End-to-End Pipeline Code`
- The initial code with feature engineering steps is based on `Chapter 7`

In [7]:


# Custom transformer for handling missing values in categorical variables
class MissingValueImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns using two strategies.

    Parameters
    ----------
    for_missing_string : list of str
        Columns whose missing values are replaced with the literal string
        ``'Missing'``.
    for_frequent_category : list of str
        Columns whose missing values are replaced with the most frequent
        value observed for that column during ``fit``.

    Attributes
    ----------
    frequent_categories : dict
        Maps each column in ``for_frequent_category`` to the value learned
        by the most recent ``fit`` call. Empty until ``fit`` has run.
    """

    def __init__(self, for_missing_string, for_frequent_category):
        self.for_missing_string = for_missing_string
        self.for_frequent_category = for_frequent_category
        self.frequent_categories = {}

    def fit(self, X, y=None):
        """Learn the most frequent value of each ``for_frequent_category`` column.

        Returns ``self``. ``y`` is accepted for API compatibility and ignored.
        """
        # Rebuild the mapping from scratch so that refitting never keeps
        # entries learned from an earlier dataset.
        self.frequent_categories = {
            var: X[var].mode()[0] for var in self.for_frequent_category
        }
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the configured columns imputed."""
        # Work on a copy: the caller's frame (often a slice produced by
        # train_test_split) must not be modified as a side effect.
        X = X.copy()
        # Replace missing values with "Missing" for specific variables
        if len(self.for_missing_string) > 0:
            X[self.for_missing_string] = X[self.for_missing_string].fillna('Missing')
        # Replace missing values with the most frequent category for specific variables
        for var in self.for_frequent_category:
            X[var] = X[var].fillna(self.frequent_categories[var])
        return X

# Load the dataset (Assuming the dataset is available)
data = pd.read_csv("house_pricing.csv")

# Define feature columns
numeric_features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'GrLivArea']
categorical_features = ['Neighborhood', 'HouseStyle', 'GarageType', 'SaleCondition']

# Handle missing categorical variables.
# Columns with more than MISSING_THRESHOLD missing values get a "Missing"
# label; all other columns with missing values get the most frequent category.
# The second comparison is inclusive (<=) so that a column sitting exactly on
# the threshold is still assigned to one of the two strategies.
# NOTE(review): the fractions are computed on the full dataset, before the
# train/test split - consider computing them on the training rows only.
MISSING_THRESHOLD = 0.1
missing_fraction = data[categorical_features].isnull().mean()
cat_vars_with_na = [var for var in categorical_features if missing_fraction[var] > 0]
for_missing_string = [var for var in cat_vars_with_na if missing_fraction[var] > MISSING_THRESHOLD]
for_frequent_category = [var for var in cat_vars_with_na if missing_fraction[var] <= MISSING_THRESHOLD]

# Target column
target = 'SalePrice'

# Split the data into features and target
X = data.drop(columns=[target, 'Id'])
y = data[target]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
    ('scaler', StandardScaler())                 # Scale features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categorical values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode categorical features
])

# Combine both numeric and categorical preprocessing.
# Only the columns listed in numeric_features and categorical_features are
# passed to the transformers.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the complete pipeline
pipeline = Pipeline(steps=[
    ('missing_imputer', MissingValueImputer(for_missing_string, for_frequent_category)),  # Custom missing value handling
    ('preprocessor', preprocessor),               # Preprocessing step
    ('model', XGBRegressor(objective='reg:squarederror'))  # XGBoost regression model
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Display the predicted vs actual values
result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}, index=X_test.index)
print(result.head())


Mean Squared Error: 1130202899.0655065
      Actual      Predicted
892   154500  142317.156250
1105  325000  364155.031250
413   115000   93371.398438
522   159000  135780.375000
1036  315500  266444.375000
