# In [None]:
# DEFINE WORKFLOW AND FUNCTIONS FOR PIPELINE USE

# -1 - TIME ATTRIBUTES
# Step -1; Engineer time attributes
def time_attr(df):
    """
    Extracts time-based attributes from a DataFrame containing a 'date' column.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame containing a 'date' column (datetime-like, or anything
        ``pd.to_datetime`` can parse). The caller's frame is not modified.

    Returns:
    --------
    pandas.DataFrame
        Copy of ``df`` with a fresh 0..n-1 index, 'date' converted to datetime, and
        the additional columns 'year', 'month', 'day', 'week_of_year' and 'year_week'.

    Notes:
    ------
    'week_of_year' is the ISO-8601 week number, whereas 'year_week' is formatted with
    ``'%Y-%W'`` (Monday-based weeks; days before the first Monday fall in week 00),
    so the two can disagree around the turn of the year.

    The previous index is discarded instead of being inserted as a column: a bare
    ``reset_index()`` would leak it into the feature matrix as 'index' and fail with
    "cannot insert level_0, already exists" once several such steps are chained.

    Example:
    --------
    # Call the function to extract time-based attributes from the DataFrame df
    df_with_time_attrs = time_attr(df)
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # Ensure the date variable is in a proper datetime format.
    dates = pd.to_datetime(df['date'])
    df['date'] = dates

    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df['day'] = dates.dt.day

    # ISO-8601 week number
    df['week_of_year'] = dates.dt.isocalendar().week

    # year-week label (see Notes: '%W' is not the ISO week)
    df['year_week'] = dates.dt.strftime('%Y-%W')

    return df.reset_index(drop=True)

# Wrap time_attr in a FunctionTransformer; used as the 'time_attr' pipeline step below.
time_attr_transformer = FunctionTransformer(time_attr)

# 0 - IMPUTE MISSING DATA USING kNN IMPUTER

# Step 0: Define your custom function
def data_imputer(df):
    """
    Drops sparse columns and imputes the remaining missing values with KNNImputer.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame, possibly containing missing values. The caller's frame is
        not modified.

    Returns:
    --------
    pandas.DataFrame
        New DataFrame with a fresh 0..n-1 index in which
        - columns with more than 10% missing values have been dropped, and
        - columns with more than 0% and at most 10% missing values have been imputed.

    Notes:
    ------
    The imputer (``n_neighbors=5``) is fitted only on the columns being imputed, and
    is re-fitted on every call with whatever data is passed in.
    A column with exactly 10% missing values is imputed (previously it was neither
    dropped nor imputed). When no column needs imputing the imputer is skipped, as
    fitting it on an empty column selection would raise.
    NOTE(review): KNNImputer presumably requires the imputed columns to be numeric;
    non-numeric columns with missing values are not handled here - confirm upstream.

    Example:
    --------
    # Import required libraries
    from sklearn.impute import KNNImputer

    # Call the function to impute missing values in the DataFrame df
    df_imputed = data_imputer(df)
    """
    # Percentage of missing values per column
    missing_pct = df.isna().sum() / df.shape[0] * 100

    too_sparse = [col for col, pct in missing_pct.items() if pct > 10]
    to_impute = [col for col, pct in missing_pct.items() if 0 < pct <= 10]

    # drop() returns a new frame, so the caller's DataFrame is never mutated.
    df = df.drop(columns=too_sparse)

    if to_impute:
        imputer = KNNImputer(n_neighbors=5)
        df[to_impute] = imputer.fit_transform(df[to_impute])

    # Discard the old index rather than inserting it as an extra 'index' column.
    return df.reset_index(drop=True)

# Wrap data_imputer in a FunctionTransformer; used as the 'data_imputer' pipeline step below.
data_imputer_transformer = FunctionTransformer(data_imputer)

# 1 - FORMATTING THE CATEGORICAL VARIABLES

# Step 1: Define your custom function
def cat_format(df):
    """
    Formats categorical variables in the given DataFrame.

    Casts 'competition_distance' to int and replaces the single-letter codes in
    'assortment' and 'state_holiday' with descriptive labels.

    Args:
        df (pandas.DataFrame): The input DataFrame containing the columns
            'competition_distance', 'assortment' and 'state_holiday'. It is not
            modified. 'competition_distance' must not contain missing values,
            otherwise the integer cast raises.

    Returns:
        pandas.DataFrame: A copy of the input with a fresh 0..n-1 index and the
        formatted columns. The previous index is discarded (not inserted as an
        'index' column).

    Example:
        df = cat_format(df)

    """
    # Any code not listed falls back to the default label of its column.
    assortment_labels = {'a': 'basic', 'b': 'extra'}
    holiday_labels = {'a': 'public_holiday', 'b': 'easter_holiday', 'c': 'christmas'}

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # Fix data type
    df['competition_distance'] = df['competition_distance'].astype(int)

    # NOTE: the derived features 'competition_since' / 'competition_time_month' and
    # 'promo_since' / 'promo_time_week' are currently not computed by this step.

    # assortment
    df['assortment'] = df['assortment'].apply(
        lambda code: assortment_labels.get(code, 'extended')
    )

    # state holiday
    df['state_holiday'] = df['state_holiday'].apply(
        lambda code: holiday_labels.get(code, 'regular_day')
    )

    return df.reset_index(drop=True)



# Wrap cat_format in a FunctionTransformer; used as the 'cat_format' pipeline step below.
cat_format_transformer = FunctionTransformer(cat_format)


# 2 - RESCALING
def rescaling_vars(df):
    """
    Rescales the 'year' column of the given DataFrame with MinMaxScaler.

    Args:
        df (pandas.DataFrame): The input DataFrame containing a numerical 'year'
            column. It is not modified.

    Returns:
        pandas.DataFrame: A copy of the input with a fresh 0..n-1 index and the
        rescaled 'year' column. The previous index is discarded (not inserted as an
        'index' column).

    Notes:
        Only 'year' is rescaled at the moment; 'competition_distance',
        'competition_time_month' and 'promo_time_week' are left as they are.
        NOTE(review): the scaler is re-fitted on every call, so each DataFrame is
        scaled by its own min/max (train and test are not scaled consistently).

    Example:
        df = rescaling_vars(df)
    """
    mms = MinMaxScaler()

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # fit_transform returns an (n, 1) array; flatten it to a plain column.
    df['year'] = mms.fit_transform(df[['year']].values).ravel()

    return df.reset_index(drop=True)

# Wrap rescaling_vars in a FunctionTransformer; used as the 'rescaling_vars' pipeline step below.
rescaling_vars_transformer = FunctionTransformer(rescaling_vars)

# 3 - TRANSFORMATION:ENCODING

def encode_vars(df):
    """
    Encodes categorical variables in the given DataFrame using different techniques.

    - 'state_holiday' is one-hot encoded into 'state_holiday_<value>' columns (the
      original column is removed).
    - 'store_type' is replaced by integer labels.
    - 'assortment' is kept and its integer labels are stored in 'assortment_encoded'.

    Args:
        df (pandas.DataFrame): The input DataFrame containing the columns
            'state_holiday', 'store_type' and 'assortment'. It is not modified.

    Returns:
        pandas.DataFrame: A new DataFrame with a fresh 0..n-1 index and the encoded
        variables. The previous index is discarded (not inserted as an 'index'
        column).

    Notes:
        NOTE(review): LabelEncoder assigns codes in sorted (alphabetical) order of the
        values, so 'assortment_encoded' is not an ordinal basic < extra < extended
        scale. The one-hot columns depend on the values present in ``df``, so two
        frames with different holiday values get different columns.

    Example:
        df = encode_vars(df)
    """
    le = LabelEncoder()

    # state_holiday - One Hot Encoding (get_dummies returns a new DataFrame)
    df = pd.get_dummies(df, prefix=['state_holiday'], columns=['state_holiday'])

    # store_type - Label Encoding
    df['store_type'] = le.fit_transform(df['store_type'])

    # assortment - Label Encoding (the encoder is re-fitted for this column)
    df['assortment_encoded'] = le.fit_transform(df['assortment'])

    return df.reset_index(drop=True)

# Wrap encode_vars in a FunctionTransformer; used as the 'encode_vars' pipeline step below.
encode_vars_transformer = FunctionTransformer(encode_vars)

# 4 - LOG-TRANSFORM RESPONSE VARIABLE
def log_trf_var(df):
    """
    Applies the transformation log(1 + x) (``np.log1p``) to the 'sales' column.

    Args:
        df (pandas.DataFrame): The input DataFrame containing the 'sales' column.
            It is not modified.

    Returns:
        pandas.DataFrame: A copy of the input with a fresh 0..n-1 index and the
        transformed 'sales' column. The previous index is discarded (not inserted
        as an 'index' column).

    Example:
        df = log_trf_var(df)
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()
    df['sales'] = np.log1p(df['sales'])

    return df.reset_index(drop=True)

# Wrap log_trf_var in a FunctionTransformer; used as the 'log_trf_var' pipeline step below.
log_trf_var_transformer = FunctionTransformer(log_trf_var)

# 5 - NATURE TRANSFORMATION
def nature_transf_vars(df):
    """
    Apply trigonometric transformations to date-related columns in a DataFrame.

    Args:
        df (pandas.DataFrame): Input DataFrame containing columns 'day_of_week',
                              'month', 'day', and 'week_of_year'. If a 'date' column
                              is present it is used to look up how many ISO weeks
                              (52 or 53) the ISO year of each row has. The input is
                              not modified.

    Returns:
        pandas.DataFrame: Copy of the input with a fresh 0..n-1 index (the previous
        index is discarded, not inserted as a column) and additional columns:
            - 'day_of_week_sin' and 'day_of_week_cos': Sine and cosine of day of week.
            - 'month_sin' and 'month_cos': Sine and cosine of month.
            - 'day_sin' and 'day_cos': Sine and cosine of day.
            - 'week_of_year_sin' and 'week_of_year_cos': Sine and cosine of week of year.

    Notes:
        Periods used: 7 (day of week), 12 (month), 30 (day of month) and, for the
        week of year, the number of ISO weeks of the row's ISO year when 'date' is
        available, otherwise 52. Previously the week *number* was passed to a
        leap-year test, which made the period flip between 52 and 53 depending on
        whether the week number was divisible by 4.
    """
    def cyclical(values, period):
        # Map a periodic quantity onto the unit circle; returns (sin, cos).
        angle = values.astype(float) * (2. * np.pi / period)
        return np.sin(angle), np.cos(angle)

    if 'date' in df.columns:
        iso_year = pd.to_datetime(df['date']).dt.isocalendar().year
        # December 28th always lies in the last ISO week of its year, so its week
        # number equals the number of ISO weeks in that year.
        weeks_by_year = {
            int(year): pd.Timestamp(year=int(year), month=12, day=28).isocalendar()[1]
            for year in iso_year.dropna().unique()
        }
        week_period = pd.Series(
            [weeks_by_year.get(year, np.nan) for year in iso_year],
            index=df.index,
            dtype=float,
        )
    else:
        week_period = 52.0

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # day of week
    df['day_of_week_sin'], df['day_of_week_cos'] = cyclical(df['day_of_week'], 7)

    # month
    df['month_sin'], df['month_cos'] = cyclical(df['month'], 12)

    # day
    df['day_sin'], df['day_cos'] = cyclical(df['day'], 30)

    # week of year
    df['week_of_year_sin'], df['week_of_year_cos'] = cyclical(df['week_of_year'], week_period)

    return df.reset_index(drop=True)
# Wrap nature_transf_vars in a FunctionTransformer; used as the 'nature_transf_vars' pipeline step below.
nature_transf_vars_transformer = FunctionTransformer(nature_transf_vars)


# Custom transformer for Boruta feature selection
class BorutaFeatureSelector:
    """
    Pipeline-compatible wrapper around BorutaPy feature selection.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``BorutaPy(estimator=...)``.

    Attributes
    ----------
    estimator : object
        The estimator exactly as passed in (returned by ``get_params``).
    boruta : BorutaPy
        The wrapped selector; ``boruta.support_`` is used by ``transform``.

    Notes
    -----
    ``get_params`` / ``set_params`` are implemented so that the selector can be
    cloned (cloning relies on ``get_params`` and on re-creating the object from the
    returned constructor arguments), which model-selection helpers such as
    ``cross_val_score`` do for every split.
    Inputs are converted with ``np.asarray`` so pandas objects are accepted as well
    as arrays; ``transform`` always returns a NumPy array.
    """

    def __init__(self, estimator):
        # Keep the constructor argument unchanged so clones can be rebuilt from it.
        self.estimator = estimator
        self.boruta = BorutaPy(estimator=estimator)

    def get_params(self, deep=True):
        """Return the constructor parameters (``deep`` also adds ``estimator__*``)."""
        params = {'estimator': self.estimator}
        if deep and hasattr(self.estimator, 'get_params'):
            nested = self.estimator.get_params(deep=True)
            params.update({f'estimator__{key}': value for key, value in nested.items()})
        return params

    def set_params(self, **params):
        """Set ``estimator`` (rebuilding the wrapped BorutaPy) and return ``self``.

        Raises:
            ValueError: If a parameter other than 'estimator' is given.
        """
        unknown = sorted(set(params) - {'estimator'})
        if unknown:
            raise ValueError(f"Invalid parameter(s) for BorutaFeatureSelector: {unknown}")
        if 'estimator' in params:
            self.estimator = params['estimator']
            self.boruta = BorutaPy(estimator=self.estimator)
        return self

    def fit(self, X, y):
        """Fit the wrapped BorutaPy selector on ``X`` / ``y`` and return ``self``."""
        self.boruta.fit(np.asarray(X), np.ravel(y))
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by ``boruta.support_`` as an array."""
        return np.asarray(X)[:, self.boruta.support_]

# Create the pipeline
pipeline = Pipeline([
    ('time_attr', time_attr_transformer),
    ('data_imputer', data_imputer_transformer),
    ('cat_format', cat_format_transformer),
    ('rescaling_vars', rescaling_vars_transformer),
    ('encode_vars', encode_vars_transformer),
    ('log_trf_var', log_trf_var_transformer),
    ('nature_transf_vars', nature_transf_vars_transformer),
    # Boruta is part of the pipeline, so it is re-fitted on every training split.
    ('boruta', BorutaFeatureSelector(estimator=RandomForestRegressor(n_jobs=-1))),
    ('model_lr', LinearRegression())
])

# Time series cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Perform cross-validation on the entire training data.
# cross_val_score works on clones, so `pipeline` itself is still unfitted afterwards.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=tscv, scoring='neg_mean_absolute_error')

# Mean cross-validation score (negative MAE: closer to zero is better)
mean_cv_score = cv_scores.mean()

# Report the error with its natural sign; this is an MAE, not an accuracy.
print(f"Mean Cross-Validation MAE: {-mean_cv_score:.4f}")

# Train the whole pipeline (preprocessing, feature selection and model) on the
# entire training data, so the model is fitted on the transformed features.
pipeline.fit(X_train, y_train)
final_model = pipeline.named_steps['model_lr']

# Transform validation/test data with every step except the final model
# (the final estimator has no transform of its own).
preprocessing = pipeline[:-1]
X_val_selected = preprocessing.transform(X_val)
X_test_selected = preprocessing.transform(X_test)

# Evaluate the final model on the test set (score() of a regressor is R^2)
test_accuracy = final_model.score(X_test_selected, y_test)
print(f"Test Set R^2: {test_accuracy:.4f}")
