In [3]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import gc
from typing import Dict, List, Tuple

class FeatureEngineer:
    """
    Advanced feature engineering for Santander product recommendation
    Optimized for memory efficiency
    """
    
    def __init__(self, product_cols: List[str]):
        self.product_cols = product_cols
        self.categorical_cols = ['segmento', 'ind_actividad_cliente', 'canal_entrada', 
                                 'cod_prov', 'indrel_1mes', 'tiprel_1mes']
        
    def create_lag_features(self, df: pd.DataFrame, n_lags: int = 5) -> pd.DataFrame:
        """
        Create lag features for products and categorical variables
        Memory efficient implementation using chunks
        """
        print("Creating lag features...")
        
        # Sort by customer and date
        df = df.sort_values(['ncodpers', 'fecha_dato'])
        
        # Initialize feature dict
        lag_features = {}
        
        # Process in chunks to save memory
        chunk_size = 100000
        n_chunks = len(df) // chunk_size + 1
        
        for chunk_idx in range(n_chunks):
            start_idx = chunk_idx * chunk_size
            end_idx = min((chunk_idx + 1) * chunk_size, len(df))
            
            chunk = df.iloc[start_idx:end_idx]
            
            # Product lags
            for lag in range(1, n_lags + 1):
                for col in self.product_cols[:10]:  # Start with top 10 products
                    lag_col = f'{col}_lag_{lag}'
                    chunk[lag_col] = chunk.groupby('ncodpers')[col].shift(lag)
                    
            # Categorical lags (for changes)
            for lag in range(1, 3):  # Only 2 lags for categorical
                for col in self.categorical_cols:
                    if col in df.columns:
                        lag_col = f'{col}_lag_{lag}'
                        chunk[lag_col] = chunk.groupby('ncodpers')[col].shift(lag)
            
            # Store results
            if chunk_idx == 0:
                result_df = chunk
            else:
                result_df = pd.concat([result_df, chunk], axis=0)
            
            if chunk_idx % 10 == 0:
                print(f"Processed {chunk_idx}/{n_chunks} chunks")
                gc.collect()
        
        return result_df
    
    def create_time_since_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Create 'time since' features for product ownership and changes
        """
        print("Creating time since features...")
        
        # Convert fecha_dato to datetime if not already
        df['fecha_dato'] = pd.to_datetime(df['fecha_dato'])
        
        # Sort by customer and date
        df = df.sort_values(['ncodpers', 'fecha_dato'])
        
        # For each product, calculate months since first/last ownership
        for product in self.product_cols[:10]:  # Top 10 products
            # Time since first ownership
            df[f'{product}_months_since_first'] = df.groupby('ncodpers').apply(
                lambda x: self._calculate_months_since_first(x, product)
            ).reset_index(level=0, drop=True)
            
            # Time since last purchase (if changed from 0 to 1)
            df[f'{product}_months_since_purchase'] = df.groupby('ncodpers').apply(
                lambda x: self._calculate_months_since_purchase(x, product)
            ).reset_index(level=0, drop=True)
        
        # Time since any change in categorical variables
        for col in self.categorical_cols:
            if col in df.columns:
                df[f'{col}_months_since_change'] = df.groupby('ncodpers').apply(
                    lambda x: self._calculate_months_since_change(x, col)
                ).reset_index(level=0, drop=True)
        
        return df
    
    def create_aggregation_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Create aggregated features by customer segments
        """
        print("Creating aggregation features...")
        
        # Product ownership rates by segment combinations
        segment_cols = ['segmento', 'cod_prov', 'canal_entrada']
        
        for seg_col in segment_cols:
            if seg_col in df.columns:
                # Calculate mean product ownership by segment
                for product in self.product_cols[:5]:  # Top 5 products
                    agg_col = f'{product}_mean_by_{seg_col}'
                    segment_means = df.groupby(seg_col)[product].mean()
                    df[agg_col] = df[seg_col].map(segment_means)
        
        # Customer-level aggregations
        customer_aggs = df.groupby('ncodpers').agg({
            **{col: ['mean', 'sum', 'std'] for col in self.product_cols[:5]},
            'antiguedad': ['mean', 'max'],
            'age': ['mean']
        })
        
        # Flatten column names
        customer_aggs.columns = ['_'.join(col).strip() for col in customer_aggs.columns.values]
        
        # Merge back
        df = df.merge(customer_aggs, on='ncodpers', how='left', suffixes=('', '_customer'))
        
        return df
    
    def create_trend_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Create trend and momentum features
        """
        print("Creating trend features...")
        
        # Sort by customer and date
        df = df.sort_values(['ncodpers', 'fecha_dato'])
        
        # Product count trends
        df['total_products'] = df[self.product_cols].sum(axis=1)
        
        # Rolling averages
        for window in [3, 6]:
            df[f'total_products_ma_{window}'] = df.groupby('ncodpers')['total_products'].transform(
                lambda x: x.rolling(window, min_periods=1).mean()
            )
        
        # Product momentum (change in last 3 months)
        df['product_momentum'] = df.groupby('ncodpers')['total_products'].diff(3)
        
        # Seasonal features
        df['month'] = df['fecha_dato'].dt.month
        df['quarter'] = df['fecha_dato'].dt.quarter
        df['is_year_end'] = (df['month'].isin([11, 12])).astype(int)
        df['is_quarter_end'] = (df['month'].isin([3, 6, 9, 12])).astype(int)
        
        return df
    
    def create_target_encoding(self, df: pd.DataFrame, target_col: str = None) -> pd.DataFrame:
        """
        Create target encoding features for categorical variables
        Memory efficient implementation
        """
        print("Creating target encoding features...")
        
        if target_col is None:
            # Create a proxy target: whether customer added any product
            df['added_product'] = 0
            for product in self.product_cols:
                df['added_product'] |= (df[product] > df[product].shift(1)).astype(int)
        else:
            df['added_product'] = df[target_col]
        
        # Target encode categorical variables
        for cat_col in self.categorical_cols:
            if cat_col in df.columns:
                # Calculate encoding on training data
                encoding = df.groupby(cat_col)['added_product'].mean()
                df[f'{cat_col}_target_enc'] = df[cat_col].map(encoding)
                
                # Add noise to prevent overfitting
                noise = np.random.normal(0, 0.01, len(df))
                df[f'{cat_col}_target_enc'] += noise
        
        return df
    
    # Helper methods
    def _calculate_months_since_first(self, group, product):
        """Calculate months since first ownership of product"""
        first_owned = group[group[product] == 1]['fecha_dato'].min()
        if pd.isna(first_owned):
            return [999] * len(group)  # Never owned
        return [(row['fecha_dato'] - first_owned).days // 30 for _, row in group.iterrows()]
    
    def _calculate_months_since_purchase(self, group, product):
        """Calculate months since product was purchased (0->1 transition)"""
        purchases = group[(group[product] == 1) & (group[product].shift(1) == 0)]['fecha_dato']
        if len(purchases) == 0:
            return [999] * len(group)
        last_purchase = purchases.max()
        return [(row['fecha_dato'] - last_purchase).days // 30 for _, row in group.iterrows()]
    
    def _calculate_months_since_change(self, group, col):
        """Calculate months since last change in categorical variable"""
        changes = group[group[col] != group[col].shift(1)]['fecha_dato']
        if len(changes) <= 1:  # No changes except first occurrence
            return [999] * len(group)
        last_change = changes.iloc[-1]
        return [(row['fecha_dato'] - last_change).days // 30 for _, row in group.iterrows()]
    
    def engineer_all_features(self, df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
        """
        Apply all feature engineering steps
        """
        print(f"Starting feature engineering on {len(df)} rows...")
        initial_memory = df.memory_usage(deep=True).sum() / 1024**2
        print(f"Initial memory usage: {initial_memory:.2f} MB")
        
        # 1. Lag features
        df = self.create_lag_features(df, n_lags=3)
        gc.collect()
        
        # 2. Time since features
        df = self.create_time_since_features(df)
        gc.collect()
        
        # 3. Aggregation features
        df = self.create_aggregation_features(df)
        gc.collect()
        
        # 4. Trend features
        df = self.create_trend_features(df)
        gc.collect()
        
        # 5. Target encoding (only on train)
        if is_train:
            df = self.create_target_encoding(df)
            gc.collect()
        
        # Optimize dtypes to save memory
        df = self.optimize_dtypes(df)
        
        final_memory = df.memory_usage(deep=True).sum() / 1024**2
        print(f"Final memory usage: {final_memory:.2f} MB")
        print(f"Total features: {len(df.columns)}")
        
        return df
    
    def optimize_dtypes(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Optimize data types to reduce memory usage
        """
        for col in df.columns:
            col_type = df[col].dtype
            
            # Skip object and datetime columns
            if col_type == 'object' or pd.api.types.is_datetime64_any_dtype(df[col]):
                continue
            
            c_min = df[col].min()
            c_max = df[col].max()
            
            # Skip if NaN values make min/max invalid
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            elif str(col_type)[:5] == 'float':
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
        
        return df

In [None]:
# Load the cleaned data
df_train = pd.read_csv('/Users/dennis_m_jose/Documents/GitHub/Santander-RecSys-Dennis/data/cleaned_santander_data.csv')
product_cols = [col for col in df_train.columns if col.endswith('ult1')]

# Initialize feature engineer
fe = FeatureEngineer(product_cols)

# Testing on small sample first to avoid long runtimes
sample_df = df_train.head(50000).copy()
featured_sample = fe.engineer_all_features(sample_df, is_train=True)

print(f"Original columns: {len(sample_df.columns)}")
print(f"Featured columns: {len(featured_sample.columns)}")
print(f"New features created: {len(featured_sample.columns) - len(sample_df.columns)}")

#few of the new features
new_features = [col for col in featured_sample.columns if col not in sample_df.columns]
print(f"\nSample of new features created:")
print(new_features[:10])

  df_train = pd.read_csv('/Users/dennis_m_jose/Documents/GitHub/Santander-RecSys-Dennis/data/cleaned_santander_data.csv')


Starting feature engineering on 50000 rows...
Initial memory usage: 49.66 MB
Creating lag features...
Processed 0/1 chunks


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk[lag_col] = chunk.groupby('ncodpers')[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk[lag_col] = chunk.groupby('ncodpers')[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk[lag_col] = chunk.groupby('ncodpers')[col].shift(lag)
A value is trying to be set 

Creating time since features...
Creating aggregation features...
Creating trend features...
Creating target encoding features...
Final memory usage: 138.47 MB
Total features: 162
Original columns: 46
Featured columns: 162
New features created: 116

Sample of new features created:
['ind_ahor_fin_ult1_lag_1', 'ind_aval_fin_ult1_lag_1', 'ind_cco_fin_ult1_lag_1', 'ind_cder_fin_ult1_lag_1', 'ind_cno_fin_ult1_lag_1', 'ind_ctju_fin_ult1_lag_1', 'ind_ctma_fin_ult1_lag_1', 'ind_ctop_fin_ult1_lag_1', 'ind_ctpp_fin_ult1_lag_1', 'ind_deco_fin_ult1_lag_1']


In [6]:
new_features

['ind_ahor_fin_ult1_lag_1',
 'ind_aval_fin_ult1_lag_1',
 'ind_cco_fin_ult1_lag_1',
 'ind_cder_fin_ult1_lag_1',
 'ind_cno_fin_ult1_lag_1',
 'ind_ctju_fin_ult1_lag_1',
 'ind_ctma_fin_ult1_lag_1',
 'ind_ctop_fin_ult1_lag_1',
 'ind_ctpp_fin_ult1_lag_1',
 'ind_deco_fin_ult1_lag_1',
 'ind_ahor_fin_ult1_lag_2',
 'ind_aval_fin_ult1_lag_2',
 'ind_cco_fin_ult1_lag_2',
 'ind_cder_fin_ult1_lag_2',
 'ind_cno_fin_ult1_lag_2',
 'ind_ctju_fin_ult1_lag_2',
 'ind_ctma_fin_ult1_lag_2',
 'ind_ctop_fin_ult1_lag_2',
 'ind_ctpp_fin_ult1_lag_2',
 'ind_deco_fin_ult1_lag_2',
 'ind_ahor_fin_ult1_lag_3',
 'ind_aval_fin_ult1_lag_3',
 'ind_cco_fin_ult1_lag_3',
 'ind_cder_fin_ult1_lag_3',
 'ind_cno_fin_ult1_lag_3',
 'ind_ctju_fin_ult1_lag_3',
 'ind_ctma_fin_ult1_lag_3',
 'ind_ctop_fin_ult1_lag_3',
 'ind_ctpp_fin_ult1_lag_3',
 'ind_deco_fin_ult1_lag_3',
 'segmento_lag_1',
 'ind_actividad_cliente_lag_1',
 'canal_entrada_lag_1',
 'cod_prov_lag_1',
 'indrel_1mes_lag_1',
 'tiprel_1mes_lag_1',
 'segmento_lag_2',
 'ind_act

In [7]:
featured_sample

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,quarter,is_year_end,is_quarter_end,added_product,segmento_target_enc,ind_actividad_cliente_target_enc,canal_entrada_target_enc,cod_prov_target_enc,indrel_1mes_target_enc,tiprel_1mes_target_enc
0,2015-01-28,1013024,N,ES,H,23.0,2012-04-23,0.0,39,1.0,...,1,0,0,0,0.569336,0.413330,0.674805,0.185303,0.208252,0.454834
1,2015-01-28,1013031,N,ES,V,40.0,2012-04-23,0.0,39,1.0,...,1,0,0,1,0.555664,0.428223,0.668457,0.488525,0.197632,0.457031
2,2015-01-28,1013035,N,ES,V,37.0,2012-04-23,0.0,39,1.0,...,1,0,0,1,0.550781,0.414551,0.657227,0.227905,0.209839,0.428467
3,2015-01-28,1013036,N,ES,V,66.0,2012-04-23,0.0,39,1.0,...,1,0,0,1,0.541992,0.418701,0.675293,0.411133,0.203491,0.451660
4,2015-01-28,1013038,N,ES,V,38.0,2012-04-23,0.0,39,1.0,...,1,0,0,1,0.554199,0.066772,0.682129,0.432129,0.199097,0.461914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2015-01-28,1125272,N,ES,V,40.0,2013-03-21,0.0,28,1.0,...,1,0,0,0,0.565430,0.397949,0.670410,0.234619,0.214844,0.463135
49996,2015-01-28,1125274,N,ES,V,37.0,2013-03-21,0.0,28,1.0,...,1,0,0,0,0.562500,0.059448,0.664062,0.148804,0.200195,0.063538
49997,2015-01-28,1125278,N,ES,V,50.0,2013-03-21,0.0,28,1.0,...,1,0,0,1,0.543945,0.395996,0.673340,0.307617,0.207764,0.447754
49998,2015-01-28,1125282,N,ES,V,44.0,2013-03-21,0.0,28,1.0,...,1,0,0,1,0.563965,0.422363,0.668945,0.582031,0.209473,0.468018
