#age
#city population bins
#time from last buy
#num of trans last transaction
#average amt per user
#check if the accounts commited fraud before
#SSN_State_Mismatch_Flag


In [4]:
import pandas as pd
import numpy as np

In [5]:
file_path = 'clensed_data.pkl'
df = pd.read_pickle(file_path)

print(" Data loaded successfully.")


 Data loaded successfully.


In [6]:
print(df.info(verbose=True, max_cols=30))

<class 'pandas.core.frame.DataFrame'>
Index: 2013945 entries, 0 to 17284449
Data columns (total 23 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Unnamed: 0          int64         
 1   ssn                 string        
 2   cc_num              int64         
 3   city                string        
 4   zip                 int64         
 5   lat                 float64       
 6   long                float64       
 7   city_pop            float64       
 8   job                 string        
 9   dob                 datetime64[ns]
 10  acct_num            int64         
 11  profile             string        
 12  trans_num           string        
 13  trans_date          datetime64[ns]
 14  category            string        
 15  amt                 float64       
 16  is_fraud            int64         
 17  merchant            string        
 18  merch_lat           float64       
 19  merch_long          float64       
 20  is_mal

Date Features

In [7]:
current_year = 2020

df['TX_MONTH'] = df['trans_date'].dt.month
print("TX_MONTH feature created.")
df['TX_DAY'] = df['trans_date'].dt.day
print("TX_DAY feature created.")
df['TX_HOUR'] = df['trans_timestamp'].dt.hour
print("TX_HOUR feature created.")

df['DOB_YEAR']= df['dob'].dt.year
print("DOB_YEAR feature created.")
df['DOB_MONTH']= df['dob'].dt.month
print("DOB_MONTH feature created.")


df['age'] = current_year - df['DOB_YEAR']
print("Age feature created.")
df['IS_WEEKEND'] = np.where(df['trans_date'].dt.dayofweek >= 5, 1, 0)
print("IS_WEEKEND feature created.")

TX_MONTH feature created.
TX_DAY feature created.
TX_HOUR feature created.
DOB_YEAR feature created.
DOB_MONTH feature created.
Age feature created.
IS_WEEKEND feature created.


Time since last transaction

In [8]:
df.sort_values(by=['cc_num', 'trans_timestamp'], inplace=True)

df['TIME_SINCE_LAST_TX'] = (
    df.groupby('cc_num')['trans_timestamp']
      .diff()
      .dt.total_seconds()
      .fillna(0)
)

print("'TIME_SINCE_LAST_TX' (in seconds) created.")

'TIME_SINCE_LAST_TX' (in seconds) created.


Velocity Features

In [9]:
df = df.reset_index(drop=True)
df.sort_values(by=['cc_num', 'trans_timestamp'], inplace=True)

WINDOWS = ['1h', '24h', '7d'] 

for window in WINDOWS:
    tx_count_col = f'TX_COUNT_{window}'
    amt_max_col = f'AMT_MAX_{window}'
    amt_avg_col = f'AMT_AVG_{window}'

    df[tx_count_col] = 0.0
    df[amt_max_col] = 0.0
    df[amt_avg_col] = 0.0

    for cc_val, grp in df.groupby('cc_num', sort=False):
        s = grp.set_index('trans_timestamp')['amt']

        cnt = s.rolling(window=window).count().shift(1).fillna(0).values
        mx = s.rolling(window=window).max().shift(1).fillna(0).values
        avg = s.rolling(window=window).mean().shift(1).fillna(0).values

        df.loc[grp.index, tx_count_col] = cnt
        df.loc[grp.index, amt_max_col] = mx
        df.loc[grp.index, amt_avg_col] = avg

print("Rolling Velocity Features (TX_COUNT_*, AMT_MAX_*, AMT_AVG_*) created.")

Rolling Velocity Features (TX_COUNT_*, AMT_MAX_*, AMT_AVG_*) created.


Relative amount

In [10]:
# overall lifetime average amount
user_avg_amt = df.groupby('cc_num')['amt'].mean().rename('USER_LIFETIME_AVG_AMT')

df = df.merge(user_avg_amt, on='cc_num', how='left')

df['AMT_vs_USER_AVG'] = df['amt'] / df['USER_LIFETIME_AVG_AMT']
df['AMT_vs_USER_AVG'].fillna(0, inplace=True) 

df.drop(columns=['USER_LIFETIME_AVG_AMT'], inplace=True)

print("Relative Amount Feature (AMT_vs_USER_AVG) created.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['AMT_vs_USER_AVG'].fillna(0, inplace=True)


Relative Amount Feature (AMT_vs_USER_AVG) created.


Geospatial Features:

In [11]:
from geopy.distance import great_circle

def calculate_great_circle_distance(row):
    """Calculates distance in meters between customer home and merchant."""
    try:
        customer_loc = (row['lat'], row['long'])
        merchant_loc = (row['merch_lat'], row['merch_long'])
        return great_circle(customer_loc, merchant_loc).meters
    except Exception:
        return 0 # Return 0 for bad coordinates

df['DIST_HOME_MERCH'] = df.apply(calculate_great_circle_distance, axis=1)

df['DIST_HOME_MERCH_LOG'] = np.log1p(df['DIST_HOME_MERCH'])
df.drop(columns=['DIST_HOME_MERCH'], inplace=True)

print("Geospatial Distance Feature (DIST_HOME_MERCH_LOG) created.")

Geospatial Distance Feature (DIST_HOME_MERCH_LOG) created.


In [12]:
profile_components = df['profile'].str.split('_', expand=True)
df['PROFILE_GEO_TYPE'] = profile_components[3]
print("'PROFILE_GEO_TYPE' created.")

'PROFILE_GEO_TYPE' created.


Transation Velocity Features

In [None]:
df = df.reset_index(drop=True)

df.sort_values(by=['cc_num', 'trans_timestamp'], inplace=True)

WINDOWS = ['1h', '24h', '7d']

for window in WINDOWS:
    print(f"Calculating features for window: {window}")

    tx_count_col = f'TX_COUNT_{window}'
    amt_max_col = f'AMT_MAX_{window}'
    amt_avg_col = f'AMT_AVG_{window}'

    df[tx_count_col] = 0.0
    df[amt_max_col] = 0.0
    df[amt_avg_col] = 0.0

    for cc_val, grp in df.groupby('cc_num', sort=False):
        s = grp.set_index('trans_timestamp')['amt']

        cnt = s.rolling(window=window).count().shift(1).fillna(0).values
        mx = s.rolling(window=window).max().shift(1).fillna(0).values
        avg = s.rolling(window=window).mean().shift(1).fillna(0).values

        df.loc[grp.index, tx_count_col] = cnt
        df.loc[grp.index, amt_max_col] = mx
        df.loc[grp.index, amt_avg_col] = avg

print("Rolling Velocity Features created successfully.")

Calculating features for window: 1h


In [None]:
#Historical average of the is_fraud flag, grouped by zip code.

df.sort_values(by=['trans_timestamp'], inplace=True)
df = df.reset_index(drop=True)

def zip_card_count_1d(group):
    s = group.set_index('trans_timestamp')['cc_num']
    res = s.rolling('24h').apply(lambda x: x.nunique(), raw=False).shift(1)
    return pd.Series(res.values, index=group.index).fillna(0)

df['ZIP_CARD_COUNT_1D'] = df.groupby('zip', group_keys=False).apply(zip_card_count_1d).astype(float)

print("'ZIP_CARD_COUNT_1D' (Unique Cards in 24h) created.")

SSN-Based Features

In [None]:
ssn_acct_counts = df.groupby('ssn')['acct_num'].nunique().rename('UNIQUE_ACCT_COUNT')
df = df.merge(ssn_acct_counts, on='ssn', how='left')
# 1 if the SSN is tied to more than one account, 0 otherwise
df['SSN_SHARED_FLAG'] = np.where(df['UNIQUE_ACCT_COUNT'] > 1, 1, 0)

df.drop(columns=['UNIQUE_ACCT_COUNT'], inplace=True)

print("'SSN_SHARED_FLAG' created.")

In [None]:
df = df.reset_index(drop=True)
df['SSN_COUNT_1D'] = 0.0

for ssn_val, grp in df.groupby('ssn', sort=False):
    sorted_grp = grp.sort_values('trans_timestamp')
    s = sorted_grp.set_index('trans_timestamp')['amt']
    cnt = s.rolling(window='24h').count().shift(1).fillna(0).values
    df.loc[sorted_grp.index, 'SSN_COUNT_1D'] = cnt

print("'SSN_COUNT_1D' (count in last 24 hours) created.")

CC_NUM Features

In [None]:
#finds if the cc number had previous frauds

df.sort_values(by=['cc_num', 'trans_timestamp'], inplace=True)

df['CC_CUM_FRAUD'] = df.groupby('cc_num')['is_fraud'].cumsum()
df['CC_PREV_FRAUD'] = df['CC_CUM_FRAUD'].shift(1).fillna(0)

df.drop(columns=['CC_CUM_FRAUD'], inplace=True)

print("'CC_PREV_FRAUD' (Historical Fraud Count) created.")

In [None]:
#card type
df['CC_BIN'] = df['cc_num'][:6]

print("'CC_BIN' (Categorical BIN) created.")

In [None]:
# total count of transactions for each cc_num
cc_lifetime_counts = df.groupby('cc_num')['amt'].count().rename('CC_COUNT_LIFETIME')
df = df.merge(cc_lifetime_counts, on='cc_num', how='left')

print("'CC_COUNT_LIFETIME' created.")

Binning

In [None]:
def create_and_drop_bins(df: pd.DataFrame, features_to_bin: list, num_bins: int = 5) -> pd.DataFrame:
    """
    Applies quantile binning to a list of numerical features, creates new 
    categorical columns, and drops the original numerical columns.

    Args:
        df (pd.DataFrame): The input DataFrame.
        features_to_bin (list): List of numerical column names to bin.
        num_bins (int): The number of quantile bins to create.

    Returns:
        pd.DataFrame: The DataFrame with new binned features and dropped originals.
    """
    binned_features = []
    dropped_originals = []
    
    print(f"Starting Quantile Binning for {features_to_bin} into {num_bins} bins.")

    for col in features_to_bin:
        new_col_name = f'{col}_BIN'
        
        if col not in df.columns:
            print(f"⚠️ Warning: Column '{col}' not found. Skipping.")
            continue
            
        try:
            #Creates bins with equal number of records
            df[new_col_name] = pd.qcut(
                df[col], 
                q=num_bins, 
                labels=False, 
                duplicates='drop' 
            ).astype('category').astype(str) 
            
            binned_features.append(new_col_name)
            dropped_originals.append(col)
            print(f"  - ✅ '{col}' successfully binned into {df[new_col_name].nunique()} segments.")
            
        except ValueError as e:
            print(f"  - ❌ Could not bin '{col}' into {num_bins} bins. Error: {e}")
            continue

    df.drop(columns=dropped_originals, inplace=True, errors='ignore') 
    
    print("\n--- Binning Summary ---")
    print(f"✅ Binned Features Created: {binned_features}")
    print(f"✅ Original Numerical Features Dropped: {dropped_originals}")
    
    return df



In [None]:
BINNING_CANDIDATES = [
    'amt', 
    'city_pop', 
    'TIME_SINCE_LAST_TX',
    'age',
    'AMT_vs_USER_AVG', 
    'ZIP_CARD_COUNT_1D',
    'AMT_AVG_7d', 
    'CC_COUNT_LIFETIME'
]

df = create_and_drop_bins(df, BINNING_CANDIDATES, num_bins=5)

Dropping original features

In [None]:
df.drop(columns=['profile','zip', 'long', 'lat', 'Unnamed: 0', 'cc_num', 'dob', 'acct_num', 'trans_num'
                 ,'trans_date','merch_lat', 'merch_long', 'trans_timestamp','ssn','DOB_YEAR', 'DOB_MONTH'], inplace=True)

In [None]:
df.describe()

##Hot One Encoding

In [None]:
CATEGORICAL_FEATURES = ['category','city', 'TX_MONTH', 'TX_HOUR','TX_DAY', 'amt_BIN', 'city_pop_BIN', 'TIME_SINCE_LAST_TX_BIN', 'age_BIN',
                         'merchant', 'job', 'PROFILE_GEO_TYPE', 'CC_BIN' ]

df = pd.get_dummies(
    df,
    columns=CATEGORICAL_FEATURES,
    prefix=CATEGORICAL_FEATURES,
    drop_first=True 
)
print(f"Encoding Complete. Final feature count: {df.shape[1] - 1}")

Standardization

In [None]:

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

ALL_FEATURES = X.columns.tolist()

print("Starting final Standardization for all features...")
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, index=X.index, columns=ALL_FEATURES)
df = pd.concat([X_scaled, y], axis=1)
print("Final Standardization Complete.")

Unbalanced Data fitting

In [None]:

X = df.drop(columns=['is_fraud'])
y = df['is_fraud'] 

total_samples = len(df)


X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, 
    test_size=0.3, 
    random_state=42, 
    stratify=y 
)


X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5,
    random_state=42, 
    stratify=y_temp 
)

train_percent = (X_train.shape[0] / total_samples) * 100
dev_percent = (X_dev.shape[0] / total_samples) * 100
test_percent = (X_test.shape[0] / total_samples) * 100

print(f"Total Samples: {total_samples:,}")
print(f"Training set size: {X_train.shape[0]:,} ({train_percent:.2f}%)")
print(f"Validation (Dev) set size: {X_dev.shape[0]:,} ({dev_percent:.2f}%)")
print(f"Testing set size: {X_test.shape[0]:,} ({test_percent:.2f}%)")
print("\n✅ Data split successfully with stratification.")

##Feature Selection:


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier


imbalance_ratio = (y_train == 0).sum() / (y_train == 1).sum()
class_weights = {0: 1, 1: imbalance_ratio}

models = {
    # 1. Lasso (L1 penalty)
    'Lasso_L1': LogisticRegression(penalty='l1', solver='saga', C=0.01, class_weight=class_weights, random_state=42, max_iter=3000),
    
    # 2. Ridge (L2 penalty)
    'Ridge_L2': LogisticRegression(penalty='l2', solver='saga', C=0.01, class_weight=class_weights, random_state=42, max_iter=3000),
    
    # 3. Linear SVM (L1 penalty) - rmoved due to long training times
    #'SVM_L1': LinearSVC(C=0.01, penalty="l1", dual=False, class_weight=class_weights, random_state=42, max_iter=2000),
    
    # 4. Gradient Boosting
    'GradientBoost': GradientBoostingClassifier(n_estimators=50, max_depth=4, random_state=42),
    
    # 5. Random Forest
    'RandomForest': RandomForestClassifier(n_estimators=50, max_depth=8, class_weight=class_weights, random_state=42, n_jobs=-1),
}

selection_results = {}
print("Starting Consensus Feature Selection...")

for name, model in models.items():
    print(f"-> Fitting {name}...")
    
    model.fit(X_train, y_train)

    if hasattr(model, 'coef_'):
        coef = model.coef_[0] if model.coef_.ndim > 1 else model.coef_
        selected = (np.abs(coef) > 1e-4).astype(int)
    elif hasattr(model, 'feature_importances_'):
        selected = (model.feature_importances_ > 1e-4).astype(int)
    else:
        selected = np.zeros(X_train.shape[1]).astype(int)
    
    selection_results[name] = selected

selection_df = pd.DataFrame(selection_results, index=X_train.columns)
selection_df.index.name = 'Feature'

selection_df['Sum'] = selection_df.sum(axis=1)
selection_df = selection_df.sort_values(by='Sum', ascending=False)

TOP_N_FEATURES = 20
print(f"Filtering down to the absolute top {TOP_N_FEATURES} features...")

final_selected_features_df = selection_df.sort_values(
    by='Sum', 
    ascending=False
).head(TOP_N_FEATURES)

final_selected_features = final_selected_features_df.index.tolist()

print(f"✅ Final feature count selected: {len(final_selected_features)}")
print("\nTop 20 Features Selected:")
print(final_selected_features_df[['Sum']].to_markdown())


In [None]:
final_selected_features

In [None]:
SELECTED_FEATURES = final_selected_features

X_train_filtered = X_train[SELECTED_FEATURES]
X_dev_filtered = X_dev[SELECTED_FEATURES]
X_test_filtered = X_test[SELECTED_FEATURES]

np.save('selected_features.npy', np.array(SELECTED_FEATURES))

df_train_final = pd.concat([X_train_filtered, y_train], axis=1)
df_dev_final = pd.concat([X_dev_filtered, y_dev], axis=1)
df_test_final = pd.concat([X_test_filtered, y_test], axis=1)

df_train_final.to_pickle('train_set_final_filtered.pkl')
df_dev_final.to_pickle('dev_set_final_filtered.pkl')
df_test_final.to_pickle('test_set_final_filtered.pkl')

print("\n Final, optimized, and filtered data sets saved for Modeling.")
