# üéØ Training ML Simple - Fraud Detection

**Objectif** : Entra√Æner un RandomForest simple avec Recall > 90%

**Features** : amt, lat, long, city_pop, merch_lat, merch_long

**Target** : is_fraud

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, date


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from geopy import distance

import joblib
import dotenv

In [2]:
dotenv.load_dotenv('../.env')

True

In [3]:
csv_path = os.getenv('CSV_PATH')

## 1. Load Data

In [4]:
# Load dataset
data = pd.read_csv(csv_path, index_col=0)

print(f"Dataset shape: {data.shape}")
print(f"\nColumns: {list(data.columns)}")
print(f"\nFraud distribution:")
print(data['is_fraud'].value_counts(normalize=True))

Dataset shape: (555719, 22)

Columns: ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']

Fraud distribution:
is_fraud
0    0.99614
1    0.00386
Name: proportion, dtype: float64


## 2. Select Features and pipeline

In [5]:
df_test = data.sample(1000)
df_test.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
324742,2020-10-23 13:48:00,374125201044065,fraud_Collier LLC,home,76.79,Christopher,Gilbert,M,20937 Reed Lakes Apt. 271,Washington,...,38.9757,-77.0282,601723,"Optician, dispensing",1970-07-20,851a5a5c3e5d280be8ce905436f38132,1382536080,38.320918,-77.244308,0
373967,2020-11-14 04:15:32,630423337322,"fraud_Christiansen, Goyette and Schamberger",gas_transport,38.09,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,423b682b048838754bd3769d4e065319,1384402532,48.784571,-118.776208,0
7158,2020-06-23 08:16:25,3518758680062249,fraud_Rodriguez Group,gas_transport,72.6,Antonio,Koch,M,8738 Richard Brooks,Moab,...,38.5677,-109.5271,9772,Location manager,1989-11-24,c1a03ecd7efa8f7fc796e9bcb5a18f02,1371975385,38.131946,-110.238787,0
64866,2020-07-13 17:30:02,213195754007681,"fraud_Conroy, Balistreri and Gorczany",health_fitness,27.29,Zachary,Allen,M,69252 Oconnor Alley Apt. 153,Ollie,...,41.2001,-92.1354,568,Commercial horticulturist,1969-07-24,ef01801e45a382c8c49b373856c5aa98,1373736602,40.927051,-92.928472,0
288682,2020-10-06 23:46:47,4128730454058057622,"fraud_Schaefer, Fay and Hilll",entertainment,5.14,Monique,Martin,F,68276 Matthew Springs,Ratcliff,...,31.3833,-95.0619,43,"Engineer, production",1949-10-04,0b3071a683086cdcb1b5aa4c206b8bbc,1381103207,31.670275,-95.560212,0


### 2.0 create some function

In [6]:
def distance_cus_mer(lat1, lon1, lat2, lon2):
    customer = (lat1, lon1)
    merch = (lat2, lon2)
    dist = distance.distance(customer, merch).km
    return round(dist, 2)

In [7]:
def age(born):
    born  = datetime.strptime(born, '%Y-%m-%d').date()
    today  = date.today()
    return today.year  - born.year  - ((today.month,today.day) < (born.month,born.day))

In [8]:
def make_date_feature(df, col):
    df[col] = pd.to_datetime(df[col])
    df['time'] = pd.to_datetime(df[col]).dt.time
    df['hour'] = pd.to_datetime(df[col]).dt.hour
    df['is_night'] = df['hour'].between(22, 6).astype(int)
    df['is_morning'] = df['hour'].between(6, 12).astype(int)
    df['is_afternoon'] = df['hour'].between(12, 18).astype(int)
    df['is_evening'] = df['hour'].between(18, 22).astype(int)
    df['is_business_hour'] = df['hour'].between(8, 17).astype(int)
    df['year'] = df[col].dt.year
    df['month'] = df[col].dt.month
    df['day'] = df[col].dt.day
    df['dayofweek'] = df[col].dt.day_of_week
    df['is_we'] = df['dayofweek'].between(5, 6).astype(int)

### 2.1 make feature

#### 2.1.1 distance 

In [9]:
df_test['distance_km'] = df_test.apply(lambda row:distance_cus_mer(row['lat'], row['long'], row['merch_lat'], row['merch_long']),axis=1)

#### 2.1.2 merch name

In [10]:
df_test['merchant'] = df_test['merchant'].str.split('fraud_').str[-1]

#### 2.1.3 age & time

In [11]:
df_test['age'] = df_test.apply(lambda row:age(row['dob']),axis=1)

In [12]:
make_date_feature(df_test, col='trans_date_trans_time')

#### 2.1.4 drop columns

In [13]:
df_test = df_test.drop(df_test[['lat','long', 'merch_lat', 'merch_long', 'first', 'last', 'job', 'dob', 'trans_date_trans_time', 'unix_time', 'city', 'street']], axis=1)

In [14]:
df_test.head(5)

Unnamed: 0,cc_num,merchant,category,amt,gender,state,zip,city_pop,trans_num,is_fraud,...,is_night,is_morning,is_afternoon,is_evening,is_business_hour,year,month,day,dayofweek,is_we
324742,374125201044065,Collier LLC,home,76.79,M,DC,20012,601723,851a5a5c3e5d280be8ce905436f38132,0,...,0,0,1,0,1,2020,10,23,4,0
373967,630423337322,"Christiansen, Goyette and Schamberger",gas_transport,38.09,F,WA,99160,149,423b682b048838754bd3769d4e065319,0,...,0,0,0,0,0,2020,11,14,5,1
7158,3518758680062249,Rodriguez Group,gas_transport,72.6,M,UT,84532,9772,c1a03ecd7efa8f7fc796e9bcb5a18f02,0,...,0,1,0,0,1,2020,6,23,1,0
64866,213195754007681,"Conroy, Balistreri and Gorczany",health_fitness,27.29,M,IA,52576,568,ef01801e45a382c8c49b373856c5aa98,0,...,0,0,1,0,1,2020,7,13,0,0
288682,4128730454058057622,"Schaefer, Fay and Hilll",entertainment,5.14,F,TX,75858,43,0b3071a683086cdcb1b5aa4c206b8bbc,0,...,0,0,0,0,0,2020,10,6,1,0


### 2.2 apply it to data

In [15]:
data['distance_km'] = data.apply(lambda row:distance_cus_mer(row['lat'], row['long'], row['merch_lat'], row['merch_long']),axis=1)
data['merchant'] = data['merchant'].str.split('fraud_').str[-1]
data['age'] = data.apply(lambda row:age(row['dob']),axis=1)
make_date_feature(data, col='trans_date_trans_time')
data = data.drop(data[['lat','long', 'merch_lat', 'merch_long', 'first', 'last', 'job', 'dob', 'trans_date_trans_time', 'unix_time', 'time', 'trans_num', 'city', 'street']], axis=1)

In [16]:
data.head(5)

Unnamed: 0,cc_num,merchant,category,amt,gender,state,zip,city_pop,is_fraud,distance_km,...,is_night,is_morning,is_afternoon,is_evening,is_business_hour,year,month,day,dayofweek,is_we
0,2291163933867244,Kirlin and Sons,personal_care,2.86,M,SC,29209,333497,0,24.61,...,0,1,1,0,1,2020,6,21,6,1
1,3573030041201292,Sporer-Keebler,personal_care,29.84,F,UT,84002,302,0,104.83,...,0,1,1,0,1,2020,6,21,6,1
2,3598215285024754,"Swaniawski, Nitzsche and Welch",health_fitness,41.28,F,NY,11710,34496,0,59.2,...,0,1,1,0,1,2020,6,21,6,1
3,3591919803438423,Haley Group,misc_pos,60.05,M,FL,32780,54767,0,27.62,...,0,1,1,0,1,2020,6,21,6,1
4,3526826139003047,Johnston-Casper,travel,3.19,M,MI,49632,1126,0,104.42,...,0,1,1,0,1,2020,6,21,6,1


In [30]:
print(f"\nColumns: {list(data.columns)}")


Columns: ['cc_num', 'merchant', 'category', 'amt', 'gender', 'state', 'zip', 'city_pop', 'is_fraud', 'distance_km', 'age', 'hour', 'is_night', 'is_morning', 'is_afternoon', 'is_evening', 'is_business_hour', 'year', 'month', 'day', 'dayofweek', 'is_we']


In [17]:
target = 'is_fraud'

y = data[[target]]
X = data.drop(target, axis=1)

In [18]:
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = [feature for feature in X.columns if feature not in numerical_features]
print(f' numerical feature => {numerical_features}')
print(f' categorical feature => {categorical_features}')

 numerical feature => ['cc_num', 'amt', 'zip', 'city_pop', 'distance_km', 'age', 'hour', 'is_night', 'is_morning', 'is_afternoon', 'is_evening', 'is_business_hour', 'year', 'month', 'day', 'dayofweek', 'is_we']
 categorical feature => ['merchant', 'category', 'gender', 'state']


## 3. Train/Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y,  # Important for imbalanced data
    random_state=42
)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Train set: (444575, 21)
Test set: (111144, 21)


In [20]:
print("Encoding categorical features and standardizing numerical features...")
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first',
                                handle_unknown='ignore')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings on train set
print("...Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done...')
print(X_train[0:5]) 
print()

# Preprocessings on test set
print("...Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! 
print('...Done...')
print(X_test[0:5,:])

print("...all Done")

Encoding categorical features and standardizing numerical features...
...Performing preprocessings on train set...
                  cc_num                           merchant       category  \
219591  6011382886333463                          Upton PLC  entertainment   
385043   370877495212014            Casper, Hand and Zulauf    grocery_pos   
270811  3566373869538620                 Conroy-Cruickshank  gas_transport   
270180  6517217825320610  Parisian, Schiller and Altenwerth       misc_net   
551235   213125815021702                    Breitenberg LLC         travel   

           amt gender state    zip  city_pop  distance_km  age  ...  is_night  \
219591   18.25      M    MI  48636       864       114.13   83  ...         0   
385043  192.53      F    NC  28659     21134       109.92   41  ...         0   
270811   43.90      M    NM  87540       923       112.66   61  ...         0   
270180    1.25      M    MS  39665      1196       100.35   67  ...         0   
551235    9

## 4. Train Model (RandomForest)

In [21]:
# RandomForest with class_weight='balanced' for imbalanced data
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    class_weight='balanced',  # ‚≠ê Important for fraud detection
    random_state=42,
    n_jobs=-1
)

print("üî® Training RandomForest...")
model.fit(X_train, y_train)
print("‚úÖ Training completed!")

üî® Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


‚úÖ Training completed!


## 5. Evaluate Model

In [22]:
# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Metrics
print("\n" + "="*60)
print("üìä MODEL PERFORMANCE")
print("="*60)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraud']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

recall = recall_score(y_test, y_pred)
print(f"\n‚≠ê RECALL (most important): {recall:.2%}")

if recall >= 0.90:
    print("‚úÖ Target achieved (Recall >= 90%)")
else:
    print("‚ö†Ô∏è Target not reached (adjust class_weight or threshold)")


üìä MODEL PERFORMANCE

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      0.96      0.98    110715
       Fraud       0.08      0.82      0.14       429

    accuracy                           0.96    111144
   macro avg       0.54      0.89      0.56    111144
weighted avg       1.00      0.96      0.98    111144


Confusion Matrix:
[[106562   4153]
 [    79    350]]

‚≠ê RECALL (most important): 81.59%
‚ö†Ô∏è Target not reached (adjust class_weight or threshold)


## 6. Feature Importance

In [29]:
# Feature importance
importance_df = pd.DataFrame({
    'features' : preprocessor.get_feature_names_out(),
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False).head(10)

print("\nüìä Feature Importance:")
importance_df


üìä Feature Importance:


Unnamed: 0,features,importance
1,num__amt,0.327258
6,num__hour,0.120358
11,num__is_business_hour,0.105248
9,num__is_afternoon,0.071663
8,num__is_morning,0.044993
719,cat__category_shopping_net,0.031036
712,cat__category_grocery_pos,0.029997
13,num__month,0.026548
5,num__age,0.017051
14,num__day,0.014173


## 7. Save Model

In [None]:
# Save model & preprocessor
model_path = '../04_models/fraud_model.pkl'
preprocessor_path = '../04_models/preprocessor.pkl'
joblib.dump(preprocessor, preprocessor_path)
joblib.dump(model, model_path, compress=3)

print(f"\n‚úÖ Model saved to: {model_path}")
print(f"\nModel size: {os.path.getsize(model_path) / 1024:.2f} KB")

# Test loading
loaded_model = joblib.load(model_path)
test_pred = loaded_model.predict(X_test[:5])
print(f"\n‚úÖ Model loaded successfully!")
print(f"Test prediction: {test_pred}")

## 8. push to hf_model

### 8.1 import & config

In [None]:
import os
from huggingface_hub import HfApi
from dotenv import load_dotenv

load_dotenv()


print("=" * 60)
print("üöÄ Uploading Model + Preprocessor to HuggingFace Hub")
print("=" * 60)


# ----------------------------
# Configuration
# ----------------------------


repo_id = os.getenv('HF_MODEL_REPO', 'Terorra/fd_model_jedha')
token = os.getenv('HF_TOKEN')

if not token:
    print("‚ùå HF_TOKEN not found in .env")
    return

api = HfApi()


### 8.2 upload preprocessor

In [None]:
# ----------------------------
# Upload preprocessor
# ----------------------------


print("\nüì¶ Uploading preprocessor.pkl...")
try:
    api.upload_file(
        path_or_fileobj="../04_models/preprocessor.pkl",
        path_in_repo="preprocessor.pkl",
        repo_id=repo_id,
        token=token
    )
    print("‚úÖ Preprocessor uploaded")
except Exception as e:
    print(f"‚ùå Preprocessor upload failed: {e}")
    return

### 8.3 upload model

In [None]:
# ----------------------------
# Upload model
# ----------------------------


print("\nüì¶ Uploading fraud_model.pkl...")
try:
    api.upload_file(
        path_or_fileobj="../04_models/fraud_model.pkl",
        path_in_repo="fraud_model.pkl",
        repo_id=repo_id,
        token=token
    )
    print("‚úÖ Model uploaded")
except Exception as e:
    print(f"‚ùå Model upload failed: {e}")
    return

## 9. final test

In [None]:
# import  
import requests
from huggingface_hub import hf_hub_download
import joblib
import pandas as pd
from math import radians, sin, cos, sqrt, atan2

In [None]:
api_url = os.getenv('PAYMENT_API_URL')
hf_token = os.getenv('HF_TOKEN')

In [None]:
r = requests.get(api_url)
api_data = r.json()
df_api = pd.read_json(api_data, orient='split')

df_api['current_time'] = df_api['current_time'].astype(str)
df_api = df_api.rename(columns={'current_time': 'transaction_time'})
df_api['merchant'] = df_api['merchant'].str.split('fraud_').str[-1]

df_api

  df_api = pd.read_json(api_data, orient='split')


Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud,transaction_time
388961,38530489946071,Rempel Inc,shopping_net,2.56,Laura,Johns,F,95835 Garcia Rue,Arcadia,SC,29320,34.9572,-81.9916,530,Animal technologist,1989-05-14,62311691d3eb6b515a9b936d79982bea,34.707109,-81.093911,0,2026-01-28 23:54:22.833


In [None]:
# ===========================
# make some custom functtion 
# ===========================

# => support function 
# --------------------------------------------------

def age(born):
    """
    Calculate age from birth date
    
    Args:
        born (str): Birth date in format 'YYYY-MM-DD'
    
    Returns:
        int: Age in years, or None if invalid
    
    Example:
        >>> age('1990-01-15')
        36
    """
    if pd.isna(born) or born is None or born == '':
        return None
    
    try:
        born_date = datetime.strptime(str(born), '%Y-%m-%d').date()
        today = date.today()
        return today.year - born_date.year - ((today.month, today.day) < (born_date.month, born_date.day))
    except Exception as e:
        print(f"‚ö†Ô∏è Error calculating age for {born}: {e}")
        return None

def distance_cus_mer(lat1, lon1, lat2, lon2):
            
    """
    Calculate distance between two GPS coordinates using Haversine formula
    Fallback if geopy is not available
    
    Args:
        lat1, lon1: First point coordinates
        lat2, lon2: Second point coordinates
    
    Returns:
        float: Distance in kilometers
    """
    if any(pd.isna([lat1, lon1, lat2, lon2])):
        return None
    
    try:
        R = 6371  # Earth radius in km
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        c = 2 * atan2(sqrt(a), sqrt(1-a))
        return round(R * c, 2)
    except Exception as e:
        print(f"‚ö†Ô∏è Error calculating haversine distance: {e}")
        return None
    
def make_date_feature(df, col='transaction_time'):
    """
    Add comprehensive time-based features
    
    Features added:
    - time: Time of day
    - hour: Hour (0-23)
    - is_night: 22h-6h
    - is_morning: 6h-12h
    - is_afternoon: 12h-18h
    - is_evening: 18h-22h
    - is_business_hour: 8h-17h
    - year, month, day
    - dayofweek: 0=Monday, 6=Sunday
    - is_we: Weekend flag
    
    Args:
        df (DataFrame): Input data
        col (str): Name of datetime column
    
    Returns:
        None (modifies df in-place)
    """
    if col not in df.columns:
        print(f"‚ö†Ô∏è Column '{col}' not found for date features")
        return
    
    try:
        # Parse datetime
        df[col] = pd.to_datetime(df[col])
        
        # Time features
        df['time'] = df[col].dt.time
        df['hour'] = df[col].dt.hour
        
        # Time periods
        df['is_night'] = df['hour'].between(22, 6, inclusive="left").astype(int)
        df['is_morning'] = df['hour'].between(6, 12, inclusive="left").astype(int)
        df['is_afternoon'] = df['hour'].between(12, 18, inclusive="left").astype(int)
        df['is_evening'] = df['hour'].between(18, 22, inclusive="left").astype(int)
        df['is_business_hour'] = df['hour'].between(8, 17).astype(int)
        
        # Date components
        df['year'] = df[col].dt.year
        df['month'] = df[col].dt.month
        df['day'] = df[col].dt.day
        df['dayofweek'] = df[col].dt.day_of_week
        df['is_we'] = df['dayofweek'].between(5, 6).astype(int)
        
        print(f"  ‚úÖ Time features added from '{col}'")
        
    except Exception as e:
        print(f"  ‚ö†Ô∏è Time features failed: {e}")


# => apply function 
# --------------------------------------------------

def add_age_feature(df, dob_column='dob', verbose=True):
    """
    Add age feature to DataFrame
    
    Args:
        df (DataFrame): Input data
        dob_column (str): Name of date of birth column
        verbose (bool): Print progress
    
    Returns:
        DataFrame: Data with 'age' column added
    """
    df = df.copy()
    
    if dob_column not in df.columns:
        if verbose:
            print(f"  ‚ö†Ô∏è Column '{dob_column}' not found")
        return df
    
    try:
        df['age'] = df[dob_column].apply(age)
        valid_ages = df['age'].notna().sum()
        if verbose:
            print(f"  ‚úÖ Age feature added ({valid_ages}/{len(df)} valid)")
        return df
        
    except Exception as e:
        if verbose:
            print(f"  ‚ö†Ô∏è Age feature failed: {e}")
        return df

def add_distance_feature(df, 
                         client_lat='lat', client_lon='long',
                         merchant_lat='merch_lat', merchant_lon='merch_long',
                         verbose=True):
    """
    Add distance feature between client and merchant using geopy
    
    Args:
        df (DataFrame): Input data
        client_lat, client_lon: Client coordinate columns
        merchant_lat, merchant_lon: Merchant coordinate columns
        verbose (bool): Print progress
    
    Returns:
        DataFrame: Data with 'distance_km' column added
    """
    df = df.copy()
    
    required_cols = [client_lat, client_lon, merchant_lat, merchant_lon]
    missing_cols = [col for col in required_cols if col not in df.columns]
    
    if missing_cols:
        if verbose:
            print(f"  ‚ö†Ô∏è Missing columns for distance: {missing_cols}")
        return df
    
    try:
        df['distance_km'] = df.apply(
            lambda row: distance_cus_mer(
                row[client_lat], row[client_lon],
                row[merchant_lat], row[merchant_lon]
            ),
            axis=1
        )
        
        valid_distances = df['distance_km'].notna().sum()
        if verbose:
            print(f"  ‚úÖ Distance feature added ({valid_distances}/{len(df)} valid)")
        return df
        
    except Exception as e:
        if verbose:
            print(f"  ‚ö†Ô∏è Distance feature failed: {e}")
        return df
    
def add_time_features(df, time_column='transaction_time', verbose=True):
    """
    Add comprehensive time-based features
    
    Args:
        df (DataFrame): Input data
        time_column (str): Name of datetime column
        verbose (bool): Print progress
    
    Returns:
        DataFrame: Data with time features added
    """
    df = df.copy()
    
    if time_column not in df.columns:
        if verbose:
            print(f"  ‚ö†Ô∏è Column '{time_column}' not found")
        return df
    
    try:
        make_date_feature(df, col=time_column)
        
        if verbose:
            features = ['hour', 'is_night', 'is_morning', 'is_afternoon', 
                       'is_evening', 'is_business_hour', 'dayofweek', 'is_we']
            print(f"  ‚úÖ Time features added: {features}")
        return df
        
    except Exception as e:
        if verbose:
            print(f"  ‚ö†Ô∏è Time features failed: {e}")
        return df

def add_engineered_features(df, verbose=True):
    """
    Add ALL engineered features to DataFrame
    Main function used in DAG
    
    Features added:
    - age: Age from dob
    - distance_km: Customer-merchant distance
    - hour, is_night, is_morning, is_afternoon, is_evening, is_business_hour
    - year, month, day, dayofweek, is_we
    
    Args:
        df (DataFrame): Input data
        verbose (bool): Print progress messages
    
    Returns:
        DataFrame: Data with all engineered features
    """
    df = df.copy()
    
    if verbose:
        print(f"üîß Feature Engineering (starting with {len(df)} rows):")
    
    # Add age
    df = add_age_feature(df, verbose=verbose)
    
    # Add distance (using geopy)
    df = add_distance_feature(df, verbose=verbose)
    
    # Add time features
    df = add_time_features(df, verbose=verbose)
    
    # Note: amt_per_capita not added by default (uncomment if needed)
    # df = add_amount_features(df, verbose=verbose)
    
    if verbose:
        possible_features = ['age', 'distance_km', 'hour', 'is_night', 'is_morning',
                            'is_afternoon', 'is_evening', 'is_business_hour',
                            'year', 'month', 'day', 'dayofweek', 'is_we']
        added_features = [f for f in possible_features if f in df.columns]
        print(f"‚úÖ Feature engineering complete - Added: {len(added_features)} features")
    
    return df

def drop_columns_for_training(df, verbose=True):
    """
    Drop columns not needed for ML training
    Keeps only features used by the model
    
    Columns to drop:
    - lat, long, merch_lat, merch_long (used for distance calculation)
    - first, last, job, dob (personal info)
    - trans_date_trans_time, unix_time (raw time)
    - city, street (location details)
    - time (redundant with hour)
    - trans_num (ID, not a feature)
    
    Args:
        df (DataFrame): Data with all columns
        verbose (bool): Print progress
    
    Returns:
        DataFrame: Data with only training features
    """
    df = df.copy()
    
    # Columns to drop
    cols_to_drop = ['lat', 'long', 'merch_lat', 'merch_long',
                    'first', 'last', 'job', 'dob',
                    'transaction_time',
                    'city', 'street', 'time']
    
    # Only drop columns that exist
    existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
    
    if existing_cols_to_drop:
        df = df.drop(columns=existing_cols_to_drop)
        if verbose:
            print(f"  ‚úÖ Dropped {len(existing_cols_to_drop)} columns: {existing_cols_to_drop}")
    
    return df


In [None]:
df_api = add_engineered_features(df_api)

üîß Feature Engineering (starting with 1 rows):
  ‚úÖ Age feature added (1/1 valid)
  ‚úÖ Distance feature added (1/1 valid)
  ‚úÖ Time features added from 'transaction_time'
  ‚úÖ Time features added: ['hour', 'is_night', 'is_morning', 'is_afternoon', 'is_evening', 'is_business_hour', 'dayofweek', 'is_we']
‚úÖ Feature engineering complete - Added: 13 features


In [None]:
df_api = drop_columns_for_training(df_api)

  ‚úÖ Dropped 12 columns: ['lat', 'long', 'merch_lat', 'merch_long', 'first', 'last', 'job', 'dob', 'transaction_time', 'city', 'street', 'time']


In [None]:
df_api

Unnamed: 0,cc_num,merchant,category,amt,gender,state,zip,city_pop,trans_num,is_fraud,age,distance_km,hour,is_night,is_morning,is_afternoon,is_evening,is_business_hour,year,month,day,dayofweek,is_we
388961,38530489946071,Rempel Inc,shopping_net,2.56,F,SC,29320,530,62311691d3eb6b515a9b936d79982bea,0,36,86.52,23,0,0,0,0,0,2026,1,28,2,0


In [None]:
model_path = hf_hub_download(
    repo_id="Terorra/fd_model_jedha",
    filename="fraud_model.pkl"
)

fraud_model.pkl:   0%|          | 0.00/751k [00:00<?, ?B/s]

In [None]:
preprocessor_path = hf_hub_download(
    repo_id="Terorra/fd_model_jedha",
    filename="preprocessor.pkl"
)

preprocessor.pkl:   0%|          | 0.00/24.0k [00:00<?, ?B/s]

In [None]:
model = joblib.load(model_path)

In [None]:
preprocessor = joblib.load(preprocessor_path)

In [None]:
X_transformed = preprocessor.transform(df_api)
print(f"‚úÖ Data preprocessed: {X_transformed.shape}")

‚úÖ Data preprocessed: (1, 772)


In [None]:
predictions = model.predict(X_transformed)
probabilities = model.predict_proba(X_transformed)[:, 1]

In [None]:
df_api['is_fraud_pred'] = predictions
df_api['fraud_probability'] = probabilities
df_api

Unnamed: 0,cc_num,merchant,category,amt,gender,state,zip,city_pop,trans_num,is_fraud,age,distance_km,hour,is_night,is_morning,is_afternoon,is_evening,is_business_hour,year,month,day,dayofweek,is_we,is_fraud_pred,fraud_probability
388961,38530489946071,Rempel Inc,shopping_net,2.56,F,SC,29320,530,62311691d3eb6b515a9b936d79982bea,0,36,86.52,23,0,0,0,0,0,2026,1,28,2,0,1,0.521895
