In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# train_df = pd.read_csv('fraudTrain.csv', index_col=0)
# test_df = pd.read_csv('fraudTest.csv', index_col=0)

In [None]:
# df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
# fraud_counts = df['is_fraud'].value_counts()
# print(fraud_counts)

In [None]:
# plt.figure(figsize=(8, 8))
# labels = ['Legitimate (0)', 'Fraud (1)']
# colors = ['#66b3ff', '#ff9999']
# explode = (0, 0.1)  
# plt.pie(fraud_counts, 
#         labels=labels, 
#         autopct='%1.1f%%', 
#         startangle=140, 
#         colors=colors, 
#         explode=explode, 
#         shadow=True)

# plt.title('Distribution of Fraudulent vs Legitimate Transactions', fontsize=15)
# plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
# plt.show()

In [None]:
# print("Value Counts:")
# print(fraud_counts)

In [None]:
# df.to_csv('cc_data.csv', index=False)
# print("File saved successfully as 'combined_fraud_data.csv'")

In [7]:
df = pd.read_csv('cc_data.csv')


In [8]:


pd.set_option("display.max_columns", 100)

2Ô∏è‚É£ Feature engineering


In [9]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['dob'] = pd.to_datetime(df['dob'])

# Sort by customer and time
df = df.sort_values(by=['cc_num', 'trans_date_trans_time']).reset_index(drop=True)

# Time features
df['transaction_hour'] = df['trans_date_trans_time'].dt.hour
df['transaction_day'] = df['trans_date_trans_time'].dt.day
df['transaction_month'] = df['trans_date_trans_time'].dt.month
df['transaction_weekday'] = df['trans_date_trans_time'].dt.weekday
df['is_weekend'] = df['transaction_weekday'].isin([5, 6]).astype(int)

# Age
df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days / 365.25

In [10]:
# -------------------------------
# Rolling transaction counts
# -------------------------------
df.set_index('trans_date_trans_time', inplace=True)
df['txn_count_1h'] = df.groupby('cc_num')['amt'].rolling('1h').count().reset_index(level=0, drop=True)
df['txn_count_24h'] = df.groupby('cc_num')['amt'].rolling('24h').count().reset_index(level=0, drop=True)

# Rolling amount statistics (24h, no leakage)
df['amt_mean_24h'] = df.groupby('cc_num')['amt'].rolling('24h', closed='left').mean().reset_index(level=0, drop=True)
df['amt_std_24h'] = df.groupby('cc_num')['amt'].rolling('24h', closed='left').std().reset_index(level=0, drop=True)
df['amt_zscore_24h'] = (df['amt'] - df['amt_mean_24h']) / (df['amt_std_24h'] + 1e-6)
df.reset_index(inplace=True)


In [11]:
# -------------------------------
# Geodistance
# -------------------------------
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    c = 2*np.arcsin(np.sqrt(a))
    return 6371 * c

df['geo_distance_km'] = haversine(df['lat'], df['long'], df['merch_lat'], df['merch_long'])



In [12]:

# -------------------------------
# Time-based train/test split (no leakage)
# -------------------------------
df = df.sort_values('trans_date_trans_time')
split_time = df['trans_date_trans_time'].quantile(0.8)
train_df = df[df['trans_date_trans_time'] <= split_time].copy()
test_df  = df[df['trans_date_trans_time'] > split_time].copy()

In [13]:
# -------------------------------
# 6 Handle missing values safely
# -------------------------------
cols_to_process = ['amt_mean_24h', 'amt_std_24h', 'amt_zscore_24h']
for col in cols_to_process:
    train_df[f'{col}_missing'] = train_df[col].isna().astype(int)
    test_df[f'{col}_missing'] = test_df[col].isna().astype(int)
    median = train_df[col].median()  # Use train median only
    train_df[col] = train_df[col].fillna(median)
    test_df[col] = test_df[col].fillna(median)

In [15]:

# -------------------------------
# Risk encoding (train-only, no leakage)
# -------------------------------
global_fraud_rate = train_df['is_fraud'].mean()

def risk_encode(train_df, test_df, col, target='is_fraud', min_samples=50):
    stats = train_df.groupby(col)[target].agg(['mean','count']).rename(columns={'mean':'fraudrate','count':'n'})
    stats['risk'] = (stats['fraudrate']*stats['n'] + global_fraud_rate*min_samples) / (stats['n']+min_samples)
    train_encoded = train_df[col].map(stats['risk']).fillna(global_fraud_rate)
    test_encoded  = test_df[col].map(stats['risk']).fillna(global_fraud_rate)
    return train_encoded, test_encoded

for col in ['merchant','category','job']:
    train_df[f'{col}_risk'], test_df[f'{col}_risk'] = risk_encode(train_df, test_df, col)

In [16]:
print(train_df.columns.tolist())

['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'transaction_hour', 'transaction_day', 'transaction_month', 'transaction_weekday', 'is_weekend', 'age', 'txn_count_1h', 'txn_count_24h', 'amt_mean_24h', 'amt_std_24h', 'amt_zscore_24h', 'geo_distance_km', 'amt_mean_24h_missing', 'amt_std_24h_missing', 'amt_zscore_24h_missing', 'merchant_risk', 'category_risk', 'job_risk']


In [17]:
# -------------------------------
# 8 Prepare final features
# -------------------------------
FINAL_FEATURES = [
    'amt', 'gender', 'city_pop', 'age', 'transaction_hour', 'transaction_day', 
    'transaction_month', 'transaction_weekday', 'is_weekend',
    'txn_count_1h', 'txn_count_24h', 'amt_mean_24h', 'amt_std_24h', 'amt_zscore_24h',
    'amt_mean_24h_missing', 'amt_std_24h_missing', 'amt_zscore_24h_missing',
    'geo_distance_km', 'merchant_risk', 'category_risk', 'job_risk',
    'lat', 'long', 'merch_lat', 'merch_long'
]


X_train = train_df[FINAL_FEATURES].copy()
y_train = train_df['is_fraud']
X_test  = test_df[FINAL_FEATURES].copy()
y_test  = test_df['is_fraud']

In [18]:

# -------------------------------
# Encode gender
# -------------------------------
X_train['gender'] = X_train['gender'].str.upper().map({'M':1,'F':0}).fillna(-1).astype(int)
X_test['gender']  = X_test['gender'].str.upper().map({'M':1,'F':0}).fillna(-1).astype(int)

In [17]:
X_train.head()


Unnamed: 0,amt,gender,city_pop,age,transaction_hour,transaction_day,transaction_month,transaction_weekday,is_weekend,txn_count_1h,txn_count_24h,amt_mean_24h,amt_std_24h,amt_zscore_24h,amt_mean_24h_missing,amt_std_24h_missing,amt_zscore_24h_missing,geo_distance_km,merchant_risk,category_risk,job_risk,lat,long,merch_lat,merch_long
839573,4.97,0,3495,30.814511,0,1,1,1,0,1.0,1.0,54.852361,41.299177,-0.298,0,0,0,78.597568,0.013787,0.013851,0.004182,36.0788,-81.1781,36.011293,-82.048315
68160,107.23,0,149,40.531143,0,1,1,1,0,1.0,1.0,54.852361,41.299177,-0.298,0,0,0,30.212176,0.010485,0.013605,0.001919,48.8878,-118.2105,49.159047,-118.186462
443631,220.11,1,4154,56.950034,0,1,1,1,0,1.0,1.0,54.852361,41.299177,-0.298,0,0,0,108.206083,0.001931,0.002415,0.012877,42.1808,-112.262,43.150704,-112.154481
974884,45.0,1,1939,51.969884,0,1,1,1,0,1.0,1.0,54.852361,41.299177,-0.298,0,0,0,95.673231,0.003075,0.004446,0.006886,46.2306,-112.1138,47.034331,-112.561071
702664,41.96,1,99,32.76386,0,1,1,1,0,1.0,1.0,54.852361,41.299177,-0.298,0,0,0,77.556744,0.003858,0.003093,0.005571,38.4207,-79.4629,38.674999,-78.632459


In [None]:
from imblearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
 


In [4]:
# ========================== CONFIG ==========================
CONFIG = {
    "experiment_name": "Fast_Model_Selection for Credit Card Fraud Detection",
    "mlflow_uri": "https://dagshub.com/VIKR4NT10/codesoft.mlflow",
    "repo_owner": "VIKR4NT10",
    "repo_name": "codesoft"
}

# ========================== MLflow + DAGsHub ==========================
mlflow.set_tracking_uri(CONFIG["mlflow_uri"])
dagshub.init(
    repo_owner=CONFIG["repo_owner"],
    repo_name=CONFIG["repo_name"],
    mlflow=True
)
mlflow.set_experiment(CONFIG["experiment_name"])

<Experiment: artifact_location='mlflow-artifacts:/c9b9923bb97249ce941d5332da26ec05', creation_time=1768716939875, experiment_id='4', last_update_time=1768716939875, lifecycle_stage='active', name='Fast_Model_Selection for Credit Card Fraud Detection', tags={}>

In [20]:
def evaluate(model, X_test, y_test):
    y_prob = model.predict_proba(X_test)[:, 1]
   
    return {
        "PR_AUC": average_precision_score(y_test, y_prob),
        "ROC_AUC": roc_auc_score(y_test, y_prob)
    }


In [21]:
TREE_FEATURES = X_train.columns.tolist()

LINEAR_FEATURES = [
    c for c in X_train.columns
    if c not in ["amt_mean_24h", "amt_std_24h"]
]


In [22]:
models = {
    "xgb": XGBClassifier(
        n_estimators=100,         # reduced from 300
        max_depth=4,              # reduced from 6
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
        tree_method="hist",
        n_jobs=-1,
        random_state=42
    ),
    "rf": RandomForestClassifier(
        n_estimators=100,         # reduced from 200
        max_depth=10,             # reduced from 18
        min_samples_leaf=50,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    ),
    "logreg": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1
    ),
    "mlp": MLPClassifier(
        hidden_layer_sizes=(64, 32),
        alpha=1e-4,
        max_iter=30,
        early_stopping=True,
        random_state=42
    )
}


In [23]:
# ---------- Samplers ----------
samplers = {
    "cost_sensitive": None,
    "tomek": TomekLinks(n_jobs=-1),
    "enn": EditedNearestNeighbours(n_neighbors=3, n_jobs=-1),
    "smote_enn": SMOTEENN(random_state=42)
}


In [24]:
experiments = [
    ("cost_sensitive", "xgb"),
    ("cost_sensitive", "rf"),
    ("cost_sensitive", "logreg"),
    ("tomek", "xgb"),        # will skip TomekLinks automatically
    ("tomek", "rf"),         # will skip TomekLinks automatically
    ("enn", "rf"),
    ("smote_enn", "logreg"),
    ("cost_sensitive", "mlp")
]


In [None]:
# import mlflow

# # Set your experiment name
# mlflow.set_experiment("credit_card_fraud_experiments")

# # Get the experiment object
# experiment = mlflow.get_experiment_by_name("credit_card_fraud_experiments")
# experiment_id = experiment.experiment_id
# print("Experiment ID:", experiment_id)
# runs = mlflow.search_runs(experiment_ids=[experiment_id])
# print(runs[["run_id", "status", "tags.mlflow.runName"]])
# for run_id in runs["run_id"]:
#     mlflow.delete_run(run_id)

# print(f"Deleted {len(runs)} runs from experiment '{experiment.name}'")


In [25]:
from sklearn.model_selection import train_test_split

FRAC = 0.3  # 30% of training data

X_train_sub, _, y_train_sub, _ = train_test_split(
    X_train,
    y_train,
    train_size=FRAC,
    stratify=y_train,
    random_state=42
)


In [26]:
# ---------------- Experiment loop ----------------
results = []
mlflow.set_experiment("credit_card_fraud_experiments")

for sampler_name, model_name in experiments:
    print(f"\nRunning: {sampler_name} + {model_name}")

    sampler = samplers[sampler_name]
    model = models[model_name]

    # Feature selection
    if model_name in ["xgb", "rf"]:
        Xtr, Xte = X_train_sub, X_test
        feature_type = "tree"
        use_scaler = False
    else:
        Xtr = X_train_sub[LINEAR_FEATURES]
        Xte = X_test[LINEAR_FEATURES]
        feature_type = "linear"
        use_scaler = True

    with mlflow.start_run():
        mlflow.log_param("model", model_name)
        mlflow.log_param("sampler", sampler_name)
        mlflow.log_param("feature_type", feature_type)
        mlflow.log_param("train_frac", FRAC)

        # ----- Build pipeline -----
        steps = []

        if sampler is not None:
            steps.append(("sampler", sampler))

        if use_scaler:
            steps.append(("scaler", StandardScaler()))

        steps.append(("model", model))

        pipeline = Pipeline(steps)

        # ----- Train -----
        pipeline.fit(Xtr, y_train_sub)

        # ----- Evaluate -----
        metrics = evaluate(pipeline, Xte, y_test)

        for k, v in metrics.items():
            mlflow.log_metric(k, v)

        mlflow.sklearn.log_model(pipeline, "model")

        results.append({
            "sampler": sampler_name,
            "model": model_name,
            **metrics
        })


Running: cost_sensitive + xgb




üèÉ View run receptive-horse-47 at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5/runs/e9ed5b8592b04cdbb27a13acf35b8d15
üß™ View experiment at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5

Running: cost_sensitive + rf




üèÉ View run chill-gull-299 at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5/runs/9fd3c16c7c884975840e7269df1ceb2b
üß™ View experiment at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5

Running: cost_sensitive + logreg




üèÉ View run adorable-goose-762 at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5/runs/1b07b372955d436494cbf930654f92f4
üß™ View experiment at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5

Running: tomek + xgb




üèÉ View run handsome-shad-336 at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5/runs/8baf5fc1e4d34dbdbb94b04f33a34312
üß™ View experiment at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5

Running: tomek + rf




üèÉ View run victorious-swan-781 at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5/runs/6c1c682ab6154de183e34e1add0cf4b5
üß™ View experiment at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5

Running: enn + rf




üèÉ View run clumsy-wren-362 at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5/runs/83a76c3a60bd420195e09bbffbeb5318
üß™ View experiment at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5

Running: smote_enn + logreg




üèÉ View run intelligent-grub-994 at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5/runs/43ea28b351e9419f90774c9491ab3453
üß™ View experiment at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5

Running: cost_sensitive + mlp




üèÉ View run industrious-swan-37 at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5/runs/a80625b9a27247fb9a12eac92afcd9f6
üß™ View experiment at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/5


In [27]:
results_df = pd.DataFrame(results).sort_values("PR_AUC", ascending=False)
print(results_df)


          sampler   model    PR_AUC   ROC_AUC
4           tomek      rf  0.798064  0.993175
5             enn      rf  0.784224  0.994010
1  cost_sensitive      rf  0.776676  0.993588
0  cost_sensitive     xgb  0.721121  0.991970
3           tomek     xgb  0.716431  0.991043
7  cost_sensitive     mlp  0.600800  0.966482
2  cost_sensitive  logreg  0.123529  0.853643
6       smote_enn  logreg  0.108587  0.839485


In [None]:
from sklearn.metrics import precision_score, recall_score

# Create a results list with precision and recall
results_with_pr = []

# Use a standard threshold 0.5 for now
THRESHOLD = 0.5

for entry in results:
    sampler_name = entry["sampler"]
    model_name = entry["model"]

    # Retrieve the fitted pipeline/model from MLflow or previous loop
    # Assuming you kept `fitted_model` for each combination
    pipeline_or_model = entry.get("fitted_model")  # If you stored it

    if pipeline_or_model is None:
        continue  # skip if model not stored

    # Predict probabilities
    y_prob = pipeline_or_model.predict_proba(X_test)[:, 1]

    # Convert to class predictions using threshold
    y_pred = (y_prob >= THRESHOLD).astype(int)

    # Compute precision and recall
    prec = precision_score(y_test, y_pred)
    rec  = recall_score(y_test, y_pred)

    results_with_pr.append({
        "sampler": sampler_name,
        "model": model_name,
        "PR_AUC": entry["PR_AUC"],
        "ROC_AUC": entry["ROC_AUC"],
        "Precision": prec,
        "Recall": rec
    })

# Convert to DataFrame
results_pr_df = pd.DataFrame(results_with_pr).sort_values("PR_AUC", ascending=False)
print(results_pr_df)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('cc_data.csv')
pd.set_option("display.max_columns", 100)
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['dob'] = pd.to_datetime(df['dob'])

# Sort by customer and time
df = df.sort_values(by=['cc_num', 'trans_date_trans_time']).reset_index(drop=True)

# Time features
df['transaction_hour'] = df['trans_date_trans_time'].dt.hour
df['transaction_day'] = df['trans_date_trans_time'].dt.day
df['transaction_month'] = df['trans_date_trans_time'].dt.month
df['transaction_weekday'] = df['trans_date_trans_time'].dt.weekday
df['is_weekend'] = df['transaction_weekday'].isin([5, 6]).astype(int)

# Age
df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days / 365.25

# -------------------------------
# Rolling transaction counts
# -------------------------------
df.set_index('trans_date_trans_time', inplace=True)
df['txn_count_1h'] = df.groupby('cc_num')['amt'].rolling('1h').count().reset_index(level=0, drop=True)
df['txn_count_24h'] = df.groupby('cc_num')['amt'].rolling('24h').count().reset_index(level=0, drop=True)

# Rolling amount statistics (24h, no leakage)
df['amt_mean_24h'] = df.groupby('cc_num')['amt'].rolling('24h', closed='left').mean().reset_index(level=0, drop=True)
df['amt_std_24h'] = df.groupby('cc_num')['amt'].rolling('24h', closed='left').std().reset_index(level=0, drop=True)
df['amt_zscore_24h'] = (df['amt'] - df['amt_mean_24h']) / (df['amt_std_24h'] + 1e-6)
df.reset_index(inplace=True)

# -------------------------------
# Geodistance
# -------------------------------
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    c = 2*np.arcsin(np.sqrt(a))
    return 6371 * c

df['geo_distance_km'] = haversine(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

# -------------------------------
# Time-based train/test split (no leakage)
# -------------------------------
df = df.sort_values('trans_date_trans_time')
split_time = df['trans_date_trans_time'].quantile(0.8)
train_df = df[df['trans_date_trans_time'] <= split_time].copy()
test_df  = df[df['trans_date_trans_time'] > split_time].copy()

# -------------------------------
# 6 Handle missing values safely
# -------------------------------
cols_to_process = ['amt_mean_24h', 'amt_std_24h', 'amt_zscore_24h']
for col in cols_to_process:
    train_df[f'{col}_missing'] = train_df[col].isna().astype(int)
    test_df[f'{col}_missing'] = test_df[col].isna().astype(int)
    median = train_df[col].median()  # Use train median only
    train_df[col] = train_df[col].fillna(median)
    test_df[col] = test_df[col].fillna(median)

# -------------------------------
# Risk encoding (train-only, no leakage)
# -------------------------------
global_fraud_rate = train_df['is_fraud'].mean()

def risk_encode(train_df, test_df, col, target='is_fraud', min_samples=50):
    stats = train_df.groupby(col)[target].agg(['mean','count']).rename(columns={'mean':'fraudrate','count':'n'})
    stats['risk'] = (stats['fraudrate']*stats['n'] + global_fraud_rate*min_samples) / (stats['n']+min_samples)
    train_encoded = train_df[col].map(stats['risk']).fillna(global_fraud_rate)
    test_encoded  = test_df[col].map(stats['risk']).fillna(global_fraud_rate)
    return train_encoded, test_encoded

for col in ['merchant','category','job']:
    train_df[f'{col}_risk'], test_df[f'{col}_risk'] = risk_encode(train_df, test_df, col)

# -------------------------------
# 8 Prepare final features
# -------------------------------
FINAL_FEATURES = [
    'amt', 'gender', 'city_pop', 'age', 'transaction_hour', 'transaction_day',
    'transaction_month', 'transaction_weekday', 'is_weekend',
    'txn_count_1h', 'txn_count_24h', 'amt_mean_24h', 'amt_std_24h', 'amt_zscore_24h',
    'amt_mean_24h_missing', 'amt_std_24h_missing', 'amt_zscore_24h_missing',
    'geo_distance_km', 'merchant_risk', 'category_risk', 'job_risk',
    'lat', 'long', 'merch_lat', 'merch_long'
]


X_train = train_df[FINAL_FEATURES].copy()
y_train = train_df['is_fraud']
X_test  = test_df[FINAL_FEATURES].copy()
y_test  = test_df['is_fraud']

# -------------------------------
# Encode gender
# -------------------------------
X_train['gender'] = X_train['gender'].str.upper().map({'M':1,'F':0}).fillna(-1).astype(int)
X_test['gender']  = X_test['gender'].str.upper().map({'M':1,'F':0}).fillna(-1).astype(int)

In [2]:
import mlflow
import dagshub
# ========================== CONFIG ==========================
CONFIG = {
    "experiment_name": "hyperparameter-tuning for credit-card fraud detection",
    "mlflow_uri": "https://dagshub.com/VIKR4NT10/codesoft.mlflow",
    "repo_owner": "VIKR4NT10",
    "repo_name": "codesoft"
}

# ========================== MLflow + DAGsHub ==========================
mlflow.set_tracking_uri(CONFIG["mlflow_uri"])
dagshub.init(
    repo_owner=CONFIG["repo_owner"],
    repo_name=CONFIG["repo_name"],
    mlflow=True
)
mlflow.set_experiment(CONFIG["experiment_name"])

<Experiment: artifact_location='mlflow-artifacts:/e38178a5cb324196a652f5d18b11e664', creation_time=1768754683163, experiment_id='6', last_update_time=1768754683163, lifecycle_stage='active', name='hyperparameter-tuning for credit-card fraud detection', tags={}>

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
import mlflow

# -------------------------
# 1. Subsample training data (10%)
# -------------------------
FRAC = 0.1

X_train_sub, _, y_train_sub, _ = train_test_split(
    X_train,
    y_train,
    train_size=FRAC,
    stratify=y_train,
    random_state=42
)

# -------------------------
# 2. Define pipeline
# -------------------------
pipeline = Pipeline([
    ("sampler", SMOTETomek(random_state=42)),
    ("rf", RandomForestClassifier(
        n_jobs=-1,
        random_state=42,
        class_weight=None  # handled by sampling
    ))
])

# -------------------------
# 3. Hyperparameter search space
# -------------------------
param_dist = {
    "rf__n_estimators": [100, 200, 300],
    "rf__max_depth": [None, 8, 12, 16],
    "rf__min_samples_leaf": [20, 50, 100],
    "rf__max_features": ["sqrt", "log2", 0.3, 0.5],
    "rf__bootstrap": [True, False]
}

# -------------------------
# 4. Cross-validation strategy
# -------------------------
cv = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=42
)

# -------------------------
# 5. RandomizedSearchCV
# -------------------------
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=15,                       # safe for Colab
    scoring="average_precision",     # PR-AUC
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# -------------------------
# 6. Run hyperparameter tuning
# -------------------------
with mlflow.start_run(run_name="RF_SMOTETomek_RandomSearch"):
    search.fit(X_train_sub, y_train_sub)

    mlflow.log_params(search.best_params_)
    mlflow.log_metric("best_cv_pr_auc", search.best_score_)

# -------------------------
# 7. Output best results
# -------------------------
print("Best PR-AUC (CV):", search.best_score_)
print("Best parameters:")
for k, v in search.best_params_.items():
    print(f"  {k}: {v}")


Fitting 3 folds for each of 15 candidates, totalling 45 fits
üèÉ View run RF_SMOTETomek_RandomSearch at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/6/runs/a3e0da395f644921b060efabb9ff7cf4
üß™ View experiment at: https://dagshub.com/VIKR4NT10/codesoft.mlflow/#/experiments/6
Best PR-AUC (CV): 0.8378649542477078
Best parameters:
  rf__n_estimators: 200
  rf__min_samples_leaf: 20
  rf__max_features: 0.5
  rf__max_depth: None
  rf__bootstrap: False
