In [1]:
{
    "cells": [
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# MoMoGuard-GH: Fraud Detection Model Training\n",
                "\n",
                "This notebook demonstrates the process of training a machine learning model to detect fraudulent mobile money transactions. For the MVP, we'll use synthetic data that simulates typical fraud patterns in Ghana's mobile money ecosystem."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 1,
            "metadata": {},
            "source": [
                "from pathlib import Path\n",
                "\n",
                "import numpy as np\n",
                "import pandas as pd\n",
                "import matplotlib.pyplot as plt\n",
                "import seaborn as sns\n",
                "from sklearn.base import clone\n",
                "from sklearn.model_selection import train_test_split, GridSearchCV\n",
                "from sklearn.model_selection import cross_val_predict\n",
                "from sklearn.ensemble import RandomForestClassifier\n",
                "from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc\n",
                "from sklearn.metrics import average_precision_score, f1_score, precision_recall_curve\n",
                "from imblearn.over_sampling import SMOTE\n",
                "from imblearn.pipeline import Pipeline\n",
                "import joblib\n",
                "\n",
                "# Set random seed for reproducibility\n",
                "np.random.seed(42)"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## 1. Generate Synthetic Mobile Money Transaction Data\n",
                "\n",
                "Since real mobile money transaction data is sensitive and not easily available, we'll create synthetic data that resembles typical mobile money transactions in Ghana, with some fraudulent patterns included."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 2,
            "metadata": {},
            "source": [
                "def generate_synthetic_data(n_samples=10000, fraud_ratio=0.05):\n",
                "    \"\"\"\n",
                "    Generate synthetic mobile money transaction data with fraud indicators.\n",
                "    \n",
                "    Legitimate and fraudulent rows are drawn from different distributions,\n",
                "    concatenated and shuffled. The draws use NumPy's global random state.\n",
                "    \n",
                "    Parameters:\n",
                "    n_samples (int): Number of transactions to generate\n",
                "    fraud_ratio (float): Ratio of fraudulent transactions, between 0 and 1\n",
                "    \n",
                "    Returns:\n",
                "    DataFrame: Synthetic transaction data with an integer 0/1 'is_fraud' label\n",
                "    \n",
                "    Raises:\n",
                "    ValueError: If n_samples is negative or fraud_ratio is outside [0, 1]\n",
                "    \"\"\"\n",
                "    if n_samples < 0:\n",
                "        raise ValueError(f'n_samples must be non-negative, got {n_samples}')\n",
                "    if not 0 <= fraud_ratio <= 1:\n",
                "        raise ValueError(f'fraud_ratio must be between 0 and 1, got {fraud_ratio}')\n",
                "    \n",
                "    # Number of fraudulent and legitimate transactions\n",
                "    n_fraud = int(n_samples * fraud_ratio)\n",
                "    n_legitimate = n_samples - n_fraud\n",
                "    \n",
                "    # Generate legitimate transactions\n",
                "    legitimate_data = {\n",
                "        'amount': np.random.gamma(2, 100, n_legitimate),  # Typical transaction amounts\n",
                "        'is_foreign_receiver': np.random.choice([0, 1], size=n_legitimate, p=[0.95, 0.05]),  # Most receivers are local\n",
                "        'num_recent_transactions': np.random.poisson(10, n_legitimate),  # Average transaction history\n",
                "        'avg_transaction_amount': np.random.gamma(2, 80, n_legitimate),  # Typical average amounts\n",
                "        'transaction_frequency_change': np.random.normal(0, 0.3, n_legitimate),  # Stable transaction patterns\n",
                "        'is_new_receiver': np.random.choice([0, 1], size=n_legitimate, p=[0.7, 0.3]),  # Most receivers are known\n",
                "        'time_of_day_risk': np.random.choice([0, 1], size=n_legitimate, p=[0.85, 0.15]),  # Mostly daytime transactions\n",
                "        'is_fraud': np.zeros(n_legitimate, dtype=int)  # Not fraud (integer label)\n",
                "    }\n",
                "    \n",
                "    # Generate fraudulent transactions with distinct patterns\n",
                "    fraud_data = {\n",
                "        'amount': np.random.gamma(5, 150, n_fraud),  # Higher amounts on average\n",
                "        'is_foreign_receiver': np.random.choice([0, 1], size=n_fraud, p=[0.6, 0.4]),  # More foreign receivers\n",
                "        'num_recent_transactions': np.random.poisson(3, n_fraud),  # Less transaction history\n",
                "        'avg_transaction_amount': np.random.gamma(1.5, 50, n_fraud),  # Lower historical amounts\n",
                "        'transaction_frequency_change': np.random.normal(1.5, 0.8, n_fraud),  # Sudden increase in frequency\n",
                "        'is_new_receiver': np.random.choice([0, 1], size=n_fraud, p=[0.2, 0.8]),  # Mostly new receivers\n",
                "        'time_of_day_risk': np.random.choice([0, 1], size=n_fraud, p=[0.3, 0.7]),  # More night transactions\n",
                "        'is_fraud': np.ones(n_fraud, dtype=int)  # Is fraud (integer label)\n",
                "    }\n",
                "    \n",
                "    # Combine both groups into a new mapping rather than overwriting legitimate_data\n",
                "    combined_data = {\n",
                "        key: np.concatenate([legitimate_data[key], fraud_data[key]])\n",
                "        for key in legitimate_data\n",
                "    }\n",
                "    \n",
                "    # Create DataFrame\n",
                "    df = pd.DataFrame(combined_data)\n",
                "    \n",
                "    # Shuffle the data so the fraud rows are not grouped at the end\n",
                "    df = df.sample(frac=1).reset_index(drop=True)\n",
                "    \n",
                "    return df\n",
                "\n",
                "# Generate data\n",
                "transactions_df = generate_synthetic_data()\n",
                "\n",
                "# Display first few rows\n",
                "transactions_df.head()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 3,
            "metadata": {},
            "source": [
                "# Save synthetic data to CSV, creating the target directory on a fresh checkout\n",
                "data_path = Path('../data/sim_swap_fraud.csv')\n",
                "data_path.parent.mkdir(parents=True, exist_ok=True)\n",
                "transactions_df.to_csv(data_path, index=False)\n",
                "\n",
                "# Basic statistics (the count is cast to int so it is not printed as e.g. 500.0)\n",
                "print(f\"Total transactions: {len(transactions_df)}\")\n",
                "print(f\"Fraudulent transactions: {int(transactions_df['is_fraud'].sum())} ({transactions_df['is_fraud'].mean()*100:.2f}%)\")"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## 2. Exploratory Data Analysis"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 4,
            "metadata": {},
            "source": [
                "# Summary statistics\n",
                "# display() is needed: a cell only renders its last bare expression on its own\n",
                "print(\"Summary statistics for all transactions:\")\n",
                "display(transactions_df.describe())\n",
                "\n",
                "# Summary statistics by fraud status\n",
                "print(\"\\nSummary statistics for legitimate transactions:\")\n",
                "display(transactions_df[transactions_df['is_fraud'] == 0].describe())\n",
                "\n",
                "print(\"\\nSummary statistics for fraudulent transactions:\")\n",
                "display(transactions_df[transactions_df['is_fraud'] == 1].describe())"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 5,
            "metadata": {},
            "source": [
                "# Set up the visualization style\n",
                "# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'; fall back to the old name on older versions\n",
                "whitegrid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'\n",
                "plt.style.use(whitegrid_style)\n",
                "plt.rcParams['figure.figsize'] = (12, 8)\n",
                "plt.rcParams['font.size'] = 12\n",
                "\n",
                "# Label the classes in the data so seaborn builds the legend itself; overriding it with\n",
                "# plt.legend([...]) relabels artists in draw order, which can swap the two classes\n",
                "status_order = ['Legitimate', 'Fraudulent']\n",
                "plot_df = transactions_df.assign(\n",
                "    Status=transactions_df['is_fraud'].astype(int).map({0: 'Legitimate', 1: 'Fraudulent'})\n",
                ")\n",
                "\n",
                "# Distribution of transaction amounts by fraud status\n",
                "plt.figure(figsize=(12, 6))\n",
                "sns.histplot(data=plot_df, x='amount', hue='Status', hue_order=status_order, bins=50, kde=True, element='step')\n",
                "plt.title('Distribution of Transaction Amounts by Fraud Status')\n",
                "plt.xlabel('Transaction Amount (GHS)')\n",
                "plt.ylabel('Count')\n",
                "plt.xlim(0, 1500)  # Limit x-axis for better visualization\n",
                "plt.show()\n",
                "\n",
                "# Transaction frequency change by fraud status\n",
                "plt.figure(figsize=(12, 6))\n",
                "sns.boxplot(data=transactions_df, x='is_fraud', y='transaction_frequency_change')\n",
                "plt.title('Transaction Frequency Change by Fraud Status')\n",
                "plt.xlabel('Is Fraud')\n",
                "plt.ylabel('Transaction Frequency Change')\n",
                "plt.xticks([0, 1], ['Legitimate', 'Fraudulent'])\n",
                "plt.show()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 6,
            "metadata": {},
            "source": [
                "# Correlation matrix\n",
                "plt.figure(figsize=(10, 8))\n",
                "correlation_matrix = transactions_df.corr()\n",
                "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt='.2f')\n",
                "plt.title('Correlation Matrix of Transaction Features')\n",
                "plt.show()\n",
                "\n",
                "# Feature distribution comparisons\n",
                "numerical_features = ['amount', 'num_recent_transactions', 'avg_transaction_amount', \n",
                "                     'transaction_frequency_change']\n",
                "categorical_features = ['is_foreign_receiver', 'is_new_receiver', 'time_of_day_risk']\n",
                "\n",
                "# Label the classes in the data so seaborn builds the legend itself; overriding it with\n",
                "# legend([...]) relabels artists in draw order, which can swap the two classes\n",
                "status_order = ['Legitimate', 'Fraudulent']\n",
                "plot_df = transactions_df.assign(\n",
                "    Status=transactions_df['is_fraud'].astype(int).map({0: 'Legitimate', 1: 'Fraudulent'})\n",
                ")\n",
                "\n",
                "# Size the grid from the feature count: a fixed 2x3 grid cannot hold all 7 panels\n",
                "n_panels = len(numerical_features) + len(categorical_features)\n",
                "n_cols = 3\n",
                "n_rows = -(-n_panels // n_cols)  # ceiling division\n",
                "fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))\n",
                "axes = axes.flatten()\n",
                "\n",
                "# Plot histograms for numerical features\n",
                "for i, feature in enumerate(numerical_features):\n",
                "    sns.histplot(data=plot_df, x=feature, hue='Status', hue_order=status_order, kde=True, element='step', ax=axes[i])\n",
                "    axes[i].set_title(f'Distribution of {feature.replace(\"_\", \" \").title()}')\n",
                "\n",
                "# Plot count plots for categorical features, continuing after the numerical panels\n",
                "for i, feature in enumerate(categorical_features, start=len(numerical_features)):\n",
                "    sns.countplot(data=plot_df, x=feature, hue='Status', hue_order=status_order, ax=axes[i])\n",
                "    axes[i].set_title(f'Count of {feature.replace(\"_\", \" \").title()}')\n",
                "\n",
                "# Hide any unused panels in the last row\n",
                "for unused_ax in axes[n_panels:]:\n",
                "    unused_ax.set_visible(False)\n",
                "\n",
                "plt.tight_layout()\n",
                "plt.show()"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## 3. Feature Engineering\n",
                "\n",
                "Let's create some additional features that might be helpful for fraud detection."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 7,
            "metadata": {},
            "source": [
                "def engineer_features(df):\n",
                "    \"\"\"\n",
                "    Derive additional risk features for fraud detection.\n",
                "    \n",
                "    Parameters:\n",
                "    df (DataFrame): Original transaction data\n",
                "    \n",
                "    Returns:\n",
                "    DataFrame: Copy of the data with the engineered columns added\n",
                "    \"\"\"\n",
                "    # Weighted sum of the behavioural risk flags\n",
                "    frequency_spike = df['transaction_frequency_change'] > 1\n",
                "    behaviour_risk = (\n",
                "        df['is_foreign_receiver'] * 2 +\n",
                "        df['is_new_receiver'] * 1.5 +\n",
                "        df['time_of_day_risk'] * 1 +\n",
                "        frequency_spike * 2\n",
                "    )\n",
                "    \n",
                "    # assign() returns a new frame, so the caller's DataFrame is left untouched\n",
                "    return df.assign(\n",
                "        # Amount relative to the historical average (+1 to avoid division by zero)\n",
                "        amount_avg_ratio=df['amount'] / (df['avg_transaction_amount'] + 1),\n",
                "        combined_risk_score=behaviour_risk,\n",
                "        # Log-transformed and scaled amount (high amounts are riskier)\n",
                "        amount_risk=np.log1p(df['amount']) / 10,\n",
                "        # Fewer recent transactions means higher risk\n",
                "        history_risk=np.exp(-df['num_recent_transactions'] / 10),\n",
                "        # Overall score, computed from the columns assigned above\n",
                "        risk_score=lambda frame: (\n",
                "            frame['combined_risk_score'] * 0.4 +\n",
                "            frame['amount_risk'] * 0.3 +\n",
                "            frame['history_risk'] * 0.3\n",
                "        ),\n",
                "    )\n",
                "\n",
                "# Apply feature engineering\n",
                "enhanced_df = engineer_features(transactions_df)\n",
                "\n",
                "# Display first few rows with new features\n",
                "enhanced_df.head()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 8,
            "metadata": {},
            "source": [
                "# Analyze new features: risk score split by class\n",
                "fig, ax = plt.subplots(figsize=(12, 6))\n",
                "sns.boxplot(data=enhanced_df, x='is_fraud', y='risk_score', ax=ax)\n",
                "ax.set_title('Risk Score by Fraud Status')\n",
                "ax.set_xlabel('Is Fraud')\n",
                "ax.set_ylabel('Risk Score')\n",
                "ax.set_xticks([0, 1])\n",
                "ax.set_xticklabels(['Legitimate', 'Fraudulent'])\n",
                "plt.show()\n",
                "\n",
                "# Correlation of new features with fraud\n",
                "engineered_columns = ['amount_avg_ratio', 'combined_risk_score', 'amount_risk', 'history_risk', 'risk_score']\n",
                "new_features_corr = enhanced_df[engineered_columns + ['is_fraud']].corr()['is_fraud'].sort_values()\n",
                "\n",
                "fig, ax = plt.subplots(figsize=(10, 6))\n",
                "new_features_corr.drop('is_fraud').plot(kind='barh', ax=ax)\n",
                "ax.set_title('Correlation of New Features with Fraud')\n",
                "ax.set_xlabel('Correlation Coefficient')\n",
                "fig.tight_layout()\n",
                "plt.show()"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## 4. Model Training\n",
                "\n",
                "Now let's prepare the data for modeling and train a Random Forest classifier."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 9,
            "metadata": {},
            "source": [
                "# Select features and target\n",
                "base_features = ['amount', 'is_foreign_receiver', 'num_recent_transactions', 'avg_transaction_amount',\n",
                "                 'transaction_frequency_change', 'is_new_receiver', 'time_of_day_risk']\n",
                "engineered_features = ['amount_avg_ratio', 'combined_risk_score', 'amount_risk', 'history_risk', 'risk_score']\n",
                "features = base_features + engineered_features\n",
                "\n",
                "X = enhanced_df[features]\n",
                "y = enhanced_df['is_fraud']\n",
                "\n",
                "# Hold out 25% of the rows for testing; stratify so both splits keep the same fraud ratio\n",
                "X_train, X_test, y_train, y_test = train_test_split(\n",
                "    X, y, test_size=0.25, stratify=y, random_state=42\n",
                ")\n",
                "\n",
                "print(f\"Training set shape: {X_train.shape}\")\n",
                "print(f\"Testing set shape: {X_test.shape}\")\n",
                "print(f\"Fraud ratio in training set: {y_train.mean():.4f}\")\n",
                "print(f\"Fraud ratio in testing set: {y_test.mean():.4f}\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 10,
            "metadata": {},
            "source": [
                "# Balance the training classes with SMOTE (the test split is left untouched)\n",
                "smote = SMOTE(random_state=42)\n",
                "X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)\n",
                "\n",
                "# Compare sizes and fraud ratios before and after resampling\n",
                "original_fraud_ratio = y_train.mean()\n",
                "resampled_fraud_ratio = y_train_resampled.mean()\n",
                "\n",
                "print(f\"Original training set shape: {X_train.shape}\")\n",
                "print(f\"Resampled training set shape: {X_train_resampled.shape}\")\n",
                "print(f\"Original fraud ratio in training set: {original_fraud_ratio:.4f}\")\n",
                "print(f\"Resampled fraud ratio in training set: {resampled_fraud_ratio:.4f}\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 11,
            "metadata": {},
            "source": [
                "# Tune a Random Forest with SMOTE applied inside each cross-validation fold.\n",
                "# Oversampling before cross-validation lets synthetic points derived from validation-fold\n",
                "# rows leak into the training folds, which inflates the cross-validated F1 score.\n",
                "rf_model = RandomForestClassifier(random_state=42)\n",
                "tuning_pipeline = Pipeline([\n",
                "    ('smote', SMOTE(random_state=42)),\n",
                "    ('rf', rf_model),\n",
                "])\n",
                "\n",
                "# Define parameter grid for hyperparameter tuning ('rf__' targets the forest step of the pipeline)\n",
                "param_grid = {\n",
                "    'rf__n_estimators': [50, 100, 200],\n",
                "    'rf__max_depth': [None, 10, 20, 30],\n",
                "    'rf__min_samples_split': [2, 5, 10],\n",
                "    'rf__min_samples_leaf': [1, 2, 4]\n",
                "}\n",
                "\n",
                "# Use GridSearchCV on the un-resampled training split; the pipeline resamples per fold\n",
                "grid_search = GridSearchCV(tuning_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)\n",
                "grid_search.fit(X_train, y_train)\n",
                "\n",
                "# Get the best model: the fitted forest step, refit on the SMOTE-resampled full training split\n",
                "best_rf_model = grid_search.best_estimator_.named_steps['rf']\n",
                "print(f\"Best hyperparameters: {grid_search.best_params_}\")\n",
                "print(f\"Best cross-validation F1 score: {grid_search.best_score_:.4f}\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 12,
            "metadata": {},
            "source": [
                "# Evaluate the model on the test set: hard labels and fraud-class probabilities\n",
                "y_pred = best_rf_model.predict(X_test)\n",
                "y_prob = best_rf_model.predict_proba(X_test)[:, 1]\n",
                "\n",
                "# Classification report\n",
                "print(\"Classification Report:\")\n",
                "print(classification_report(y_test, y_pred))\n",
                "\n",
                "# Confusion matrix (rows are actual classes, columns are predicted classes)\n",
                "class_names = ['Legitimate', 'Fraudulent']\n",
                "conf_matrix = confusion_matrix(y_test, y_pred)\n",
                "\n",
                "fig, ax = plt.subplots(figsize=(8, 6))\n",
                "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',\n",
                "            xticklabels=class_names, yticklabels=class_names, ax=ax)\n",
                "ax.set_title('Confusion Matrix')\n",
                "ax.set_xlabel('Predicted')\n",
                "ax.set_ylabel('Actual')\n",
                "plt.show()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 13,
            "metadata": {},
            "source": [
                "# ROC curve\n",
                "fpr, tpr, _ = roc_curve(y_test, y_prob)\n",
                "roc_auc = auc(fpr, tpr)\n",
                "\n",
                "fig, ax = plt.subplots(figsize=(8, 6))\n",
                "ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')\n",
                "ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal\n",
                "ax.set_xlim([0.0, 1.0])\n",
                "ax.set_ylim([0.0, 1.05])\n",
                "ax.set_xlabel('False Positive Rate')\n",
                "ax.set_ylabel('True Positive Rate')\n",
                "ax.set_title('Receiver Operating Characteristic (ROC) Curve')\n",
                "ax.legend(loc='lower right')\n",
                "plt.show()\n",
                "\n",
                "# Feature importance, most important first\n",
                "feature_importance = (\n",
                "    pd.DataFrame({'Feature': features, 'Importance': best_rf_model.feature_importances_})\n",
                "    .sort_values(by='Importance', ascending=False)\n",
                ")\n",
                "\n",
                "fig, ax = plt.subplots(figsize=(10, 6))\n",
                "sns.barplot(data=feature_importance, x='Importance', y='Feature', ax=ax)\n",
                "ax.set_title('Feature Importance')\n",
                "ax.set_xlabel('Importance')\n",
                "fig.tight_layout()\n",
                "plt.show()"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## 5. Model Tuning and Threshold Optimization\n",
                "\n",
                "For fraud detection, we often need to adjust the classification threshold to balance between false positives and false negatives."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 14,
            "metadata": {},
            "source": [
                "# Calculate and plot precision-recall curve\n",
                "# (precision_recall_curve and average_precision_score are imported in the notebook's import cell)\n",
                "precision, recall, thresholds = precision_recall_curve(y_test, y_prob)\n",
                "avg_precision = average_precision_score(y_test, y_prob)\n",
                "\n",
                "plt.figure(figsize=(10, 6))\n",
                "plt.plot(recall, precision, color='blue', lw=2, label=f'Precision-Recall curve (AP = {avg_precision:.2f})')\n",
                "plt.xlabel('Recall')\n",
                "plt.ylabel('Precision')\n",
                "plt.title('Precision-Recall Curve')\n",
                "plt.legend(loc='best')\n",
                "plt.grid(True)\n",
                "plt.show()\n",
                "\n",
                "# Plot thresholds vs precision and recall\n",
                "# precision/recall have one more entry than thresholds, so drop their final point\n",
                "plt.figure(figsize=(10, 6))\n",
                "plt.plot(thresholds, precision[:-1], 'b--', label='Precision')\n",
                "plt.plot(thresholds, recall[:-1], 'g-', label='Recall')\n",
                "plt.xlabel('Threshold')\n",
                "plt.ylabel('Score')\n",
                "plt.title('Precision and Recall vs. Threshold')\n",
                "plt.legend()\n",
                "plt.grid(True)\n",
                "plt.show()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 15,
            "metadata": {},
            "source": [
                "# Choose the decision threshold on out-of-fold training predictions, not on the test set:\n",
                "# tuning it on the test set would make every test metric reported afterwards optimistically biased.\n",
                "# SMOTE sits inside the pipeline so each fold is resampled using its own training rows only.\n",
                "threshold_pipeline = Pipeline([\n",
                "    ('smote', SMOTE(random_state=42)),\n",
                "    ('rf', clone(best_rf_model)),  # same hyperparameters, unfitted copy\n",
                "])\n",
                "oof_prob = cross_val_predict(threshold_pipeline, X_train, y_train, cv=5, method='predict_proba')[:, 1]\n",
                "\n",
                "# Calculate F1 score for different thresholds\n",
                "thresholds_to_try = np.arange(0.1, 0.9, 0.05)\n",
                "f1_scores = [\n",
                "    f1_score(y_train, (oof_prob >= threshold).astype(int))\n",
                "    for threshold in thresholds_to_try\n",
                "]\n",
                "\n",
                "# Find the threshold with the highest out-of-fold F1 score\n",
                "best_threshold_idx = int(np.argmax(f1_scores))\n",
                "best_threshold = thresholds_to_try[best_threshold_idx]\n",
                "best_f1 = f1_scores[best_threshold_idx]\n",
                "\n",
                "# Unbiased check: apply the chosen threshold once to the held-out test probabilities\n",
                "test_f1 = f1_score(y_test, (y_prob >= best_threshold).astype(int))\n",
                "\n",
                "plt.figure(figsize=(10, 6))\n",
                "plt.plot(thresholds_to_try, f1_scores, 'r-')\n",
                "plt.axvline(x=best_threshold, color='green', linestyle='--', \n",
                "           label=f'Best Threshold = {best_threshold:.2f}, F1 = {best_f1:.2f}')\n",
                "plt.xlabel('Threshold')\n",
                "plt.ylabel('F1 Score (out-of-fold, training data)')\n",
                "plt.title('F1 Score vs. Threshold')\n",
                "plt.legend()\n",
                "plt.grid(True)\n",
                "plt.show()\n",
                "\n",
                "print(f\"Best threshold: {best_threshold:.2f}\")\n",
                "print(f\"F1 score at best threshold: {best_f1:.4f}\")\n",
                "print(f\"Test F1 score at best threshold: {test_f1:.4f}\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 16,
            "metadata": {},
            "source": [
                "# Evaluate the model with the optimized threshold applied to the test probabilities\n",
                "y_pred_optimized = (y_prob >= best_threshold).astype(int)\n",
                "\n",
                "print(\"Classification Report with Optimized Threshold:\")\n",
                "print(classification_report(y_test, y_pred_optimized))\n",
                "\n",
                "# Confusion matrix with optimized threshold (rows are actual, columns are predicted)\n",
                "class_names = ['Legitimate', 'Fraudulent']\n",
                "conf_matrix_opt = confusion_matrix(y_test, y_pred_optimized)\n",
                "\n",
                "fig, ax = plt.subplots(figsize=(8, 6))\n",
                "sns.heatmap(conf_matrix_opt, annot=True, fmt='d', cmap='Blues',\n",
                "            xticklabels=class_names, yticklabels=class_names, ax=ax)\n",
                "ax.set_title('Confusion Matrix with Optimized Threshold')\n",
                "ax.set_xlabel('Predicted')\n",
                "ax.set_ylabel('Actual')\n",
                "plt.show()"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## 6. Save the Model\n",
                "\n",
                "Let's save the trained model for later use in the MoMoGuard-GH application."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 17,
            "metadata": {},
            "source": [
                "# Create a model package with the model, optimized threshold, and feature list\n",
                "model_package = {\n",
                "    'model': best_rf_model,\n",
                "    'threshold': float(best_threshold),  # plain float rather than a NumPy scalar\n",
                "    'features': features\n",
                "}\n",
                "\n",
                "# Save the model package, creating the target directory on a fresh checkout\n",
                "model_path = Path('../models/momoguard_gh_model.pkl')\n",
                "model_path.parent.mkdir(parents=True, exist_ok=True)\n",
                "joblib.dump(model_package, model_path)\n",
                "print(\"Model package saved successfully!\")"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## 7. Example Real-time Fraud Detection Function\n",
                "\n",
                "Let's create a function that can be used in production to detect fraud in real-time."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 18,
            "metadata": {},
            "source": [
                "def detect_fraud(transaction_data, model_package):\n",
                "    \"\"\"\n",
                "    Detect potential fraud in a mobile money transaction\n",
                "    \n",
                "    Parameters:\n",
                "    transaction_data (dict): Transaction data with required fields\n",
                "    model_package (dict): Model package containing the model, threshold, and features\n",
                "    \n",
                "    Returns:\n",
                "    dict: Fraud detection results with keys 'is_fraud' (bool),\n",
                "          'fraud_probability' (float) and 'threshold' (float)\n",
                "    \n",
                "    Raises:\n",
                "    ValueError: If transaction_data lacks a field needed to build the model features\n",
                "    \"\"\"\n",
                "    # Extract model components\n",
                "    model = model_package['model']\n",
                "    threshold = model_package['threshold']\n",
                "    features = model_package['features']\n",
                "    \n",
                "    # Apply feature engineering to transaction data\n",
                "    # (In production, this would pull additional data like user history)\n",
                "    transaction_df = pd.DataFrame([transaction_data])\n",
                "    try:\n",
                "        model_input = engineer_features(transaction_df)[features]\n",
                "    except KeyError as exc:\n",
                "        raise ValueError(f'Transaction data is missing a required field: {exc}') from exc\n",
                "    \n",
                "    # Probability of the fraud class for the single transaction row\n",
                "    fraud_probability = float(model.predict_proba(model_input)[0, 1])\n",
                "    \n",
                "    # Use the tuned threshold from the model package for the final decision\n",
                "    return {\n",
                "        'is_fraud': bool(fraud_probability >= threshold),\n",
                "        'fraud_probability': fraud_probability,\n",
                "        'threshold': float(threshold)\n",
                "    }"
            ]
        }
    ]
}

{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# MoMoGuard-GH: Fraud Detection Model Training\n',
    '\n',
    "This notebook demonstrates the process of training a machine learning model to detect fraudulent mobile money transactions. For the MVP, we'll use synthetic data that simulates typical fraud patterns in Ghana's mobile money ecosystem."]},
  {'cell_type': 'code',
   'execution_count': 1,
   'metadata': {},
   'source': ['import numpy as np\n',
    'import pandas as pd\n',
    'import matplotlib.pyplot as plt\n',
    'import seaborn as sns\n',
    'from sklearn.model_selection import train_test_split, GridSearchCV\n',
    'from sklearn.ensemble import RandomForestClassifier\n',
    'from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc\n',
    'from imblearn.over_sampling import SMOTE\n',
    'import joblib\n',
    '\n',
    '# Set random seed for reproducibility\n',
    'np.random.seed(42)']},
  {'cell_type': 'markdown',
 