# FraudShield: Fraud Detection with >95% Accuracy

**FraudShield** is a fraud detection pipeline targeting >95% accuracy, tailored for your dataset with columns: `Transaction_ID`, `User_ID`, `Transaction_Amount`, `Transaction_Type`, `Time_of_Transaction`, `Device_Used`, `Location`, `Previous_Fraudulent_Transactions`, `Account_Age`, `Number_of_Transactions_Last_24H`, `Payment_Method`, and `Fraudulent`. This notebook:
- Loads **Fraud Detection Dataset.csv** with encoding detection.
- Applies feature engineering and trains a RandomForestClassifier with SMOTE.
- Creates a Streamlit app for predictions.

## Features
- Handles missing values with imputation.
- Interactive Streamlit dashboard with visualizations.
- Downloads predictions and logs.

## Instructions
1. Ensure **Fraud Detection Dataset.csv** is at `C:\Users\heave\Downloads\Bron_Projects\%Pro\FraudShield\` (the location the loading cell reads from), or update `file_path` in that cell to point at your copy.
2. Run all cells to set up and train.
3. Run the Streamlit app from the final cell.

## Prerequisites
- Python 3.8+
- Jupyter Notebook
- Install dependencies: `pip install -r requirements.txt`

Setup Project Structure

In [1]:
import os

# Create the project directories; exist_ok makes re-running this cell harmless.
for directory in ('model_artifacts', 'predictions', 'scripts', 'notebooks', 'data'):
    os.makedirs(directory, exist_ok=True)

print('Project directories created: model_artifacts, predictions, scripts, notebooks, data')

Project directories created: model_artifacts, predictions, scripts, notebooks, data


Load and Preprocess Dataset

In [2]:
import os
import chardet
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Set the file path
file_path = r"C:\Users\heave\Downloads\Bron_Projects\%Pro\FraudShield\Fraud Detection Dataset.csv"

# Detect the encoding and load the dataset inside ONE try block. Opening the
# file for detection is the first place a missing file surfaces, so it must be
# covered by the FileNotFoundError handler for the hint below to be printed.
try:
    # chardet inspects the raw bytes; its result dict carries the guess under 'encoding'.
    with open(file_path, "rb") as f:
        result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"Detected file encoding: {encoding}")

    # Load dataset
    df = pd.read_csv(file_path, encoding=encoding)
    print(f'Dataset loaded successfully. Shape: {df.shape}')
except FileNotFoundError:
    print('Error: Fraud Detection Dataset.csv not found in the specified path. Please check and try again.')
    raise
except Exception as e:
    print(f"Error reading CSV file: {e}")
    raise

# Feature engineering with handling for missing values.
# Every per-user statistic is broadcast back onto the rows via groupby().transform().
try:
    # Inputs reused by more than one feature.
    account_age = df['Account_Age'].replace('', 1).astype(float)
    tx_last_24h = df['Number_of_Transactions_Last_24H']

    # Transactions per user, relative to the account age.
    user_tx_count = df.groupby('User_ID')['Transaction_ID'].transform('count')
    df['Transaction_Frequency'] = user_tx_count / account_age

    # Amount standardised against the same user's amounts; an undefined std
    # (NaN, e.g. a user with a single row) is replaced by 1.
    user_amount_mean = df.groupby('User_ID')['Transaction_Amount'].transform('mean')
    user_amount_std = df.groupby('User_ID')['Transaction_Amount'].transform('std').fillna(1)
    df['Amount_ZScore'] = (df['Transaction_Amount'] - user_amount_mean) / user_amount_std

    # 1 when the transaction time falls in the closed range [0, 6], else 0.
    tx_time = df['Time_of_Transaction'].replace('', 0).astype(float)
    df['Is_Night_Transaction'] = tx_time.apply(lambda t: 1 if 0 <= t <= 6 else 0)

    # Last-24h count over account age in units of 30, floored at 1.
    age_in_30s = (account_age / 30).clip(lower=1)
    df['Transaction_Velocity'] = tx_last_24h / age_in_30s

    # 1 for every row of a user seen with more than two distinct locations.
    df['Location_Anomaly'] = df.groupby('User_ID')['Location'].transform(
        lambda values: 1 if values.nunique() > 2 else 0
    )

    # Last-24h count relative to the user's own mean (mean floored at 1).
    user_mean_24h = df.groupby('User_ID')['Number_of_Transactions_Last_24H'].transform('mean').clip(lower=1)
    df['Transaction_Acceleration'] = tx_last_24h / user_mean_24h

    # 1 for every row of a user seen with more than two distinct devices.
    df['Device_Anomaly'] = df.groupby('User_ID')['Device_Used'].transform(
        lambda values: 1 if values.nunique() > 2 else 0
    )
except Exception as e:
    print(f"Error during feature engineering: {e}")
    raise

# Remove duplicates
df = df.drop_duplicates()

# Save test dataset (20% sample, without target column).
# NOTE(review): the sample is drawn from the same frame the training cell
# later uses in full, so these rows are not unseen data for the model.
try:
    has_target = 'Fraudulent' in df.columns

    test_df = df.sample(frac=0.2, random_state=42).copy()
    if has_target:
        test_df = test_df.drop(columns='Fraudulent')

    output_path = os.path.join("data", "sample_transactions.csv")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    test_df.to_csv(output_path, index=False)

    print(f'Test dataset saved to {output_path} with {len(test_df)} samples')

    if not has_target:
        print("Warning: 'Fraudulent' column not found. Training will be skipped.")
    else:
        fraud_rate = df["Fraudulent"].mean() * 100
        print(f'Training dataset has {len(df)} samples, {df["Fraudulent"].sum()} frauds ({fraud_rate:.2f}% fraud rate)')
except Exception as e:
    print(f"Error saving test dataset: {e}")
    raise

  from pandas.core import (


Detected file encoding: ascii
Dataset loaded successfully. Shape: (51000, 12)
Test dataset saved to data\sample_transactions.csv with 10024 samples
Training dataset has 50119 samples, 2467 frauds (4.92% fraud rate)


Train and Tune Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve, roc_curve
from imblearn.over_sampling import SMOTE
from joblib import dump
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

if 'Fraudulent' not in df.columns:
    print("Error: 'Fraudulent' column is missing. Please ensure the dataset includes the target column and rerun.")
else:
    # Local import so the metadata timestamp does not depend on another cell's imports.
    from datetime import datetime, timezone

    # Preprocess data
    categorical_cols = ['Transaction_Type', 'Device_Used', 'Location', 'Payment_Method']
    # NOTE(review): User_ID is scaled and fed to the model as a numeric feature;
    # confirm an identifier is really meant to be a predictor.
    numerical_cols = ['User_ID', 'Transaction_Amount', 'Time_of_Transaction', 'Previous_Fraudulent_Transactions', 
                      'Account_Age', 'Number_of_Transactions_Last_24H', 'Transaction_Frequency', 'Amount_ZScore', 
                      'Is_Night_Transaction', 'Transaction_Velocity', 'Location_Anomaly', 'Transaction_Acceleration', 'Device_Anomaly']

    # The engineered ratios divide by Account_Age and by per-user statistics, so a
    # zero denominator yields +/-inf. Turn those into NaN first so they are imputed
    # like any other gap instead of poisoning the column means and the scaler.
    df[numerical_cols] = df[numerical_cols].replace([float('inf'), float('-inf')], float('nan'))

    # Imputation for missing values: fixed defaults first, then column means /
    # modes for whatever is still missing. The means and modes are saved below
    # so inference can fill gaps with the same values.
    df = df.fillna({
        'Transaction_Type': 'Unknown',
        'Device_Used': 'Unknown',
        'Location': 'Unknown',
        'Payment_Method': 'Unknown',
        'Time_of_Transaction': 0,
        'Account_Age': 1
    })
    precomputed_means = df[numerical_cols].mean()
    precomputed_modes = df[categorical_cols].mode().iloc[0]
    df = df.fillna(precomputed_means).fillna(precomputed_modes)

    # Encode categorical variables (one fitted LabelEncoder per column, kept for inference)
    encoders = {}
    for col in categorical_cols:
        if col in df.columns:
            encoders[col] = LabelEncoder().fit(df[col].astype(str))
            df[col] = encoders[col].transform(df[col].astype(str))

    # Scale numerical features.
    # NOTE(review): the scaler (like the imputation statistics above) is fitted on
    # the full frame before the train/test split, so the held-out rows influence it.
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Features and target
    feature_names = [col for col in numerical_cols + categorical_cols if col in df.columns]
    X = df[feature_names]
    y = df['Fraudulent']

    # Split data (stratified so both parts keep the original fraud rate)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Apply SMOTE to the training part only; the test part keeps its real class balance.
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Hyperparameter tuning
    # NOTE(review): the grid search cross-validates on data that was already
    # oversampled, so synthetic points can sit in both the fit and validation
    # folds. Consider moving SMOTE into an imblearn Pipeline so it is applied
    # inside each fold only.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'class_weight': ['balanced', None]
    }
    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_smote, y_train_smote)

    # Best model
    best_model = grid_search.best_estimator_
    print(f'Best parameters: {grid_search.best_params_}')

    # Evaluate on test set; the fraud-class probabilities are computed once and
    # reused by every metric and plot below.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {f1:.4f}')
    print(f'ROC-AUC: {roc_auc:.4f}')

    # Visualizations
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

    precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_proba)
    plt.figure(figsize=(6, 4))
    plt.plot(recall_vals, precision_vals, label='Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.show()

    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': best_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.show()

    # Save artifacts (everything inference needs to reproduce this preprocessing)
    dump(best_model, 'model_artifacts/ensemble_model.joblib')
    dump(feature_names, 'model_artifacts/feature_names.joblib')
    dump(scaler, 'model_artifacts/scaler.joblib')
    dump(precomputed_means, 'model_artifacts/precomputed_means.joblib')
    dump(precomputed_modes, 'model_artifacts/precomputed_modes.joblib')
    for col, encoder in encoders.items():
        dump(encoder, f'model_artifacts/encoder_{col}.joblib')

    metadata = {
        'Model': 'RandomForestClassifier',
        'Version': '4.0',
        # Record when this run actually finished (UTC) rather than a fixed literal.
        'Training_Date': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M GMT'),
        'Accuracy': float(accuracy),
        # Accuracy alone says little on a heavily imbalanced target, so the
        # other held-out metrics are stored alongside it.
        'Precision': float(precision),
        'Recall': float(recall),
        'F1_Score': float(f1),
        'ROC_AUC': float(roc_auc),
        'Best_Parameters': grid_search.best_params_,
        'Feature_Importance': feature_importance.to_dict(orient='records')
    }
    with open('model_artifacts/model_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=4)

    print('Model and artifacts saved to model_artifacts/')
    if accuracy < 0.95:
        print('Warning: Accuracy below 95%. Consider expanding param_grid or adding features.')

CREATE PREDICTION LOGIC

In [None]:
import logging
from datetime import datetime
from joblib import load

# Route log records (INFO and above) to a file in the current working
# directory; each record is written as "<timestamp> - <level> - <message>".
logging.basicConfig(
    filename='fraud_detection_predict.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Directory the prediction code loads the model, scaler, encoders and metadata from.
MODEL_DIR = 'model_artifacts'
# Directory the JSON / Excel prediction exports are written to.
PREDICTIONS_DIR = 'predictions'

def load_data(file=None):
    """Load a transactions table from a path, a file-like object, or the sample file.

    Parameters
    ----------
    file : str, os.PathLike, file-like object with a ``name`` attribute, or None
        Source to read. The format is chosen from the (case-insensitive)
        extension: ``.csv``, ``.json``, ``.xlsx`` or ``.xls``. ``None`` loads
        ``data/sample_transactions.csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded table.

    Raises
    ------
    ValueError
        If the extension is not one of the supported formats.
    Exception
        Any reader error is logged and re-raised unchanged.
    """
    import os
    import pandas as pd

    if file is None:
        logging.info('Loading sample dataset.')
        return pd.read_csv('data/sample_transactions.csv')

    try:
        is_path = isinstance(file, (str, os.PathLike))
        name = (os.fspath(file) if is_path else file.name).lower()

        if name.endswith('.csv'):
            # Encoding detection is only meaningful for text input, so chardet is
            # imported and run for CSV alone.
            import chardet
            if is_path:
                with open(file, "rb") as f:
                    raw = f.read()
            else:
                # A file object's ``name`` is not necessarily a path on disk (an
                # in-memory upload only carries its original file name), so sniff
                # the object's own bytes and rewind it for the actual parse.
                raw = file.read()
                file.seek(0)
            # chardet needs bytes; a text-mode stream is already decoded.
            encoding = chardet.detect(raw)['encoding'] if isinstance(raw, (bytes, bytearray)) else None
            df = pd.read_csv(file, encoding=encoding)
        elif name.endswith('.json'):
            df = pd.read_json(file)
        elif name.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file)
        else:
            raise ValueError('Unsupported file format. Use CSV, JSON, or Excel.')

        logging.info(f'Dataset loaded successfully from {file}. Shape: {df.shape}')
        return df
    except Exception as e:
        logging.error(f'Error loading file {file}: {str(e)}')
        raise

def preprocess_data(df, encoders, feature_names, scaler, precomputed_means, precomputed_modes):
    """Turn raw transactions into the feature matrix expected by the trained model.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw transactions. It is copied, so the caller's frame is left untouched.
    encoders : dict
        Column name -> fitted encoder exposing ``classes_`` and ``transform``.
    feature_names : list of str
        Columns, in order, that make up the returned matrix.
    scaler : fitted scaler
        Applied to the columns it was fitted on (``feature_names_in_`` when present).
    precomputed_means, precomputed_modes : pandas.Series
        Training-time fill values for missing numeric / categorical entries.

    Returns
    -------
    (pandas.DataFrame, sequence)
        ``df[feature_names]`` after imputation, encoding and scaling, plus the
        transaction ids (generated as ``T0``, ``T1``, ... when the input has no
        ``Transaction_ID`` column).

    Raises
    ------
    Exception
        Any failure is logged and re-raised unchanged.
    """
    import pandas as pd

    logging.info('Starting data preprocessing...')
    try:
        # Work on a copy so the engineered columns do not leak into the caller's frame.
        df = df.copy()
        has_ids = 'Transaction_ID' in df.columns

        account_age = df['Account_Age'].replace('', 1).astype(float)
        if has_ids:
            tx_count = df.groupby('User_ID')['Transaction_ID'].transform('count')
        else:
            # No ids to count: fall back to rows per user so the id-less input
            # supported further down can actually get this far.
            tx_count = df['User_ID'].map(df['User_ID'].value_counts())

        df['Transaction_Frequency'] = tx_count / account_age
        df['Amount_ZScore'] = (df['Transaction_Amount'] - df.groupby('User_ID')['Transaction_Amount'].transform('mean')) / df.groupby('User_ID')['Transaction_Amount'].transform('std').fillna(1)
        df['Is_Night_Transaction'] = df['Time_of_Transaction'].replace('', 0).astype(float).apply(lambda x: 1 if 0 <= x <= 6 else 0)
        df['Transaction_Velocity'] = df['Number_of_Transactions_Last_24H'] / (account_age / 30).clip(lower=1)
        df['Location_Anomaly'] = df.groupby('User_ID')['Location'].transform(lambda x: 1 if x.nunique() > 2 else 0)
        df['Transaction_Acceleration'] = df['Number_of_Transactions_Last_24H'] / df.groupby('User_ID')['Number_of_Transactions_Last_24H'].transform('mean').clip(lower=1)
        df['Device_Anomaly'] = df.groupby('User_ID')['Device_Used'].transform(lambda x: 1 if x.nunique() > 2 else 0)

        # A zero denominator in the ratios above yields +/-inf; treat it as
        # missing so it is imputed instead of being passed on to the scaler.
        numeric_now = df.select_dtypes(include='number').columns
        df[numeric_now] = df[numeric_now].replace([float('inf'), float('-inf')], float('nan'))

        df = df.fillna({
            'Transaction_Type': 'Unknown',
            'Device_Used': 'Unknown',
            'Location': 'Unknown',
            'Payment_Method': 'Unknown',
            'Time_of_Transaction': 0,
            'Account_Age': 1
        })
        df = df.fillna(precomputed_means).fillna(precomputed_modes)
        logging.info('Missing values filled with precomputed means and modes.')

        for col, encoder in encoders.items():
            if col not in df.columns:
                continue
            values = df[col].astype(str)
            unknown = ~values.isin(encoder.classes_)
            if unknown.any():
                logging.warning(f'{int(unknown.sum())} unknown categories in {col}. Using default encoding.')
                # Prefer the explicit 'Unknown' bucket when the encoder has one;
                # otherwise fall back to the encoder's first class.
                fallback = 'Unknown' if 'Unknown' in encoder.classes_ else encoder.classes_[0]
                values = values.where(~unknown, fallback)
            df[col] = encoder.transform(values)

        # Scale exactly the columns the scaler was fitted on. Picking columns by
        # dtype also selects the label-encoded categorical columns whenever they
        # come back as int64, handing the scaler columns it was never fitted on.
        scaled_cols = [str(col) for col in getattr(scaler, 'feature_names_in_', [])]
        if not scaled_cols:
            # Scaler carries no column names: use numeric feature columns that
            # are not encoder outputs.
            scaled_cols = [col for col in feature_names
                           if col in df.columns and col not in encoders
                           and pd.api.types.is_numeric_dtype(df[col])]

        expected = list(dict.fromkeys([*scaled_cols, *feature_names]))
        missing = [col for col in expected if col not in df.columns]
        for col in missing:
            # Placeholder so the scaler receives its full column set; the final
            # default is written after scaling.
            df[col] = float('nan')

        if scaled_cols:
            df[scaled_cols] = scaler.transform(df[scaled_cols])
            logging.info(f'Scaled numerical columns: {scaled_cols}')

        for col in missing:
            df[col] = 0
            logging.warning(f'Added missing feature {col} with default value 0.')

        if has_ids:
            transaction_ids = df['Transaction_ID']
        else:
            transaction_ids = [f'T{i}' for i in range(len(df))]

        df = df[feature_names]
        logging.info('Data preprocessing completed.')
        return df, transaction_ids
    except Exception as e:
        logging.error(f'Data preprocessing failed: {str(e)}')
        raise

def predict_fraud(df=None):
    """Score transactions with the saved model and export the predictions.

    Parameters
    ----------
    df : pandas.DataFrame, str, os.PathLike or None
        Transactions to score. A path is loaded through ``load_data``;
        ``None`` loads the sample dataset.

    Returns
    -------
    (results, json_path, excel_path, metadata)
        ``results`` has one row per transaction with ``Transaction_ID``,
        ``Fraud_Prediction`` and ``Fraud_Probability`` (rounded to 4 places);
        the two paths point at the exports written under ``PREDICTIONS_DIR``;
        ``metadata`` is the content of ``model_metadata.json`` or a placeholder
        dict when that file is absent.

    Raises
    ------
    FileNotFoundError
        If a required artifact is missing from ``MODEL_DIR``.
    Exception
        Any other failure is logged and re-raised unchanged.
    """
    # Local imports so the function does not rely on names from other cells.
    import json
    import os
    import pandas as pd

    try:
        encoder_cols = ['Transaction_Type', 'Device_Used', 'Location', 'Payment_Method']
        required_files = [
            'ensemble_model.joblib', 'feature_names.joblib', 'scaler.joblib',
            'precomputed_means.joblib', 'precomputed_modes.joblib',
            *[f'encoder_{col}.joblib' for col in encoder_cols]
        ]
        # Check everything up front so the error names the missing artifact.
        for file_name in required_files:
            if not os.path.exists(os.path.join(MODEL_DIR, file_name)):
                raise FileNotFoundError(f'Required file {file_name} not found in {MODEL_DIR}.')

        model = load(os.path.join(MODEL_DIR, 'ensemble_model.joblib'))
        feature_names = load(os.path.join(MODEL_DIR, 'feature_names.joblib'))
        scaler = load(os.path.join(MODEL_DIR, 'scaler.joblib'))
        precomputed_means = load(os.path.join(MODEL_DIR, 'precomputed_means.joblib'))
        precomputed_modes = load(os.path.join(MODEL_DIR, 'precomputed_modes.joblib'))
        encoders = {
            col: load(os.path.join(MODEL_DIR, f'encoder_{col}.joblib'))
            for col in encoder_cols
        }

        # The metadata file is optional; fall back to placeholders without it.
        metadata_path = os.path.join(MODEL_DIR, 'model_metadata.json')
        metadata = {'Model': 'Unknown', 'Version': 'Unknown', 'Training_Date': 'Unknown', 'Accuracy': 0.0}
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)

        if df is None:
            df = load_data()
        elif isinstance(df, (str, os.PathLike)):
            # Callers may pass a file path instead of a frame; load it first so
            # preprocessing always receives a DataFrame.
            df = load_data(os.fspath(df))

        X, transaction_ids = preprocess_data(df, encoders, feature_names, scaler, precomputed_means, precomputed_modes)

        predictions = model.predict(X)
        fraud_probabilities = model.predict_proba(X)[:, 1]
        logging.info(f'Predicted {sum(predictions)} fraudulent transactions out of {len(predictions)}.')

        results = pd.DataFrame({
            'Transaction_ID': transaction_ids,
            'Fraud_Prediction': ['Fraudulent' if pred == 1 else 'Legitimate' for pred in predictions],
            'Fraud_Probability': fraud_probabilities
        })
        results['Fraud_Probability'] = results['Fraud_Probability'].round(4)

        os.makedirs(PREDICTIONS_DIR, exist_ok=True)
        # One timestamp for both exports so the pair always shares a file stem.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        json_path = os.path.join(PREDICTIONS_DIR, f'predictions_{stamp}.json')
        excel_path = os.path.join(PREDICTIONS_DIR, f'predictions_{stamp}.xlsx')
        results.to_json(json_path, orient='records', indent=4)
        results.to_excel(excel_path, index=False)

        return results, json_path, excel_path, metadata
    except FileNotFoundError as e:
        logging.error(f'Model or artifact files not found: {str(e)}')
        print(f'Error: {str(e)}. Run the training cell to generate artifacts.')
        raise
    except Exception as e:
        logging.error(f'Fraud prediction failed: {str(e)}')
        raise

# Test prediction
# predict_fraud() hands its argument to preprocess_data(), which needs a
# DataFrame, so load the sample file first rather than passing the path string.
sample_df = load_data('data/sample_transactions.csv')
results, json_path, excel_path, metadata = predict_fraud(sample_df)
print('Prediction Results:')
display(results.head())
print(f'Results saved to {json_path} and {excel_path}')
print('Metadata:', metadata)

CREATE STREAMLIT APP

In [None]:
# Source of the Streamlit dashboard, kept as one string so a later cell can
# write it to scripts/app.py. Fixes relative to the earlier revision: uploaded
# CSVs are sniffed from their in-memory bytes, the scaler only receives the
# columns it was fitted on, the sidebar filters drive a displayed table, and
# download payloads are read with context managers.
app_code = '''
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import logging
import json
import os
from datetime import datetime, timedelta
from joblib import load

# Configure logging
logging.basicConfig(
    filename='fraud_detection_predict.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

MODEL_DIR = os.getenv('MODEL_DIR', 'model_artifacts')
PREDICTIONS_DIR = os.getenv('PREDICTIONS_DIR', 'predictions')
RETENTION_DAYS = 7

st.set_page_config(page_title='FraudShield Dashboard', layout='wide', page_icon='🛡️')

def cleanup_old_files(directory, retention_days=RETENTION_DAYS):
    """Delete files in *directory* older than *retention_days* and truncate a stale log file."""
    try:
        now = datetime.now()
        max_age = timedelta(days=retention_days)
        # Nothing to clean before the first prediction has created the directory.
        if os.path.isdir(directory):
            for filename in os.listdir(directory):
                file_path = os.path.join(directory, filename)
                if os.path.isfile(file_path):
                    file_mtime = datetime.fromtimestamp(os.path.getmtime(file_path))
                    if now - file_mtime > max_age:
                        os.remove(file_path)
                        logging.info(f'Deleted old file: {file_path}')
        log_file = 'fraud_detection_predict.log'
        if os.path.exists(log_file):
            log_mtime = datetime.fromtimestamp(os.path.getmtime(log_file))
            if now - log_mtime > max_age:
                with open(log_file, 'w'):
                    pass
                logging.info('Truncated old log file.')
    except Exception as e:
        logging.error(f'Failed to clean up old files: {str(e)}')
        st.warning(f'Could not clean up old files: {str(e)}')

def load_data(file=None):
    """Load transactions from a path, an uploaded file object, or the sample file.

    Returns a DataFrame, or None (after reporting through st.error) when a
    provided file cannot be loaded.
    """
    if file is None:
        logging.info('Loading sample dataset.')
        return pd.read_csv('data/sample_transactions.csv')

    try:
        is_path = isinstance(file, (str, os.PathLike))
        name = (os.fspath(file) if is_path else file.name).lower()

        if name.endswith('.csv'):
            # Encoding detection only matters for text input.
            import chardet
            if is_path:
                with open(file, "rb") as f:
                    raw = f.read()
            else:
                # An upload lives in memory and its name is only the original
                # file name, not a path on disk: sniff the object's own bytes
                # and rewind it for the actual parse.
                raw = file.read()
                file.seek(0)
            encoding = chardet.detect(raw)['encoding'] if isinstance(raw, (bytes, bytearray)) else None
            df = pd.read_csv(file, encoding=encoding)
        elif name.endswith('.json'):
            df = pd.read_json(file)
        elif name.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file)
        else:
            raise ValueError('Unsupported file format. Use CSV, JSON, or Excel.')

        logging.info(f'Dataset loaded successfully from {file}. Shape: {df.shape}')
        return df
    except Exception as e:
        logging.error(f'Error loading file {file}: {str(e)}')
        st.error(f'Error loading file: {str(e)}')
        return None

def preprocess_data(df, encoders, feature_names, scaler, precomputed_means, precomputed_modes):
    """Build the model's feature matrix from raw transactions.

    Returns (features, transaction_ids), or (None, None) after reporting the
    failure through st.error.
    """
    logging.info('Starting data preprocessing...')
    try:
        # Work on a copy so the engineered columns do not leak into the caller's frame.
        df = df.copy()
        has_ids = 'Transaction_ID' in df.columns

        account_age = df['Account_Age'].replace('', 1).astype(float)
        if has_ids:
            tx_count = df.groupby('User_ID')['Transaction_ID'].transform('count')
        else:
            # No ids to count: fall back to rows per user so id-less input can be scored.
            tx_count = df['User_ID'].map(df['User_ID'].value_counts())

        df['Transaction_Frequency'] = tx_count / account_age
        df['Amount_ZScore'] = (df['Transaction_Amount'] - df.groupby('User_ID')['Transaction_Amount'].transform('mean')) / df.groupby('User_ID')['Transaction_Amount'].transform('std').fillna(1)
        df['Is_Night_Transaction'] = df['Time_of_Transaction'].replace('', 0).astype(float).apply(lambda x: 1 if 0 <= x <= 6 else 0)
        df['Transaction_Velocity'] = df['Number_of_Transactions_Last_24H'] / (account_age / 30).clip(lower=1)
        df['Location_Anomaly'] = df.groupby('User_ID')['Location'].transform(lambda x: 1 if x.nunique() > 2 else 0)
        df['Transaction_Acceleration'] = df['Number_of_Transactions_Last_24H'] / df.groupby('User_ID')['Number_of_Transactions_Last_24H'].transform('mean').clip(lower=1)
        df['Device_Anomaly'] = df.groupby('User_ID')['Device_Used'].transform(lambda x: 1 if x.nunique() > 2 else 0)

        # A zero denominator above yields +/-inf; treat it as missing so it is imputed.
        numeric_now = df.select_dtypes(include='number').columns
        df[numeric_now] = df[numeric_now].replace([float('inf'), float('-inf')], float('nan'))

        df = df.fillna({
            'Transaction_Type': 'Unknown',
            'Device_Used': 'Unknown',
            'Location': 'Unknown',
            'Payment_Method': 'Unknown',
            'Time_of_Transaction': 0,
            'Account_Age': 1
        })
        df = df.fillna(precomputed_means).fillna(precomputed_modes)
        logging.info('Missing values filled with precomputed means and modes.')

        for col, encoder in encoders.items():
            if col not in df.columns:
                continue
            values = df[col].astype(str)
            unknown = ~values.isin(encoder.classes_)
            if unknown.any():
                logging.warning(f'{int(unknown.sum())} unknown categories in {col}. Using default encoding.')
                # Prefer the explicit 'Unknown' bucket when the encoder has one.
                fallback = 'Unknown' if 'Unknown' in encoder.classes_ else encoder.classes_[0]
                values = values.where(~unknown, fallback)
            df[col] = encoder.transform(values)

        # Scale exactly the columns the scaler was fitted on. Choosing columns by
        # dtype also picks up the label-encoded categorical columns whenever they
        # come back as int64, which the scaler was never fitted on.
        scaled_cols = [str(col) for col in getattr(scaler, 'feature_names_in_', [])]
        if not scaled_cols:
            scaled_cols = [col for col in feature_names
                           if col in df.columns and col not in encoders
                           and pd.api.types.is_numeric_dtype(df[col])]

        expected = list(dict.fromkeys([*scaled_cols, *feature_names]))
        missing = [col for col in expected if col not in df.columns]
        for col in missing:
            # Placeholder so the scaler receives its full column set.
            df[col] = float('nan')

        if scaled_cols:
            df[scaled_cols] = scaler.transform(df[scaled_cols])
            logging.info(f'Scaled numerical columns: {scaled_cols}')

        for col in missing:
            df[col] = 0
            logging.warning(f'Added missing feature {col} with default value 0.')

        if has_ids:
            transaction_ids = df['Transaction_ID']
        else:
            transaction_ids = [f'T{i}' for i in range(len(df))]

        df = df[feature_names]
        logging.info('Data preprocessing completed.')
        return df, transaction_ids
    except Exception as e:
        logging.error(f'Data preprocessing failed: {str(e)}')
        st.error(f'Data preprocessing failed: {str(e)}')
        return None, None

def predict_fraud(df=None):
    """Score transactions and export the results.

    Returns (results, json_path, excel_path, metadata), or four None values
    after reporting the failure through st.error.
    """
    try:
        encoder_cols = ['Transaction_Type', 'Device_Used', 'Location', 'Payment_Method']
        required_files = [
            'ensemble_model.joblib', 'feature_names.joblib', 'scaler.joblib',
            'precomputed_means.joblib', 'precomputed_modes.joblib',
            *[f'encoder_{col}.joblib' for col in encoder_cols]
        ]
        for file_name in required_files:
            if not os.path.exists(os.path.join(MODEL_DIR, file_name)):
                raise FileNotFoundError(f'Required file {file_name} not found in {MODEL_DIR}.')

        model = load(os.path.join(MODEL_DIR, 'ensemble_model.joblib'))
        feature_names = load(os.path.join(MODEL_DIR, 'feature_names.joblib'))
        scaler = load(os.path.join(MODEL_DIR, 'scaler.joblib'))
        precomputed_means = load(os.path.join(MODEL_DIR, 'precomputed_means.joblib'))
        precomputed_modes = load(os.path.join(MODEL_DIR, 'precomputed_modes.joblib'))
        encoders = {
            col: load(os.path.join(MODEL_DIR, f'encoder_{col}.joblib'))
            for col in encoder_cols
        }

        metadata_path = os.path.join(MODEL_DIR, 'model_metadata.json')
        metadata = {'Model': 'Unknown', 'Version': 'Unknown', 'Training_Date': 'Unknown', 'Accuracy': 0.0}
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)

        if df is None or isinstance(df, (str, os.PathLike)):
            # None loads the sample; a path is loaded before preprocessing.
            df = load_data(df)
            if df is None:
                return None, None, None, None

        X, transaction_ids = preprocess_data(df, encoders, feature_names, scaler, precomputed_means, precomputed_modes)
        if X is None:
            return None, None, None, None

        predictions = model.predict(X)
        fraud_probabilities = model.predict_proba(X)[:, 1]
        logging.info(f'Predicted {sum(predictions)} fraudulent transactions out of {len(predictions)}.')

        results = pd.DataFrame({
            'Transaction_ID': transaction_ids,
            'Fraud_Prediction': ['Fraudulent' if pred == 1 else 'Legitimate' for pred in predictions],
            'Fraud_Probability': fraud_probabilities
        })
        results['Fraud_Probability'] = results['Fraud_Probability'].round(4)

        os.makedirs(PREDICTIONS_DIR, exist_ok=True)
        # One timestamp for both exports so the pair always shares a file stem.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        json_path = os.path.join(PREDICTIONS_DIR, f'predictions_{stamp}.json')
        excel_path = os.path.join(PREDICTIONS_DIR, f'predictions_{stamp}.xlsx')
        results.to_json(json_path, orient='records', indent=4)
        results.to_excel(excel_path, index=False)

        return results, json_path, excel_path, metadata
    except FileNotFoundError as e:
        logging.error(f'Model or artifact files not found: {str(e)}')
        st.error(f'Error: {str(e)}. Run the training cell to generate artifacts.')
        return None, None, None, None
    except Exception as e:
        logging.error(f'Fraud prediction failed: {str(e)}')
        st.error(f'Prediction failed: {str(e)}')
        return None, None, None, None

def main():
    st.title('🛡️ FraudShield Dashboard')
    st.markdown('Upload a transaction dataset (CSV, JSON, or Excel) or use the sample dataset to predict fraudulent transactions.')

    cleanup_old_files(PREDICTIONS_DIR)

    st.sidebar.header('Data Input & Filters')
    uploaded_file = st.sidebar.file_uploader('Upload Transactions', type=['csv', 'json', 'xlsx', 'xls'])

    df = load_data(uploaded_file)
    if df is None:
        return

    results, json_path, excel_path, metadata = predict_fraud(df)
    if results is None:
        return

    # markdown() rather than text(): the labels use ** for bold.
    st.sidebar.header('Model Metadata')
    st.sidebar.markdown(f'**Model**: {metadata.get("Model", "Unknown")}')
    st.sidebar.markdown(f'**Version**: {metadata.get("Version", "Unknown")}')
    st.sidebar.markdown(f'**Training Date**: {metadata.get("Training_Date", "Unknown")}')
    st.sidebar.markdown(f'**Accuracy**: {metadata.get("Accuracy", 0.0):.4f}')

    st.header('Prediction Summary')
    total = len(results)
    fraud_count = len(results[results['Fraud_Prediction'] == 'Fraudulent'])
    # Guard the percentage and the mean against an empty upload.
    fraud_share = fraud_count / total * 100 if total else 0.0
    mean_probability = results['Fraud_Probability'].mean() if total else 0.0
    col1, col2, col3 = st.columns(3)
    col1.metric('Total Transactions', total)
    col2.metric('Fraudulent Transactions', f'{fraud_count} ({fraud_share:.2f}%)')
    col3.metric('Average Fraud Probability', f'{mean_probability:.4f}')

    st.sidebar.subheader('Filter Results')
    prediction_filter = st.sidebar.selectbox('Filter by Prediction', ['All', 'Fraudulent', 'Legitimate'])
    probability_threshold = st.sidebar.slider('Fraud Probability Threshold', 0.0, 1.0, 0.5, 0.01)

    filtered_results = results.copy()
    if prediction_filter != 'All':
        filtered_results = filtered_results[filtered_results['Fraud_Prediction'] == prediction_filter]
    filtered_results = filtered_results[filtered_results['Fraud_Probability'] >= probability_threshold]

    # Show what the sidebar filters selected; without this table the widgets have no visible effect.
    st.header('Filtered Transactions')
    st.caption(f'{len(filtered_results)} of {total} transactions match the sidebar filters.')
    st.dataframe(filtered_results, use_container_width=True)

    st.header('Top 5 Risky Transactions')
    top_risky = results.sort_values(by='Fraud_Probability', ascending=False).head(5)
    st.dataframe(top_risky, use_container_width=True)

    # Read the exports with context managers so the handles are closed.
    with open(json_path, 'r') as f:
        json_payload = f.read()
    with open(excel_path, 'rb') as f:
        excel_payload = f.read()

    st.download_button(
        label='Download JSON Results',
        data=json_payload,
        file_name=os.path.basename(json_path),
        mime='application/json'
    )
    st.download_button(
        label='Download Excel Results',
        data=excel_payload,
        file_name=os.path.basename(excel_path),
        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )

if __name__ == '__main__':
    main()
'''

# Write the app explicitly as UTF-8. The source contains non-ASCII characters
# (the shield emoji in the page icon and title), which a platform default
# encoding such as cp1252 cannot encode.
with open('scripts/app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print('Streamlit app saved to scripts/app.py')

CREATE PREDICTION SCRIPT

In [None]:
predict_fraud_code = '''
import logging
from datetime import datetime
from joblib import load
import pandas as pd
import os
import chardet

logging.basicConfig(
    filename='fraud_detection_predict.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

MODEL_DIR = 'model_artifacts'
PREDICTIONS_DIR = 'predictions'

def load_data(file=None):
    """Load a transactions dataset into a DataFrame.

    Args:
        file: A filesystem path (str), a file-like object exposing ``name``,
            ``read`` and ``seek`` (for example an uploaded file), or None to
            load ``data/sample_transactions.csv``.

    Returns:
        pandas.DataFrame holding the contents of the source.

    Raises:
        FileNotFoundError: If ``file`` is a path that does not exist.
        ValueError: If the extension is not CSV, JSON or Excel.
        Exception: Any other read or parse error is logged and re-raised.
    """
    if file is None:
        logging.info('Loading sample dataset.')
        return pd.read_csv('data/sample_transactions.csv')

    try:
        is_path = isinstance(file, str)
        if is_path and not os.path.exists(file):
            raise FileNotFoundError(f'No such file: {file}')

        # Compare extensions case-insensitively so names like DATA.CSV work.
        name = (file if is_path else file.name).lower()

        if name.endswith('.csv'):
            # Only CSV needs encoding detection. Sniff the bytes of the real
            # source: for file-like objects the name is just a label and need
            # not exist on disk, so read from the object and rewind it.
            if is_path:
                with open(file, "rb") as f:
                    raw = f.read()
            else:
                raw = file.read()
                file.seek(0)
            # A text-mode handle yields str, which needs no decoding.
            encoding = chardet.detect(raw)['encoding'] if isinstance(raw, bytes) else None
            df = pd.read_csv(file, encoding=encoding)
        elif name.endswith('.json'):
            df = pd.read_json(file)
        elif name.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file)
        else:
            raise ValueError('Unsupported file format. Use CSV, JSON, or Excel.')

        logging.info(f'Dataset loaded successfully from {file}. Shape: {df.shape}')
        return df
    except Exception as e:
        logging.error(f'Error loading file {file}: {str(e)}')
        raise

def preprocess_data(df, encoders, feature_names, scaler, precomputed_means, precomputed_modes):
    """Turn raw transactions into the feature matrix passed to the model.

    Steps, in order: derive per-user engineered features, fill missing
    values, encode the categorical columns, scale the numeric feature
    columns, add any absent feature as 0, and select ``feature_names``.

    Args:
        df: Raw transactions. It is not modified; work happens on a copy.
        encoders: Mapping of column name to a fitted encoder exposing
            ``transform`` and ``classes_``.
        feature_names: Ordered column names of the returned matrix.
        scaler: Fitted scaler exposing ``transform``; only used when at least
            one feature column has dtype int64 or float64.
        precomputed_means: Fill values applied to remaining gaps first.
        precomputed_modes: Fill values applied to gaps still left after that.

    Returns:
        Tuple ``(features, transaction_ids)``: the DataFrame restricted to
        ``feature_names``, and the ``Transaction_ID`` column when present or
        generated labels ``T0``, ``T1``, ... otherwise.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    logging.info('Starting data preprocessing...')
    try:
        # Work on a copy so the engineered columns added below never leak
        # into the DataFrame owned by the caller.
        df = df.copy()

        # Blank ages count as 1; the same series feeds two features below.
        account_age = df['Account_Age'].replace('', 1).astype(float)

        # Count Transaction_IDs per user when they exist; otherwise count the
        # user's rows, so input without IDs reaches the synthetic-ID fallback
        # further down instead of failing here.
        count_col = 'Transaction_ID' if 'Transaction_ID' in df.columns else 'User_ID'
        df['Transaction_Frequency'] = df.groupby('User_ID')[count_col].transform('count') / account_age
        df['Amount_ZScore'] = (df['Transaction_Amount'] - df.groupby('User_ID')['Transaction_Amount'].transform('mean')) / df.groupby('User_ID')['Transaction_Amount'].transform('std').fillna(1)
        df['Is_Night_Transaction'] = df['Time_of_Transaction'].replace('', 0).astype(float).apply(lambda x: 1 if 0 <= x <= 6 else 0)
        df['Transaction_Velocity'] = df['Number_of_Transactions_Last_24H'] / (account_age / 30).clip(lower=1)
        df['Location_Anomaly'] = df.groupby('User_ID')['Location'].transform(lambda x: 1 if x.nunique() > 2 else 0)
        df['Transaction_Acceleration'] = df['Number_of_Transactions_Last_24H'] / df.groupby('User_ID')['Number_of_Transactions_Last_24H'].transform('mean').clip(lower=1)
        df['Device_Anomaly'] = df.groupby('User_ID')['Device_Used'].transform(lambda x: 1 if x.nunique() > 2 else 0)

        # Fixed defaults first, then the values saved at training time.
        df = df.fillna({
            'Transaction_Type': 'Unknown',
            'Device_Used': 'Unknown',
            'Location': 'Unknown',
            'Payment_Method': 'Unknown',
            'Time_of_Transaction': 0,
            'Account_Age': 1
        })
        df = df.fillna(precomputed_means).fillna(precomputed_modes)
        logging.info('Missing values filled with precomputed means and modes.')

        for col, encoder in encoders.items():
            if col not in df.columns:
                continue
            values = df[col].astype(str)
            try:
                df[col] = encoder.transform(values)
            except ValueError:
                # Categories the encoder has never seen are mapped to its
                # first known class before encoding.
                unseen = ~values.isin(encoder.classes_)
                unknown_count = int(unseen.sum())
                logging.warning(f'{unknown_count} unknown categories in {col}. Using default encoding.')
                df[col] = encoder.transform(values.where(~unseen, encoder.classes_[0]))

        numerical_cols = [col for col in feature_names if col in df.columns and df[col].dtype in ['int64', 'float64']]
        if numerical_cols:
            df[numerical_cols] = scaler.transform(df[numerical_cols])
            logging.info(f'Scaled numerical columns: {numerical_cols}')

        for col in feature_names:
            if col not in df.columns:
                df[col] = 0
                logging.warning(f'Added missing feature {col} with default value 0.')

        if 'Transaction_ID' in df.columns:
            transaction_ids = df['Transaction_ID']
            df = df.drop('Transaction_ID', axis=1)
        else:
            transaction_ids = [f'T{i}' for i in range(len(df))]

        df = df[feature_names]
        logging.info('Data preprocessing completed.')
        return df, transaction_ids
    except Exception as e:
        logging.error(f'Data preprocessing failed: {str(e)}')
        raise

def predict_fraud(df=None):
    """Score transactions with the saved model and export the results.

    Args:
        df: A DataFrame of raw transactions, a path or file-like object
            accepted by ``load_data``, or None to score the sample dataset.

    Returns:
        Tuple ``(results, json_path, excel_path, metadata)``: a DataFrame with
        ``Transaction_ID``, ``Fraud_Prediction`` and ``Fraud_Probability``
        columns, the paths of the JSON and Excel exports written under
        PREDICTIONS_DIR, and the model metadata dict (placeholder values when
        ``model_metadata.json`` is absent).

    Raises:
        FileNotFoundError: If a required artifact is missing from MODEL_DIR.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    # json is imported here because this script's module-level imports do
    # not include it, and it is needed to parse model_metadata.json.
    import json

    def _artifact(name):
        # Path of a saved artifact inside MODEL_DIR.
        return os.path.join(MODEL_DIR, name)

    try:
        encoded_columns = ['Transaction_Type', 'Device_Used', 'Location', 'Payment_Method']
        required_files = [
            'ensemble_model.joblib', 'feature_names.joblib', 'scaler.joblib',
            'precomputed_means.joblib', 'precomputed_modes.joblib',
        ] + [f'encoder_{col}.joblib' for col in encoded_columns]
        for file_name in required_files:
            if not os.path.exists(_artifact(file_name)):
                raise FileNotFoundError(f'Required file {file_name} not found in {MODEL_DIR}.')

        model = load(_artifact('ensemble_model.joblib'))
        feature_names = load(_artifact('feature_names.joblib'))
        scaler = load(_artifact('scaler.joblib'))
        precomputed_means = load(_artifact('precomputed_means.joblib'))
        precomputed_modes = load(_artifact('precomputed_modes.joblib'))
        encoders = {col: load(_artifact(f'encoder_{col}.joblib')) for col in encoded_columns}

        metadata_path = _artifact('model_metadata.json')
        metadata = {'Model': 'Unknown', 'Version': 'Unknown', 'Training_Date': 'Unknown', 'Accuracy': 0.0}
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)

        # Anything that is not already a DataFrame (None, a path, an uploaded
        # file) is resolved through load_data; None yields the sample dataset.
        if not isinstance(df, pd.DataFrame):
            df = load_data(df)

        X, transaction_ids = preprocess_data(df, encoders, feature_names, scaler, precomputed_means, precomputed_modes)

        predictions = model.predict(X)
        fraud_probabilities = model.predict_proba(X)[:, 1]
        logging.info(f'Predicted {sum(predictions)} fraudulent transactions out of {len(predictions)}.')

        results = pd.DataFrame({
            'Transaction_ID': transaction_ids,
            'Fraud_Prediction': ['Fraudulent' if pred == 1 else 'Legitimate' for pred in predictions],
            'Fraud_Probability': fraud_probabilities
        })
        results['Fraud_Probability'] = results['Fraud_Probability'].round(4)

        os.makedirs(PREDICTIONS_DIR, exist_ok=True)
        # One timestamp for both exports so the two file names always match.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        json_path = os.path.join(PREDICTIONS_DIR, f'predictions_{stamp}.json')
        excel_path = os.path.join(PREDICTIONS_DIR, f'predictions_{stamp}.xlsx')
        results.to_json(json_path, orient='records', indent=4)
        results.to_excel(excel_path, index=False)

        return results, json_path, excel_path, metadata
    except FileNotFoundError as e:
        logging.error(f'Model or artifact files not found: {str(e)}')
        raise
    except Exception as e:
        logging.error(f'Fraud prediction failed: {str(e)}')
        raise

if __name__ == '__main__':
    # Example usage: called without arguments, predict_fraud() scores the
    # sample dataset (data/sample_transactions.csv), which it loads through
    # load_data().
    results, json_path, excel_path, metadata = predict_fraud()
    print('Prediction Results:')
    print(results.head())
    print(f'Results saved to {json_path} and {excel_path}')
    print('Metadata:', metadata)
'''

# Write the generated prediction script to disk. The encoding is pinned to
# UTF-8 because Python decodes source files as UTF-8 by default, whereas
# open() would otherwise use the platform locale encoding.
with open('scripts/predict_fraud.py', 'w', encoding='utf-8') as f:
    f.write(predict_fraud_code)

print('Prediction script saved to scripts/predict_fraud.py')

In [None]:
## Create the requirements file
# Pinned dependencies for the notebook and the generated scripts. openpyxl is
# included because pandas needs it to read and write .xlsx files, which the
# prediction script does through read_excel / to_excel.
requirements = '''
streamlit==1.39.0
pandas==2.2.2
numpy==1.26.4
scikit-learn==1.5.2
joblib==1.4.2
plotly==5.24.1
imbalanced-learn==0.12.3
matplotlib==3.9.2
seaborn==0.13.2
chardet==5.2.0
openpyxl==3.1.5
'''

with open('requirements.txt', 'w') as f:
    f.write(requirements)

print('Requirements file saved to requirements.txt')

In [None]:
## Create .gitignore
# Each entry pairs a section heading with the ignore patterns listed under it.
_gitignore_sections = [
    ('Python', ['__pycache__/', '*.py[cod]', '*$py.class']),
    ('Virtual environments', ['venv/', 'env/']),
    ('Model artifacts and predictions', ['model_artifacts/', 'predictions/']),
    ('Logs', ['*.log']),
    ('Data (to avoid committing sensitive data)', ['data/']),
    ('OS generated files', ['.DS_Store', 'Thumbs.db']),
    ('IDE and editor files', ['.idea/', '*.sublime-workspace', '*.sublime-project']),
    ('Jupyter Notebook checkpoints', ['.ipynb_checkpoints/']),
]

# Render every section as "# heading" followed by its patterns, separate the
# sections with one blank line, and wrap the text in a leading and a trailing
# newline.
gitignore = '\n' + '\n\n'.join(
    '\n'.join([f'# {heading}', *patterns])
    for heading, patterns in _gitignore_sections
) + '\n'

with open('.gitignore', 'w') as out:
    out.write(gitignore)

print('.gitignore file saved')