# Inspecting the CSV files

In [2]:
import pandas as pd

# Define file paths
train_transaction_path = r"D:\FIT\Senior Year\SPRING 2025\BDM\Grand Project\CodeAndData\Data\train_transaction.csv"
train_identity_path = r"D:\FIT\Senior Year\SPRING 2025\BDM\Grand Project\CodeAndData\Data\train_identity.csv"

# Load the datasets
train_transaction = pd.read_csv(train_transaction_path, nrows=5)  # Read first 5 rows
train_identity = pd.read_csv(train_identity_path, nrows=5)

# Display basic info
print("Train Transaction CSV:")
display(train_transaction)

print("\nTrain Identity CSV:")
display(train_identity)


Train Transaction CSV:


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Train Identity CSV:


Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [3]:
# Display data types for each column in train_transaction
print("Data types in Train Transaction CSV:")
print(train_transaction.dtypes)

# Display data types for each column in train_identity
print("\nData types in Train Identity CSV:")
print(train_identity.dtypes)

Data types in Train Transaction CSV:
TransactionID       int64
isFraud             int64
TransactionDT       int64
TransactionAmt    float64
ProductCD          object
                   ...   
V335              float64
V336              float64
V337              float64
V338              float64
V339              float64
Length: 394, dtype: object

Data types in Train Identity CSV:
TransactionID      int64
id_01            float64
id_02            float64
id_03            float64
id_04            float64
id_05            float64
id_06            float64
id_07            float64
id_08            float64
id_09            float64
id_10            float64
id_11            float64
id_12             object
id_13            float64
id_14            float64
id_15             object
id_16             object
id_17            float64
id_18            float64
id_19            float64
id_20            float64
id_21            float64
id_22            float64
id_23            float64
id_24          

In [1]:
import os
import pandas as pd
import joblib
import logging
import time
from py2neo import Graph
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Try using rich for better progress bars, otherwise fallback to tqdm
try:
    from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn
    USE_RICH = True
except ImportError:
    from tqdm import tqdm
    USE_RICH = False

# Logging Configuration
logging.basicConfig(
    level=logging.INFO,  # Change to INFO to reduce verbosity
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.StreamHandler()]
)

# File Paths
TRANSACTION_FILE = r"D:\FIT\Senior Year\SPRING 2025\BDM\Grand Project\CodeAndData\Data\train_transaction_cut.csv"
IDENTITY_FILE = r"D:\FIT\Senior Year\SPRING 2025\BDM\Grand Project\CodeAndData\Data\train_identity.csv"
MODEL_PATH = r"D:\FIT\Senior Year\SPRING 2025\BDM\Grand Project\CodeAndData\Code\fraud_detection_model.pkl"

# Total number of pipeline stages
TOTAL_STEPS = 5  


def setup_progress_bar():
    """Sets up the progress bar."""
    if USE_RICH:
        return Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%")
        )
    return None


def load_and_preprocess_data():
    """
    Loads and preprocesses the transaction and identity datasets.

    Returns:
        pd.DataFrame: Merged and preprocessed dataset.
    """
    logging.info("📂 Loading dataset...")

    transactions = pd.read_csv(TRANSACTION_FILE)
    identity = pd.read_csv(IDENTITY_FILE)

    df = transactions.merge(identity, on="TransactionID", how="left")
    logging.info(f"✅ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns.")

    # Fix isFraud column
    df["isFraud"] = df["isFraud"].astype(str).str.strip().astype(int)

    # Convert TransactionDT to meaningful time features
    df["TransactionDT_days"] = df["TransactionDT"] // (24 * 60 * 60)
    df["TransactionDT_hours"] = df["TransactionDT"] // (60 * 60)

    # Scale TransactionAMT
    scaler = StandardScaler()
    df["TransactionAmt_scaled"] = scaler.fit_transform(df[["TransactionAmt"]])

    # Encode categorical features
    categorical_cols = [
        "ProductCD", "card1", "card2", "card3", "card4", "card5", "card6",
        "addr1", "addr2", "P_emaildomain", "R_emaildomain",
        "DeviceType", "DeviceInfo"
    ] + [f"id_{i}" for i in range(12, 39)]  # id_12 to id_38

    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].fillna("Unknown")
            df[col] = df[col].astype(str)
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])

    logging.info("✅ Data preprocessing complete.")
    return df


def construct_graph(df: pd.DataFrame):
    """
    Constructs a transaction graph in Neo4j.

    Args:
        df (pd.DataFrame): Processed transaction data.

    Returns:
        Graph: Neo4j connection object.
    """
    logging.info("🔗 Connecting to Neo4j...")
    graph = Graph("bolt://localhost:7687", auth=("neo4j", "12345678")) #TODO: MAYBE CHECK CONNECTION HERE
    
    logging.info("🛠️ Creating transaction nodes in Neo4j...")
    # with graph.begin() as tx:
    #     for row in df.itertuples(index=False):
    #         tx.run(
    #             "MERGE (t:Transaction {id: $TransactionID, isFraud: $isFraud})",
    #             TransactionID=row.TransactionID, isFraud=row.isFraud
    #         )
            
    tx = graph.begin()
    try:
        for row in df.itertuples(index=False):
            tx.run(
                "MERGE (t:Transaction {id: $TransactionID, isFraud: $isFraud})",
                TransactionID=row.TransactionID, isFraud=row.isFraud
            )
        tx.commit()
    except Exception as e:
        tx.rollback()
    raise e


    logging.info("🔗 Creating relationships between transactions...")
    for col in ["card1", "addr1"]:
        groups = df.groupby(col)["TransactionID"].apply(list)
        for transactions in groups:
            for i in range(len(transactions)):
                for j in range(i + 1, len(transactions)):
                    graph.run(
                        """
                        MATCH (a:Transaction {id: $id1}), (b:Transaction {id: $id2})
                        MERGE (a)-[:LINKED_TO]->(b)
                        """,
                        id1=transactions[i], id2=transactions[j]
                    )

    logging.info("✅ Graph construction complete.")
    return graph


def train_model(df: pd.DataFrame):
    """
    Trains a RandomForest classifier for fraud detection.

    Args:
        df (pd.DataFrame): Processed dataset.

    Returns:
        RandomForestClassifier: Trained model.
    """
    logging.info("📊 Preparing data for training...")
    X = df.drop(columns=["isFraud", "TransactionID"])
    y = df["isFraud"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    logging.info(f"🔍 Training set: {len(X_train)} samples, Test set: {len(X_test)} samples")

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    logging.info(f"✅ Model trained successfully! Accuracy: {acc:.4f}")
    logging.info("📜 Classification Report:\n%s", classification_report(y_test, y_pred))
    return clf


def save_model(model: RandomForestClassifier):
    """
    Saves the trained model to a file.

    Args:
        model (RandomForestClassifier): The trained model.
    """
    joblib.dump(model, MODEL_PATH)
    logging.info(f"✅ Model saved at {MODEL_PATH}")


if __name__ == "__main__":
    logging.info("🚀 Starting fraud detection pipeline...")

    progress = setup_progress_bar()
    if progress:
        with progress:
            task = progress.add_task("🔄 Running Pipeline", total=TOTAL_STEPS)
            df = load_and_preprocess_data()
            progress.advance(task)

            graph = construct_graph(df)
            progress.advance(task)

            model = train_model(df)
            progress.advance(task)

            save_model(model)
            progress.advance(task)
    else:
        df = load_and_preprocess_data()
        graph = construct_graph(df)
        model = train_model(df)
        save_model(model)

    logging.info("🎉 Fraud detection pipeline completed successfully!")
    logging.info("🛑 Exiting...")

2025-04-01 21:17:48,151 [INFO] - 🚀 Starting fraud detection pipeline...


Output()

2025-04-01 21:17:48,165 [INFO] - 📂 Loading dataset...


2025-04-01 21:17:49,174 [INFO] - ✅ Dataset loaded: 20000 rows, 434 columns.


2025-04-01 21:17:49,467 [INFO] - ✅ Data preprocessing complete.
2025-04-01 21:17:49,481 [INFO] - 🔗 Connecting to Neo4j...


2025-04-01 21:17:49,633 [INFO] - 🛠️ Creating transaction nodes in Neo4j...


UnboundLocalError: cannot access local variable 'e' where it is not associated with a value

In [None]:
print(X_train.dtypes)  # Check data types of all columns
print(X_train.head())  # Look at sample values
print(X_test.dtypes)  # Check data types of all columns
print(X_test.head())  # Look at sample values