In [None]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# -----------------------------
# Function to load all pickle files from a folder
# -----------------------------
def load_pkl_folder(folder_path):
    """Load every ``.pkl`` file in *folder_path* into one DataFrame.

    Files are read in sorted name order so the row order of the result (and
    anything derived from it, such as a seeded ``DataFrame.sample``) is
    reproducible; ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        folder_path: Directory containing pickled pandas DataFrames.

    Returns:
        A single DataFrame with the rows of all files concatenated and a
        fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.pkl`` files.
    """
    pkl_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".pkl"))
    if not pkl_files:
        raise ValueError(f"No .pkl files found in {folder_path!r}")

    # NOTE: unpickling can execute arbitrary code; only point this at
    # folders whose contents are trusted.
    dfs = [pd.read_pickle(os.path.join(folder_path, name)) for name in pkl_files]
    return pd.concat(dfs, ignore_index=True)

# -----------------------------
# Folder containing .pkl files
# -----------------------------
folder_path = "/content/drive/MyDrive/internshipProject/fraud_detection/fraud_detection/data"
df = load_pkl_folder(folder_path)
print(f"Loaded {len(df)} transactions from {folder_path}")

# -----------------------------
# Preprocessing
# -----------------------------
# Convert datetime
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])

# Extract datetime features
df['TX_HOUR'] = df['TX_DATETIME'].dt.hour
df['TX_DAY'] = df['TX_DATETIME'].dt.day
df['TX_WEEKDAY'] = df['TX_DATETIME'].dt.weekday
df['TX_MONTH'] = df['TX_DATETIME'].dt.month

# -----------------------------
# Frequency encoding for categorical IDs
# -----------------------------
# Each ID is replaced by the share of all rows that carry it.
for col in ['CUSTOMER_ID', 'TERMINAL_ID']:
    freq = df[col].value_counts() / len(df)
    df[col + '_freq'] = df[col].map(freq)

# Features and target
feature_cols = ['CUSTOMER_ID_freq', 'TERMINAL_ID_freq', 'TX_AMOUNT',
                'TX_HOUR', 'TX_DAY', 'TX_WEEKDAY', 'TX_MONTH']

# -----------------------------
# Scale TX_AMOUNT
# -----------------------------
# Fit on the full amount column. This scaler is what gets persisted and
# reused at prediction time, so the model below MUST be trained on amounts
# transformed by it.
scaler = StandardScaler()
scaler.fit(df[['TX_AMOUNT']])

# -----------------------------
# Optional: Sample for faster training (memory-efficient)
# -----------------------------
df_sample = df.sample(frac=0.1, random_state=42)  # 10% of data
# .copy() gives an independent frame, so the assignment below is applied to
# the data that is actually used for training and does not raise
# SettingWithCopyWarning.
X = df_sample[feature_cols].copy()
X['TX_AMOUNT'] = scaler.transform(X[['TX_AMOUNT']]).ravel()
y = df_sample['TX_FRAUD']

# -----------------------------
# Train/Test split
# -----------------------------
# stratify=y keeps the fraud ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# Train RandomForestClassifier
# -----------------------------
# n_jobs=-1 builds the trees on all available cores; with a fixed
# random_state the fitted forest does not depend on the number of jobs.
clf = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42,
                             n_jobs=-1)
clf.fit(X_train, y_train)

# -----------------------------
# Evaluate
# -----------------------------
# Accuracy alone is misleading on data this imbalanced; read the per-class
# recall in the classification report as well.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Loaded 1754155 transactions from /content/drive/MyDrive/internshipProject/fraud_detection/fraud_detection/data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['TX_AMOUNT'] = scaler.fit_transform(X[['TX_AMOUNT']])


Accuracy: 0.9937863413521834

Confusion Matrix:
 [[34800     1]
 [  217    66]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     34801
           1       0.99      0.23      0.38       283

    accuracy                           0.99     35084
   macro avg       0.99      0.62      0.69     35084
weighted avg       0.99      0.99      0.99     35084



In [None]:
# -----------------------------
# Save the trained model
# -----------------------------
# The scaler is persisted next to the model because prediction code must
# apply the same TX_AMOUNT transformation that was used for training.
model_path = "/content/drive/MyDrive/internshipProject/fraud_detection/fraud_transaction_model.pkl"
scaler_path = "/content/drive/MyDrive/internshipProject/fraud_detection/tx_amount_scaler.pkl"
joblib.dump(clf, model_path)
joblib.dump(scaler, scaler_path)
print("Model and scaler saved successfully!")

# -----------------------------
# Real-time prediction function
# -----------------------------
def predict_fraud(new_transactions, model, scaler, customer_freq_map, terminal_freq_map):
    """Score a batch of transactions with a trained fraud model.

    The input frame is left untouched: all derived columns are added to a
    copy. (Writing the scaled amount back into the caller's frame would
    scale it a second time if the same frame were scored again.)

    Args:
        new_transactions: pd.DataFrame with columns
            ['CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'TX_DATETIME'].
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted transformer exposing ``transform`` for TX_AMOUNT.
        customer_freq_map: Mapping (dict or Series) from CUSTOMER_ID to its
            frequency; IDs missing from the mapping get 0.
        terminal_freq_map: Same as above, for TERMINAL_ID.

    Returns:
        A new DataFrame holding the input columns plus TX_HOUR, TX_DAY,
        TX_WEEKDAY, TX_MONTH, CUSTOMER_ID_freq, TERMINAL_ID_freq and
        Predicted_Fraud. TX_DATETIME is converted to datetime and TX_AMOUNT
        holds the scaled value that was fed to the model.
    """
    scored = new_transactions.copy()

    timestamps = pd.to_datetime(scored['TX_DATETIME'])
    scored['TX_DATETIME'] = timestamps
    scored['TX_HOUR'] = timestamps.dt.hour
    scored['TX_DAY'] = timestamps.dt.day
    scored['TX_WEEKDAY'] = timestamps.dt.weekday
    scored['TX_MONTH'] = timestamps.dt.month

    # Frequency encoding; IDs absent from the maps fall back to 0
    scored['CUSTOMER_ID_freq'] = scored['CUSTOMER_ID'].map(customer_freq_map).fillna(0)
    scored['TERMINAL_ID_freq'] = scored['TERMINAL_ID'].map(terminal_freq_map).fillna(0)

    # Scale TX_AMOUNT
    scored['TX_AMOUNT'] = scaler.transform(scored[['TX_AMOUNT']])

    feature_cols = ['CUSTOMER_ID_freq', 'TERMINAL_ID_freq', 'TX_AMOUNT',
                    'TX_HOUR', 'TX_DAY', 'TX_WEEKDAY', 'TX_MONTH']
    scored['Predicted_Fraud'] = model.predict(scored[feature_cols])
    return scored

# -----------------------------
# Example usage of real-time prediction
# -----------------------------
# Frequency maps: share of all loaded rows carried by each customer /
# terminal ID (same formula as the *_freq training features).
customer_freq_map = df['CUSTOMER_ID'].value_counts().div(len(df))
terminal_freq_map = df['TERMINAL_ID'].value_counts().div(len(df))

# Two hand-written transactions, one record per row.
# NOTE(review): in the recorded run both rows received a frequency of 0.0,
# i.e. these sample IDs were not found in the maps - confirm that exercising
# the "unknown ID" fallback is what this example is meant to show.
new_data = pd.DataFrame([
    {
        'CUSTOMER_ID': 'C123',
        'TERMINAL_ID': 'T789',
        'TX_AMOUNT': 150,
        'TX_DATETIME': '2025-10-24 10:30:00',
    },
    {
        'CUSTOMER_ID': 'C45',
        'TERMINAL_ID': 'T321',
        'TX_AMOUNT': 300,
        'TX_DATETIME': '2025-10-24 14:45:00',
    },
])

predictions = predict_fraud(
    new_data,
    model=clf,
    scaler=scaler,
    customer_freq_map=customer_freq_map,
    terminal_freq_map=terminal_freq_map,
)
print(predictions)

Model and scaler saved successfully!
  CUSTOMER_ID TERMINAL_ID  TX_AMOUNT         TX_DATETIME  TX_HOUR  TX_DAY  \
0        C123        T789   2.276771 2025-10-24 10:30:00       10      24   
1         C45        T321   5.820652 2025-10-24 14:45:00       14      24   

   TX_WEEKDAY  TX_MONTH  CUSTOMER_ID_freq  TERMINAL_ID_freq  Predicted_Fraud  
0           4        10               0.0               0.0                0  
1           4        10               0.0               0.0                0  


In [None]:
import pandas as pd
import numpy as np
import joblib
import time
from sklearn.preprocessing import StandardScaler

# -----------------------------
# Load the trained model
# -----------------------------
import os

model_path = "/content/drive/MyDrive/internshipProject/fraud_detection/fraud_transaction_model.pkl"
clf = joblib.load(model_path)
print("Model loaded successfully!")

# -----------------------------
# Load original dataset for frequency maps
# -----------------------------
data_path = "/content/drive/MyDrive/internshipProject/fraud_detection/fraud_detection/data"
# Sorted so the concatenated row order does not depend on directory listing order
df_list = [
    pd.read_pickle(os.path.join(data_path, f))
    for f in sorted(os.listdir(data_path))
    if f.endswith(".pkl")
]
df_orig = pd.concat(df_list, ignore_index=True)

# Frequency encoding maps.
# Divide by the total row count, exactly as the training features were built;
# value_counts(normalize=True) divides by the number of non-null IDs instead,
# which gives different values whenever an ID column contains missing entries.
customer_freq_map = (df_orig['CUSTOMER_ID'].value_counts() / len(df_orig)).to_dict()
terminal_freq_map = (df_orig['TERMINAL_ID'].value_counts() / len(df_orig)).to_dict()

# Scale TX_AMOUNT
# Reuse the scaler that was saved together with the model so inference
# applies the same transformation as training. Re-fitting is kept only as a
# fallback for when the saved file is missing.
scaler_path = "/content/drive/MyDrive/internshipProject/fraud_detection/tx_amount_scaler.pkl"
if os.path.exists(scaler_path):
    scaler = joblib.load(scaler_path)
else:
    print(f"Scaler not found at {scaler_path}; re-fitting on the loaded data.")
    scaler = StandardScaler()
    scaler.fit(df_orig[['TX_AMOUNT']])

# -----------------------------
# Feature columns
# -----------------------------
# Order must match the column order the model was trained with.
feature_cols = ['CUSTOMER_ID_freq', 'TERMINAL_ID_freq', 'TX_AMOUNT',
                'TX_HOUR', 'TX_DAY', 'TX_WEEKDAY', 'TX_MONTH']

# -----------------------------
# Function to generate realistic random transactions
# -----------------------------
def generate_random_transaction():
    """Build a one-row DataFrame describing a randomly generated transaction.

    Reads the module-level ``df_orig``, ``customer_freq_map``,
    ``terminal_freq_map`` and ``scaler``. The returned frame contains the raw
    ID columns, TX_DATETIME, the four calendar features, both frequency
    features, and TX_AMOUNT replaced by its scaled value.
    """
    # About one draw in ten gets an amount from the higher, fraud-like range
    if np.random.rand() < 0.1:
        low, high = 221, 500
    else:
        low, high = 1, 220
    amount = np.random.randint(low, high)

    # Pick an ID uniformly among the distinct IDs present in the dataset
    customer = np.random.choice(df_orig['CUSTOMER_ID'].unique())
    terminal = np.random.choice(df_orig['TERMINAL_ID'].unique())

    # Timestamp within one day either side of the current time
    tx_time = pd.Timestamp.now() + pd.to_timedelta(
        np.random.randint(-86400, 86400), unit='s'
    )

    row = pd.DataFrame({
        'CUSTOMER_ID': [customer],
        'TERMINAL_ID': [terminal],
        'TX_AMOUNT': [amount],
        'TX_DATETIME': [tx_time],
    })

    # Calendar features derived from the timestamp
    when = row['TX_DATETIME'].dt
    row = row.assign(
        TX_HOUR=when.hour,
        TX_DAY=when.day,
        TX_WEEKDAY=when.weekday,
        TX_MONTH=when.month,
    )

    # Frequency encoding; IDs missing from a map get 0
    for id_col, freq_map in (('CUSTOMER_ID', customer_freq_map),
                             ('TERMINAL_ID', terminal_freq_map)):
        row[id_col + '_freq'] = row[id_col].map(freq_map).fillna(0)

    # Overwrite the raw amount with its scaled value
    row['TX_AMOUNT'] = scaler.transform(row[['TX_AMOUNT']])

    return row

# -----------------------------
# Continuous prediction loop
# -----------------------------
predictions_list = []
output_path = "/content/drive/MyDrive/internshipProject/fraud_detection/fraud_detection/real_time_fraud_predictionsfile_.csv"

print("Starting random fraud transaction predictions...")
print("Press Ctrl + C to stop.\n")

try:
    while True:
        txn_data = generate_random_transaction()
        prob_fraud = clf.predict_proba(txn_data[feature_cols])[0][1]  # probability of fraud
        txn_data['Predicted_Fraud'] = int(prob_fraud > 0.5)  # threshold 0.5
        txn_data['Fraud_Probability'] = round(prob_fraud, 4)
        txn_data['Timestamp'] = pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")

        predictions_list.append(txn_data)
        print(txn_data)
        print("-" * 80)
        time.sleep(2)  # wait 2 seconds

except KeyboardInterrupt:
    print("\nPrediction stopped by user.")

finally:
    # Persist in `finally` so the collected predictions are written on ANY
    # exit from the loop - a user interrupt or an unexpected error - instead
    # of being lost when something other than Ctrl + C stops the run.
    if predictions_list:
        final_predictions = pd.concat(predictions_list, ignore_index=True)
        final_predictions.to_csv(output_path, index=False)
        print(f"\nPredictions saved to: {output_path}")
    else:
        print("No predictions were made.")


Model loaded successfully!
Starting random fraud transaction predictions...
Press Ctrl + C to stop.

   CUSTOMER_ID  TERMINAL_ID  TX_AMOUNT                TX_DATETIME  TX_HOUR  \
0          105         7385  -0.747341 2025-10-25 08:27:00.017327        8   

   TX_DAY  TX_WEEKDAY  TX_MONTH  CUSTOMER_ID_freq  TERMINAL_ID_freq  \
0      25           5        10          0.000071          0.000133   

   Predicted_Fraud  Fraud_Probability            Timestamp  
0                0             0.0312  2025-10-24 13:44:23  
--------------------------------------------------------------------------------
   CUSTOMER_ID  TERMINAL_ID  TX_AMOUNT                TX_DATETIME  TX_HOUR  \
0         4003         3651   2.465778 2025-10-25 05:43:10.223864        5   

   TX_DAY  TX_WEEKDAY  TX_MONTH  CUSTOMER_ID_freq  TERMINAL_ID_freq  \
0      25           5        10          0.000084          0.000082   

   Predicted_Fraud  Fraud_Probability            Timestamp  
0                0             0.00