In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load the raw transaction records from disk
data = pd.read_csv('transactions.csv')

# Normalize column names: strip surrounding whitespace and lowercase them so
# the lookups below do not depend on the header's exact formatting
data.columns = data.columns.str.strip().str.lower()

# Print actual column names for debugging
print("Dataset Columns:", data.columns.tolist())

# Map the headers used by the raw export onto the short names the rest of the
# script looks up. Without the UPI aliases, a file whose headers read
# 'sender upi id' / 'receiver upi id' never matches the 'sender_upi' /
# 'receiver_upi' checks further down, and those columns are silently ignored.
column_aliases = {
    'amount (inr)': 'amount',
    'sender upi id': 'sender_upi',
    'receiver upi id': 'receiver_upi',
}

# Rename only aliases that are present and whose target name is still free;
# renaming onto an existing name would create duplicate column labels.
pending_renames = {
    raw_name: short_name
    for raw_name, short_name in column_aliases.items()
    if raw_name in data.columns and short_name not in data.columns
}
data.rename(columns=pending_renames, inplace=True)

# Derive calendar features (hour, day of month, month) from the transaction
# timestamp; when no timestamp column exists, fill them with constant zeros
# so the feature columns are still present.
calendar_parts = ('hour', 'day', 'month')

if 'timestamp' not in data.columns:
    # Placeholder values if timestamp is missing
    for part in calendar_parts:
        data[part] = 0
else:
    # Parse once, then read each component through the datetime accessor
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    timestamp_accessor = data['timestamp'].dt
    for part in calendar_parts:
        data[part] = getattr(timestamp_accessor, part)

def _fit_label_encoder(frame, column):
    """Replace frame[column] with integer codes and return the fitted encoder."""
    encoder = LabelEncoder()
    frame[column] = encoder.fit_transform(frame[column])
    return encoder


# Fitted encoders keyed by column name; this dictionary is saved next to the
# model later in the script.
encoders = {}

# Each categorical column is encoded only when the dataset actually has it.
if 'status' in data.columns:
    le_status = _fit_label_encoder(data, 'status')
    encoders['status'] = le_status

if 'sender_upi' in data.columns:
    le_sender = _fit_label_encoder(data, 'sender_upi')
    encoders['sender_upi'] = le_sender

if 'receiver_upi' in data.columns:
    le_receiver = _fit_label_encoder(data, 'receiver_upi')
    encoders['receiver_upi'] = le_receiver

# Build the list of training features: the base columns come first, followed
# by whichever optional categorical columns the dataset contains (in a fixed
# order: status, sender_upi, receiver_upi).
optional_features = ('status', 'sender_upi', 'receiver_upi')

features = ['amount', 'hour', 'day', 'month'] + [
    column for column in optional_features if column in data.columns
]

print("Features used for training:", features)

# Create feature matrix X and a placeholder target y.
# NOTE: y is random 0/1 noise, not real fraud labels, so the model saved below
# has no predictive value. TODO: replace with the dataset's true label column.
# The labels are drawn from a seeded generator so the whole pipeline is
# reproducible, matching the fixed random_state used for the split and the
# forest; an unseeded draw would yield a different saved model on every run.
LABEL_SEED = 42
X = data[features]
y = np.random.default_rng(LABEL_SEED).integers(0, 2, size=len(data))

# Hold out 20% of the rows as a test set.
# NOTE: X_test / y_test are not evaluated anywhere in this section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model and the dictionary of fitted label encoders
joblib.dump(model, 'fraud_detection_model.pkl')
joblib.dump(encoders, 'label_encoders.pkl')

print("Model training complete. Saved as fraud_detection_model.pkl and label_encoders.pkl")


Dataset Columns: ['transaction id', 'timestamp', 'sender name', 'sender upi id', 'receiver name', 'receiver upi id', 'amount (inr)', 'status']
Features used for training: ['amount', 'hour', 'day', 'month', 'status']
Model training complete. Saved as fraud_detection_model.pkl and label_encoders.pkl
