In [106]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

In [107]:
train_data = pd.read_csv('fraudTrain.csv')
test_data = pd.read_csv('fraudTest.csv')

In [108]:
print("Training Data Shape:", train_data.shape)
print("Fraud Cases in Train:", len(train_data[train_data['is_fraud'] == 1]))
print("Legitimate Cases in Train:", len(train_data[train_data['is_fraud'] == 0]))
print("\nTest Data Shape:", test_data.shape)

Training Data Shape: (1296675, 23)
Fraud Cases in Train: 7506
Legitimate Cases in Train: 1289169

Test Data Shape: (555719, 23)


In [109]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [110]:
train_data['trans_date_trans_time'] = pd.to_datetime(train_data['trans_date_trans_time'])
test_data['trans_date_trans_time'] = pd.to_datetime(test_data['trans_date_trans_time'])

In [111]:
train_data['trans_hour'] = train_data['trans_date_trans_time'].dt.hour
train_data['trans_day_of_week'] = train_data['trans_date_trans_time'].dt.dayofweek
test_data['trans_hour'] = test_data['trans_date_trans_time'].dt.hour
test_data['trans_day_of_week'] = test_data['trans_date_trans_time'].dt.dayofweek

In [112]:
train_data = train_data.drop(columns=['trans_date_trans_time'])
test_data = test_data.drop(columns=['trans_date_trans_time'])

In [113]:
import joblib
from sklearn.preprocessing import LabelEncoder
import numpy as np

categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    
    # Handle unknowns in test set
    test_data[col] = test_data[col].map(lambda s: '<unknown>' if s not in le.classes_ else s)
    le.classes_ = np.append(le.classes_, '<unknown>')
    test_data[col] = le.transform(test_data[col])
    
    # Save encoder
    encoders[col] = le

# Save all encoders together
joblib.dump(encoders, 'label_encoders.pkl')


['label_encoders.pkl']

In [114]:
columns_to_drop = ['trans_num', 'first', 'last', 'street', 'dob', 'Unnamed: 0']
train_data = train_data.drop(columns=[col for col in columns_to_drop if col in train_data.columns])
test_data = test_data.drop(columns=[col for col in columns_to_drop if col in test_data.columns])

In [115]:
train_data.isnull().sum()
test_data.isnull().sum()

cc_num               0
merchant             0
category             0
amt                  0
gender               0
city                 0
state                0
zip                  0
lat                  0
long                 0
city_pop             0
job                  0
unix_time            0
merch_lat            0
merch_long           0
is_fraud             0
trans_hour           0
trans_day_of_week    0
dtype: int64

In [116]:
missing_cols = set(train_data.columns) - set(test_data.columns)
missing_cols

set()

In [117]:
X_train = train_data.drop('is_fraud', axis=1)
y_train = train_data['is_fraud']
X_test = test_data.drop('is_fraud', axis=1)
y_test = test_data['is_fraud']

In [118]:
print("Features after preprocessing:", X_train.columns.tolist())

Features after preprocessing: ['cc_num', 'merchant', 'category', 'amt', 'gender', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'unix_time', 'merch_lat', 'merch_long', 'trans_hour', 'trans_day_of_week']


In [119]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [120]:
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [121]:
smote = SMOTE(sampling_strategy=0.1, random_state=42)  
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

In [122]:
print("Shape of resampled training data:", X_train_res.shape)

Shape of resampled training data: (1418085, 17)


In [123]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, n_jobs=1),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=1) 
}

In [124]:
import joblib
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test_scaled)
    
    print(f"\nResults for {name}:")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
    
    # Save model
    joblib.dump(model, f"{name}_model.pkl")
    print(f"Model saved as {name}_model.pkl")



Training Logistic Regression...

Results for Logistic Regression:
Confusion Matrix:
[[550528   3046]
 [  1100   1045]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.26      0.49      0.34      2145

    accuracy                           0.99    555719
   macro avg       0.63      0.74      0.67    555719
weighted avg       1.00      0.99      0.99    555719

ROC AUC Score: 0.7408385305631203
Model saved as Logistic Regression_model.pkl

Training Decision Tree...

Results for Decision Tree:
Confusion Matrix:
[[549654   3920]
 [   742   1403]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.26      0.65      0.38      2145

    accuracy                           0.99    555719
   macro avg       0.63      0.82      0.69    555719
weighted avg       1.00      0.99      0.99   

In [125]:
import xgboost as xgb
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# X_train, y_train are already SMOTE-balanced
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
 [[553070    504]
 [   922   1223]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.71      0.57      0.63      2145

    accuracy                           1.00    555719
   macro avg       0.85      0.78      0.82    555719
weighted avg       1.00      1.00      1.00    555719


ROC AUC Score: 0.9734340341634036


In [126]:
import joblib

# Save model
joblib.dump(xgb_model, 'xgboost_model.pkl')



['xgboost_model.pkl']

In [None]:
import streamlit as st
import joblib
import pandas as pd
import numpy as np
from datetime import datetime

# ---------------- Page Configuration ----------------
st.set_page_config(
    page_title="🛡️ Fraud Detection CLI Interface → Web App",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded"
)

# ---------------- Custom CSS Styling ----------------
st.markdown("""
<style>
    body {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        background: linear-gradient(135deg, #0f2027, #203a43, #2c5364);
        color: white;
    }
    .stApp {
        background: transparent;
    }
    .big-title {
        font-size: 3rem;
        font-weight: 800;
        text-align: center;
        margin-bottom: 10px;
        color: #ffffff;
        background: linear-gradient(90deg, #00c9ff, #92fe9d);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
    }
    .subtitle {
        text-align: center;
        font-size: 1.3rem;
        margin-bottom: 30px;
        color: rgba(255,255,255,0.7);
    }
    .stForm, .stSelectbox, .stNumberInput, .stTextInput, .stSlider {
        background-color: #ffffff0a !important;
        border-radius: 12px !important;
    }
    .stButton>button {
        background: linear-gradient(45deg, #00c9ff, #92fe9d);
        color: black;
        border: none;
        border-radius: 10px;
        font-weight: 700;
        font-size: 1.1rem;
        padding: 0.75rem 2rem;
        transition: 0.3s ease;
    }
    .stButton>button:hover {
        transform: scale(1.03);
        box-shadow: 0px 0px 15px rgba(0, 201, 255, 0.5);
    }
    .prediction-box {
        background: rgba(255,255,255,0.1);
        border-left: 6px solid #00c9ff;
        padding: 1rem;
        border-radius: 10px;
        margin-bottom: 1rem;
        color: white;
    }
</style>
""", unsafe_allow_html=True)

# ---------------- Load Models and Encoders ----------------
@st.cache_resource
def load_artifacts():
    scaler = joblib.load('scaler.pkl')
    encoders = joblib.load('label_encoders.pkl')
    models = {
        'Logistic Regression': joblib.load('Logistic Regression_model.pkl'),
        'Decision Tree': joblib.load('Decision Tree_model.pkl'),
        'Random Forest': joblib.load('Random Forest_model.pkl'),
        'XGBoost': joblib.load('xgboost_model.pkl')
    }
    return scaler, encoders, models

scaler, encoders, models = load_artifacts()

feature_order = [
    'cc_num', 'merchant', 'category', 'amt', 'gender', 'city', 'state', 'zip', 'lat', 'long',
    'city_pop', 'job', 'unix_time', 'merch_lat', 'merch_long', 'trans_hour', 'trans_day_of_week'
]

categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']

# ---------------- Streamlit Layout ----------------
st.markdown('<div class="big-title">🛡️ Fraud Detection System</div>', unsafe_allow_html=True)
st.markdown('<div class="subtitle">Transforming CLI logic into a Billion-Dollar UI</div>', unsafe_allow_html=True)

with st.form("fraud_form"):
    col1, col2 = st.columns(2)

    with col1:
        cc_num = st.number_input("💳 Credit Card Number", value=123456789.0)
        amt = st.number_input("💰 Transaction Amount", value=100.0)
        zip_code = st.number_input("📮 ZIP Code", value=10001.0)
        lat = st.number_input("🌐 Latitude", value=40.0)
        long = st.number_input("🌐 Longitude", value=-75.0)
        city_pop = st.number_input("🏙️ City Population", value=8500000.0)

    with col2:
        unix_time = st.number_input("🕒 Unix Timestamp", value=float(datetime.now().timestamp()))
        merch_lat = st.number_input("🏪 Merchant Latitude", value=40.0)
        merch_long = st.number_input("🏪 Merchant Longitude", value=-75.0)
        trans_hour = st.slider("⏰ Transaction Hour", 0, 23, 12)
        trans_day = st.slider("📅 Day of Week", 0, 6, 3)

    st.markdown("### 🧠 Enter Categorical Values")
    merchant = st.text_input("Merchant", value="fraud_Rippin, Kub and Mann")
    category = st.selectbox("Category", encoders['category'].classes_.tolist())
    gender = st.selectbox("Gender", encoders['gender'].classes_.tolist())
    city = st.text_input("City", value="New York")
    state = st.selectbox("State", encoders['state'].classes_.tolist())
    job = st.selectbox("Job", encoders['job'].classes_.tolist())

    submit = st.form_submit_button("🔍 Predict")

if submit:
    try:
        sample_input = {
            'cc_num': cc_num,
            'merchant': encoders['merchant'].transform([merchant])[0] if merchant in encoders['merchant'].classes_ else 0,
            'category': encoders['category'].transform([category])[0],
            'amt': amt,
            'gender': encoders['gender'].transform([gender])[0],
            'city': encoders['city'].transform([city])[0] if city in encoders['city'].classes_ else 0,
            'state': encoders['state'].transform([state])[0],
            'zip': zip_code,
            'lat': lat,
            'long': long,
            'city_pop': city_pop,
            'job': encoders['job'].transform([job])[0],
            'unix_time': unix_time,
            'merch_lat': merch_lat,
            'merch_long': merch_long,
            'trans_hour': trans_hour,
            'trans_day_of_week': trans_day
        }

        input_df = pd.DataFrame([sample_input], columns=feature_order)
        input_scaled = scaler.transform(input_df)

        st.markdown("---")
        st.subheader("📊 Model Predictions")

        for name, model in models.items():
            pred = model.predict(input_scaled)[0]
            prob = model.predict_proba(input_scaled)[0][1]
            status = "🚨 Fraud" if pred == 1 else "✅ Legitimate"
            st.markdown(f"<div class='prediction-box'><strong>{name}:</strong> {status} <br>Probability: {prob:.2%}</div>", unsafe_allow_html=True)

    except Exception as e:
        st.error(f"Prediction failed: {e}")



Enter Transaction Details:


Logistic Regression Prediction: Legitimate
Logistic Regression Probability of Fraud: 0.0000

Decision Tree Prediction: Legitimate
Decision Tree Probability of Fraud: 0.0000

Random Forest Prediction: Legitimate
Random Forest Probability of Fraud: 0.2000

XGBoost Prediction: Legitimate
XGBoost Probability of Fraud: 0.0143
