# 💳 Credit Card Fraud Detection using XGBoost (with manual input fields)

In [13]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE


In [14]:
# Load dataset
df = pd.read_csv('dataset.csv')
df.dropna(inplace=True)
df.head()


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [15]:
# Define features and label
features = ['merchant', 'category', 'amt', 'gender', 'lat', 'long', 'city_pop']
X = df[features]
y = df['is_fraud']

# Define numeric and categorical features
num_features = ['amt', 'lat', 'long', 'city_pop']
cat_features = ['merchant', 'category', 'gender']

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

X_processed = preprocessor.fit_transform(X)

# Save the preprocessor
joblib.dump(preprocessor, 'preprocessor.pkl')

# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_processed, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)
joblib.dump((X_train, X_test, y_train, y_test), 'dataset_split.pkl')


['dataset_split.pkl']

In [16]:
# Train XGBoost model
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Save model
joblib.dump(model, 'fraud_model.pkl')


['fraud_model.pkl']

In [17]:
# Example manual input (replace with Streamlit later)
sample = pd.DataFrame([{
    'merchant': 'Online Store',
    'category': 'shopping_net',
    'amt': 250.75,
    'gender': 'F',
    'lat': 37.7749,
    'long': -122.4194,
    'city_pop': 100000
}])

# Transform input
preprocessor = joblib.load('preprocessor.pkl')
model = joblib.load('fraud_model.pkl')

X_sample = preprocessor.transform(sample)
prediction = model.predict(X_sample)[0]
'Fraud' if prediction == 1 else 'Legitimate'


'Legitimate'

In [18]:
# Load data
df = pd.read_csv('dataset.csv').dropna()

# Drop unused field (cc_num is usually not useful for prediction)
X_all = df[['merchant', 'category', 'amt', 'gender', 'lat', 'long', 'city_pop']]
y_true = df['is_fraud']  # Ground truth

# Load preprocessor and model
preprocessor = joblib.load('preprocessor.pkl')
model = joblib.load('fraud_model.pkl')

# Transform features
X_all_processed = preprocessor.transform(X_all)

# Predict fraud for all rows
predictions = model.predict(X_all_processed)

# Add predictions to DataFrame
df['predicted_is_fraud'] = predictions
df['match'] = df['is_fraud'] == df['predicted_is_fraud']

# Show some predictions
df[['merchant', 'category', 'amt', 'is_fraud', 'predicted_is_fraud', 'match']].head(20)


Unnamed: 0,merchant,category,amt,is_fraud,predicted_is_fraud,match
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,0,0,True
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,0,0,True
2,fraud_Lind-Buckridge,entertainment,220.11,0,0,True
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,0,0,True
4,fraud_Keeling-Crist,misc_pos,41.96,0,0,True
5,"fraud_Stroman, Hudson and Erdman",gas_transport,94.63,0,0,True
6,fraud_Rowe-Vandervort,grocery_net,44.54,0,0,True
7,fraud_Corwin-Collins,gas_transport,71.65,0,0,True
8,fraud_Herzog Ltd,misc_pos,4.27,0,0,True
9,"fraud_Schoen, Kuphal and Nitzsche",grocery_pos,198.39,0,0,True


In [19]:
# Show full columns for the predicted fraudulent transactions
fraud_preds = df[df['predicted_is_fraud'] == 1]

# ✅ Display the top 10 full fraudulent rows including cc_num, lat, long, etc.
fraud_preds[[
    'cc_num', 'merchant', 'category', 'amt', 'gender', 'lat', 'long', 'city_pop', 
    'is_fraud', 'predicted_is_fraud'
]].head(10)



Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,is_fraud,predicted_is_fraud
70,4512828414983801773,fraud_Jaskolski-Dibbert,grocery_net,16.16,F,39.8936,-79.7856,328,0,1
138,4060579726528237,"fraud_Baumbach, Hodkiewicz and Walsh",shopping_pos,636.41,M,39.2136,-95.4404,2661,0,1
232,4311368326621416041,fraud_Kassulke PLC,shopping_net,1055.47,M,40.7692,-103.0968,648,0,1
296,571844099986,fraud_Heathcote LLC,shopping_net,909.66,F,38.2507,-85.7476,736284,0,1
587,3597337756918966,"fraud_Hagenes, Kohler and Hoppe",food_dining,116.04,F,39.7813,-76.7477,7565,0,1
723,4265776278887457,"fraud_Rippin, Kub and Mann",misc_net,1047.52,F,35.2087,-92.2123,969,0,1
764,5501083170975659,fraud_Metz-Boehm,shopping_pos,658.55,M,41.1464,-81.5107,47772,0,1
770,38580485618059,fraud_Mohr-Bayer,shopping_net,767.18,F,48.8856,-103.0098,248,0,1
824,630469040731,fraud_Auer-West,shopping_net,1433.54,F,45.671,-121.8686,1288,0,1
919,4255397449664185994,fraud_Gibson-Deckow,entertainment,426.45,M,32.69,-96.9177,1263321,0,1


In [20]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_true, predictions))
print(classification_report(y_true, predictions))



[[1267317   21852]
 [    572    6934]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1289169
           1       0.24      0.92      0.38      7506

    accuracy                           0.98   1296675
   macro avg       0.62      0.95      0.69   1296675
weighted avg       1.00      0.98      0.99   1296675

