In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
# Reproducible random state for splits and resampling
RANDOM_STATE = 42
import joblib

In [2]:
print("üîπ Loading dataset...")
df = pd.read_csv('fraudTest.csv')
df = df.dropna()
# Convert transaction time to datetime
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['trans_hour'] = df['trans_date_trans_time'].dt.hour
df['trans_day_of_week'] = df['trans_date_trans_time'].dt.dayofweek

# Sort and compute time difference between transactions per card
df = df.sort_values(by=['cc_num', 'trans_date_trans_time'])
df['time_diff'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds().fillna(0)

# üßÆ Compute distance between customer and merchant locations
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great-circle distance between two points in kilometers."""
    R = 6371  # Earth radius (km)
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = [np.radians(x) for x in (lat1, lon1, lat2, lon2)]
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

df["distance_km"] = haversine(df["lat"], df["long"], df["merch_lat"], df["merch_long"])

# Select columns
categorical_cols = ['category', 'gender', 'state']
numerical_cols = ['amt', 'city_pop', 'trans_hour', 'trans_day_of_week', 'time_diff', 'distance_km']

# Drop irrelevant columns
cols_to_drop = [
    'Unnamed: 0', 'cc_num', 'merchant', 'first', 'last', 'street', 'city',
    'zip', 'job', 'dob', 'trans_num', 'unix_time', 'trans_date_trans_time',
    'lat', 'long', 'merch_lat', 'merch_long'
]
df_processed = df.drop(columns=cols_to_drop)

# One-hot encode categorical variables
df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)


üîπ Loading dataset...


In [3]:
print("üîπ Cleaning target column 'is_fraud'...")
if 'is_fraud' in df_processed.columns:
    df_processed = df_processed[pd.notnull(df_processed['is_fraud'])]
    df_processed['is_fraud'].replace([np.inf, -np.inf], np.nan, inplace=True)
    df_processed.dropna(subset=['is_fraud'], inplace=True)
    df_processed['is_fraud'] = df_processed['is_fraud'].astype(int)
else:
    raise KeyError("'is_fraud' column not found in dataset")


üîπ Cleaning target column 'is_fraud'...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['is_fraud'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [4]:
print("\nüîç Data Quality Check:")
print("Missing values per column:\n", df_processed.isna().sum().sort_values(ascending=False).head())
print("\nClass distribution of 'is_fraud':")
print(df_processed['is_fraud'].value_counts())



üîç Data Quality Check:
Missing values per column:
 amt                  0
city_pop             0
is_fraud             0
trans_hour           0
trans_day_of_week    0
dtype: int64

Class distribution of 'is_fraud':
is_fraud
0    553574
1      2145
Name: count, dtype: int64


In [5]:
scaler = StandardScaler()
df_processed[numerical_cols] = scaler.fit_transform(df_processed[numerical_cols])


In [7]:
X = df_processed.drop('is_fraud', axis=1)
y = df_processed['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


import json

# After: X = df_processed.drop('is_fraud', axis=1)
feature_columns = X.columns.tolist()

with open("columns.json", "w") as f:
    json.dump(feature_columns, f)

with open("numerical_cols.json", "w") as f:
    json.dump(numerical_cols, f)

# import pickle

# # Save model
# with open("fraud_model.pkl", "wb") as f:
#     pickle.dump(lr_clf, f)

# # Save scaler
# with open("scaler.pkl", "wb") as f:
#     pickle.dump(scaler, f)

# print("Pickle & JSON files saved successfully!")



In [8]:
print("\nüîπ Training Logistic Regression Model...")
lr_clf = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
lr_clf.fit(X_train, y_train)



üîπ Training Logistic Regression Model...


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,100


In [9]:
print("\nüîπ Evaluating Model Performance...")
y_pred = lr_clf.predict(X_test)
y_pred_proba = lr_clf.predict_proba(X_test)[:, 1]

print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")



üîπ Evaluating Model Performance...

üìä Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.89      0.94    110715
           1       0.03      0.72      0.05       429

    accuracy                           0.89    111144
   macro avg       0.51      0.81      0.50    111144
weighted avg       1.00      0.89      0.94    111144

AUC-ROC Score: 0.9063


In [11]:
def predict_fraud_with_input(model, scaler, df_template, categorical_cols, numerical_cols):
    """
    Predicts fraud for a single transaction.
    Uses easy and realistic inputs including distance_km instead of lat/long.
    """

    print("\n===== üí≥ CREDIT CARD FRAUD PREDICTION SYSTEM =====\n")
    print("Please enter the transaction details below.\n")

    feature_hints = {
        "amt": "Transaction amount (e.g., 250.00)",
        "city_pop": "City population (e.g., 50000)",
        "trans_hour": "Hour of transaction (0-23, e.g., 14)",
        "trans_day_of_week": "Day of week (0=Sun ... 6=Sat)",
        "time_diff": "Time since last transaction in seconds (e.g., 120)",
        "distance_km": "Approx. distance between customer and merchant in km (e.g., 2 for local, 3000 for abroad)",
    }

    categorical_hints = {
        "category": "Transaction type (e.g., 'shopping_pos', 'travel', 'gas_transport')",
        "gender": "Gender ('M' or 'F')",
        "state": "State abbreviation (e.g., 'NY', 'CA', 'TX')",
    }

    input_data = {}

    print("---- üßÆ Numerical Features ----")
    for col in numerical_cols:
        hint = feature_hints.get(col, "")
        while True:
            try:
                value = float(input(f"Enter {col} ({hint}): "))
                input_data[col] = value
                break
            except ValueError:
                print("‚ùå Invalid input! Please enter a number.\n")

    print("\n---- üî§ Categorical Features ----")
    for col in categorical_cols:
        hint = categorical_hints.get(col, "")
        value = input(f"Enter {col} ({hint}): ")
        input_data[col] = value

    # Show summary
    print("\n‚úÖ You entered:")
    for k, v in input_data.items():
        print(f"  {k:20}: {v}")

    # Convert to DataFrame
    input_df = pd.DataFrame([input_data])
    input_df = pd.get_dummies(input_df, columns=categorical_cols)

    # Align columns with training data
    template = df_template.drop('is_fraud', axis=1).copy()
    for col in template.columns:
        if col not in input_df.columns:
            input_df[col] = 0
    input_df = input_df[template.columns]

    # Scale numerical columns
    input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])

    # Prediction
    print("\nüîç Predicting transaction status...")
    prediction = model.predict(input_df)[0]

    if prediction == 1:
        print("\nüö® Prediction: FRAUDULENT TRANSACTION üö®")
    else:
        print("\n‚úÖ Prediction: NON-FRAUDULENT TRANSACTION ‚úÖ")

    return prediction



In [12]:
predict_fraud_with_input(lr_clf, scaler, df_processed, categorical_cols, numerical_cols)



===== üí≥ CREDIT CARD FRAUD PREDICTION SYSTEM =====

Please enter the transaction details below.

---- üßÆ Numerical Features ----

---- üî§ Categorical Features ----

‚úÖ You entered:
  amt                 : 250.0
  city_pop            : 50000.0
  trans_hour          : 14.0
  trans_day_of_week   : 2.0
  time_diff           : 120.0
  distance_km         : 2.0
  category            : travel
  gender              : m
  state               : ca

üîç Predicting transaction status...

üö® Prediction: FRAUDULENT TRANSACTION üö®


np.int64(1)

In [13]:
import pickle

# Save model
with open("fraud_model.pkl", "wb") as f:
    pickle.dump(lr_clf, f)

# Save scaler
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Pickle files saved successfully!")


Pickle files saved successfully!
