In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from skopt import BayesSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from skopt.space import Integer, Categorical
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from skopt.space import Real
import dill

df = pd.read_csv("creditcard.csv")
print("Dataset Shape:", df.shape)
print(df.head())
print(df.info())
X = df.drop(columns=['Class'])
y = df['Class']
scaler = StandardScaler()
X[['Time', 'Amount']] = scaler.fit_transform(X[['Time', 'Amount']])

with open("scaler.pkl", "wb") as file:
    dill.dump(scaler, file)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

####Smote
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

###let's use a lightgbm algorithm here for now
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_balanced, y_train_balanced)

####Finding the best parameters
param_space_lgbm = {
    'n_estimators': Integer(50, 500),
    'max_depth': Integer(3, 15),
    'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'lambda_l2': Real(1, 5),
    'bagging_fraction': Real(0.5, 1.0),
    'feature_fraction': Real(0.5, 1.0)
}
bayesian_search_lgbm = BayesSearchCV(lgbm, param_space_lgbm, n_iter=20, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
bayesian_search_lgbm.fit(X_train_balanced, y_train_balanced)
print(f"Best Parameters (LightGBM): {bayesian_search_lgbm.best_params_}")

###Trian the model using the best features from above
best_lgbm = LGBMClassifier(random_state=42, verbose=0, **bayesian_search_lgbm.best_params_)
best_lgbm.fit(X_train_balanced, y_train_balanced)

#########Save the model
import dill
with open("fraudulent.pkl", "wb") as file:
    dill.dump(best_lgbm, file)
"""
#------------------------------------------------------------------------------------------------------------------
####using MI
mi_scores = mutual_info_classif(X_train_balanced, y_train_balanced)
mi_df = pd.DataFrame({'Feature': X_train_balanced.columns, 'MI Score': mi_scores})
mi_df = mi_df.sort_values(by='MI Score', ascending=False)
selected_features_mi = mi_df['Feature'].head(20).tolist()
print(f" Selected Features (Mutual Information): {selected_features_mi}")
################
X_train_mi = X_train_balanced[selected_features_mi]
rfe = RFE(estimator=lgbm, n_features_to_select=10)
rfe.fit(X_train_mi, y_train_balanced)
selected_features = X_train_mi.columns[rfe.support_].tolist()
print(f" Selected Features(Overall): {selected_features}")

#------------------------------------------------------------------------------------------------------------------

#######Train the models based on the selected features
X_train_selected = X_train_balanced[selected_features]  
X_test_selected = X_test[selected_features]
lgbm.fit(X_train_selected, y_train_balanced)
y_pred = lgbm.predict(X_test_selected)

#Formulate for diff measureing quantities
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

#Print out the scores after using feature selection
print(f" Model Accuracy After Feature Selection: {accuracy:.4f}")
print(f" Model Precision After Feature Selection: {precision:.4f}")
print(f" Model Recall After Feature Selection: {recall:.4f}")
print(f" Model F1 Score After Feature Selection: {f1:.4f}")

#---------------------------------------------------------------------------------------------------------------
"""
#without using feature selection and using all the features

best_lgbm.fit(X_train_balanced, y_train_balanced)
y_pred1 = lgbm.predict(X_test)

#####
#Formulate for diff measureing quantities
accuracy1 = accuracy_score(y_test, y_pred1)
precision1 = precision_score(y_test, y_pred1)
recall1 = recall_score(y_test, y_pred1)
f11 = f1_score(y_test, y_pred1)

#Print out the scores before using feature selection
print(f"📊 Model Accuracy Before Feature Selection: {accuracy1:.4f}")
print(f"📊 Model Precision Before Feature Selection: {precision1:.4f}")
print(f"📊 Model Recall Before Feature Selection: {recall1:.4f}")
print(f"📊 Model F1 Score Before Feature Selection: {f11:.4f}")


print("✅ Notebook saved successfully!")




Dataset Shape: (284807, 31)
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

    

In [8]:
import dill
import numpy as np
import pandas as pd
import xgboost as xgb

# Load the scaler
with open("scaler.pkl", "rb") as file:
    scaler = dill.load(file)

# Load the model
with open("fraudulent.pkl", "rb") as file:
    model = dill.load(file)

# Features list
feature_names = ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 
                 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 
                 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 
                 'V28', 'Amount']

# Accepting user inputs safely
user_data = {}
for feature in feature_names:
    while True:
        value = input(f"Enter {feature} (or leave blank if unknown): ")
        if value.strip() == "":
            user_data[feature] = np.nan  # Handle missing values
            break
        try:
            user_data[feature] = float(value)
            break  # If input is valid, exit loop
        except ValueError:
            print(f"Invalid input! Please enter a numeric value for {feature}.")

# Convert to DataFrame
user_df = pd.DataFrame([user_data])

print("\nUser Input DataFrame (Before Handling Missing Values):")
print(user_df)

# Check how many missing values
missing_count = user_df.isna().sum().sum()
print(f"\nMissing values count: {missing_count}")

if missing_count > len(feature_names) // 2:
    print("\n Too many missing values! Prediction might not be accurate.")

# Fill missing values with median before scaling (to avoid errors)
user_df_imputed = user_df.fillna(user_df.median())

print("\nUser DataFrame (After Handling Missing Values):")
print(user_df_imputed)

# Apply scaling to all features
user_df_scaled = scaler.transform(user_df_imputed[feature_names])

# Make a prediction (ensure correct format for model)
prediction = model.predict(pd.DataFrame(user_df_scaled, columns=feature_names))[0]

if prediction == 1:
    print("\n ALERT: This transaction is likely FRAUDULENT!")
else:
    print("\n This transaction seems legitimate.")





Enter Time (or leave blank if unknown):  0.8889696
Enter V1 (or leave blank if unknown):  0.
Enter V2 (or leave blank if unknown):  0
Enter V3 (or leave blank if unknown):  0.26262
Enter V4 (or leave blank if unknown):  0.6595495
Enter V5 (or leave blank if unknown):  1.629595
Enter V6 (or leave blank if unknown):  0.65495
Enter V7 (or leave blank if unknown):  1.45899
Enter V8 (or leave blank if unknown):  1.62198219
Enter V9 (or leave blank if unknown):  1.9859898
Enter V10 (or leave blank if unknown):  0.9898498
Enter V11 (or leave blank if unknown):  0.89889
Enter V12 (or leave blank if unknown):  1.569898
Enter V13 (or leave blank if unknown):  0.56198489
Enter V14 (or leave blank if unknown):  1.+189198


Invalid input! Please enter a numeric value for V14.


Enter V14 (or leave blank if unknown):  0.11891
Enter V15 (or leave blank if unknown):  2.519219851
Enter V16 (or leave blank if unknown):  1.629982
Enter V17 (or leave blank if unknown):  1.5215198
Enter V18 (or leave blank if unknown):  0.5219298
Enter V19 (or leave blank if unknown):  1.59589
Enter V20 (or leave blank if unknown):  1.529889
Enter V21 (or leave blank if unknown):  1.649898
Enter V22 (or leave blank if unknown):  0.592982
Enter V23 (or leave blank if unknown):  0.52665
Enter V24 (or leave blank if unknown):  1.59298
Enter V25 (or leave blank if unknown):  1.9559298
Enter V26 (or leave blank if unknown):  0.9852985
Enter V27 (or leave blank if unknown):  1.5219598
Enter V28 (or leave blank if unknown):  0.98598
Enter Amount (or leave blank if unknown):  138.55



User Input DataFrame (Before Handling Missing Values):
      Time   V1   V2       V3       V4        V5       V6       V7        V8  \
0  0.88897  0.0  0.0  0.26262  0.65955  1.629595  0.65495  1.45899  1.621982   

        V9  ...       V20       V21       V22      V23      V24      V25  \
0  1.98599  ...  1.529889  1.649898  0.592982  0.52665  1.59298  1.95593   

        V26      V27      V28  Amount  
0  0.985298  1.52196  0.98598  138.55  

[1 rows x 30 columns]

Missing values count: 0

User DataFrame (After Handling Missing Values):
      Time   V1   V2       V3       V4        V5       V6       V7        V8  \
0  0.88897  0.0  0.0  0.26262  0.65955  1.629595  0.65495  1.45899  1.621982   

        V9  ...       V20       V21       V22      V23      V24      V25  \
0  1.98599  ...  1.529889  1.649898  0.592982  0.52665  1.59298  1.95593   

        V26      V27      V28  Amount  
0  0.985298  1.52196  0.98598  138.55  

[1 rows x 30 columns]


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- V1
- V10
- V11
- V12
- V13
- ...
